diff options
Diffstat (limited to 'src/third_party/re2/dist/re2/testing/re2_test.cc')
-rw-r--r-- | src/third_party/re2/dist/re2/testing/re2_test.cc | 1659 |
1 files changed, 1659 insertions, 0 deletions
diff --git a/src/third_party/re2/dist/re2/testing/re2_test.cc b/src/third_party/re2/dist/re2/testing/re2_test.cc new file mode 100644 index 00000000000..b1f7d7335d5 --- /dev/null +++ b/src/third_party/re2/dist/re2/testing/re2_test.cc @@ -0,0 +1,1659 @@ +// -*- coding: utf-8 -*- +// Copyright 2002-2009 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// TODO: Test extractions for PartialMatch/Consume + +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <map> +#include <string> +#include <utility> +#include <vector> +#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) +#include <sys/mman.h> +#include <unistd.h> /* for sysconf */ +#endif + +#include "util/test.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +namespace re2 { + +TEST(RE2, HexTests) { +#define ASSERT_HEX(type, value) \ + do { \ + type v; \ + ASSERT_TRUE( \ + RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", RE2::Hex(&v))); \ + ASSERT_EQ(v, 0x##value); \ + ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ + RE2::CRadix(&v))); \ + ASSERT_EQ(v, 0x##value); \ + } while (0) + + ASSERT_HEX(short, 2bad); + ASSERT_HEX(unsigned short, 2badU); + ASSERT_HEX(int, dead); + ASSERT_HEX(unsigned int, deadU); + ASSERT_HEX(long, 7eadbeefL); + ASSERT_HEX(unsigned long, deadbeefUL); + ASSERT_HEX(long long, 12345678deadbeefLL); + ASSERT_HEX(unsigned long long, cafebabedeadbeefULL); + +#undef ASSERT_HEX +} + +TEST(RE2, OctalTests) { +#define ASSERT_OCTAL(type, value) \ + do { \ + type v; \ + ASSERT_TRUE(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&v))); \ + ASSERT_EQ(v, 0##value); \ + ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \ + RE2::CRadix(&v))); \ + ASSERT_EQ(v, 0##value); \ + } while (0) + + ASSERT_OCTAL(short, 77777); + ASSERT_OCTAL(unsigned short, 177777U); + ASSERT_OCTAL(int, 17777777777); + ASSERT_OCTAL(unsigned int, 37777777777U); + ASSERT_OCTAL(long, 17777777777L); + ASSERT_OCTAL(unsigned long, 37777777777UL); + ASSERT_OCTAL(long long, 777777777777777777777LL); + ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL); + +#undef ASSERT_OCTAL +} + +TEST(RE2, DecimalTests) { +#define ASSERT_DECIMAL(type, value) \ + do { \ + type v; \ + ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &v)); \ + ASSERT_EQ(v, value); \ + ASSERT_TRUE( \ + RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", RE2::CRadix(&v))); \ + ASSERT_EQ(v, value); \ + } while (0) + + ASSERT_DECIMAL(short, -1); + ASSERT_DECIMAL(unsigned short, 9999); + ASSERT_DECIMAL(int, -1000); + ASSERT_DECIMAL(unsigned int, 12345U); + ASSERT_DECIMAL(long, -10000000L); + ASSERT_DECIMAL(unsigned long, 3083324652U); + ASSERT_DECIMAL(long long, -100000000000000LL); + ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL); + +#undef ASSERT_DECIMAL +} + +TEST(RE2, Replace) { + struct ReplaceTest { + const char *regexp; + const char *rewrite; + const char *original; + const char *single; + const char *global; + int greplace_count; + }; + static const ReplaceTest tests[] = { + { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", + "\\2\\1ay", + "the quick brown fox jumps over the lazy dogs.", + "ethay quick brown fox jumps over the lazy dogs.", + "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", + 9 }, + { "\\w+", + "\\0-NOSPAM", + "abcd.efghi@google.com", + "abcd-NOSPAM.efghi@google.com", + "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM", + 4 }, + { "^", + "(START)", + "foo", + "(START)foo", + "(START)foo", + 1 }, + { "^", + "(START)", + "", + "(START)", + "(START)", + 1 }, + { "$", + "(END)", + "", + "(END)", + "(END)", + 1 }, + { "b", + "bb", + "ababababab", + "abbabababab", + "abbabbabbabbabb", + 5 }, + { "b", + "bb", + "bbbbbb", + "bbbbbbb", + "bbbbbbbbbbbb", + 6 }, + { "b+", + "bb", + "bbbbbb", + "bb", + "bb", + 1 }, + { "b*", + "bb", + "bbbbbb", + "bb", + "bb", + 1 }, + { "b*", + "bb", + "aaaaa", + "bbaaaaa", + "bbabbabbabbabbabb", + 6 }, + // Check newline handling + { "a.*a", + "(\\0)", + "aba\naba", + "(aba)\naba", + "(aba)\n(aba)", + 2 }, + { "", NULL, NULL, NULL, NULL, 0 } + }; + + for (const ReplaceTest* t = tests; t->original != NULL; t++) { + std::string one(t->original); + ASSERT_TRUE(RE2::Replace(&one, t->regexp, t->rewrite)); + ASSERT_EQ(one, t->single); + std::string all(t->original); + ASSERT_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count) + << "Got: " << all; + ASSERT_EQ(all, t->global); + } +} + +static void TestCheckRewriteString(const char* regexp, const char* rewrite, + bool expect_ok) { + std::string error; + RE2 exp(regexp); + bool actual_ok = exp.CheckRewriteString(rewrite, &error); + EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error; +} + +TEST(CheckRewriteString, all) { + TestCheckRewriteString("abc", "foo", true); + TestCheckRewriteString("abc", "foo\\", false); + TestCheckRewriteString("abc", "foo\\0bar", true); + + TestCheckRewriteString("a(b)c", "foo", true); + TestCheckRewriteString("a(b)c", "foo\\0bar", true); + TestCheckRewriteString("a(b)c", "foo\\1bar", true); + TestCheckRewriteString("a(b)c", "foo\\2bar", false); + TestCheckRewriteString("a(b)c", "f\\\\2o\\1o", true); + + TestCheckRewriteString("a(b)(c)", "foo\\12", true); + TestCheckRewriteString("a(b)(c)", "f\\2o\\1o", true); + TestCheckRewriteString("a(b)(c)", "f\\oo\\1", false); +} + +TEST(RE2, Extract) { + std::string s; + + ASSERT_TRUE(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &s)); + ASSERT_EQ(s, "kremvax!boris"); + + ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &s)); + ASSERT_EQ(s, "'foo'"); + // check that false match doesn't overwrite + ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &s)); + ASSERT_EQ(s, "'foo'"); +} + +TEST(RE2, MaxSubmatchTooLarge) { + std::string s; + ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); + s = "foo"; + ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); + s = "foo"; + ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); +} + +TEST(RE2, Consume) { + RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace + std::string word; + + std::string s(" aaa b!@#$@#$cccc"); + StringPiece input(s); + + ASSERT_TRUE(RE2::Consume(&input, r, &word)); + ASSERT_EQ(word, "aaa") << " input: " << input; + ASSERT_TRUE(RE2::Consume(&input, r, &word)); + ASSERT_EQ(word, "b") << " input: " << input; + ASSERT_FALSE(RE2::Consume(&input, r, &word)) << " input: " << input; +} + +TEST(RE2, ConsumeN) { + const std::string s(" one two three 4"); + StringPiece input(s); + + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 0)); // Skips "one". + + // 1 arg + std::string word; + argv[0] = &word; + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)", args, 1)); + EXPECT_EQ("two", word); + + // Multi-args + int n; + argv[1] = &n; + EXPECT_TRUE(RE2::ConsumeN(&input, "\\s*(\\w+)\\s*(\\d+)", args, 2)); + EXPECT_EQ("three", word); + EXPECT_EQ(4, n); +} + +TEST(RE2, FindAndConsume) { + RE2 r("(\\w+)"); // matches a word + std::string word; + + std::string s(" aaa b!@#$@#$cccc"); + StringPiece input(s); + + ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); + ASSERT_EQ(word, "aaa"); + ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); + ASSERT_EQ(word, "b"); + ASSERT_TRUE(RE2::FindAndConsume(&input, r, &word)); + ASSERT_EQ(word, "cccc"); + ASSERT_FALSE(RE2::FindAndConsume(&input, r, &word)); + + // Check that FindAndConsume works without any submatches. + // Earlier version used uninitialized data for + // length to consume. + input = "aaa"; + ASSERT_TRUE(RE2::FindAndConsume(&input, "aaa")); + ASSERT_EQ(input, ""); +} + +TEST(RE2, FindAndConsumeN) { + const std::string s(" one two three 4"); + StringPiece input(s); + + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 0)); // Skips "one". + + // 1 arg + std::string word; + argv[0] = &word; + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)", args, 1)); + EXPECT_EQ("two", word); + + // Multi-args + int n; + argv[1] = &n; + EXPECT_TRUE(RE2::FindAndConsumeN(&input, "(\\w+)\\s*(\\d+)", args, 2)); + EXPECT_EQ("three", word); + EXPECT_EQ(4, n); +} + +TEST(RE2, MatchNumberPeculiarity) { + RE2 r("(foo)|(bar)|(baz)"); + std::string word1; + std::string word2; + std::string word3; + + ASSERT_TRUE(RE2::PartialMatch("foo", r, &word1, &word2, &word3)); + ASSERT_EQ(word1, "foo"); + ASSERT_EQ(word2, ""); + ASSERT_EQ(word3, ""); + ASSERT_TRUE(RE2::PartialMatch("bar", r, &word1, &word2, &word3)); + ASSERT_EQ(word1, ""); + ASSERT_EQ(word2, "bar"); + ASSERT_EQ(word3, ""); + ASSERT_TRUE(RE2::PartialMatch("baz", r, &word1, &word2, &word3)); + ASSERT_EQ(word1, ""); + ASSERT_EQ(word2, ""); + ASSERT_EQ(word3, "baz"); + ASSERT_FALSE(RE2::PartialMatch("f", r, &word1, &word2, &word3)); + + std::string a; + ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &a)); + ASSERT_EQ(a, ""); +} + +TEST(RE2, Match) { + RE2 re("((\\w+):([0-9]+))"); // extracts host and port + StringPiece group[4]; + + // No match. + StringPiece s = "zyzzyva"; + ASSERT_FALSE( + re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); + + // Matches and extracts. + s = "a chrisr:9000 here"; + ASSERT_TRUE( + re.Match(s, 0, s.size(), RE2::UNANCHORED, group, arraysize(group))); + ASSERT_EQ(group[0], "chrisr:9000"); + ASSERT_EQ(group[1], "chrisr:9000"); + ASSERT_EQ(group[2], "chrisr"); + ASSERT_EQ(group[3], "9000"); + + std::string all, host; + int port; + ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", re, &all, &host, &port)); + ASSERT_EQ(all, "chrisr:9000"); + ASSERT_EQ(host, "chrisr"); + ASSERT_EQ(port, 9000); +} + +static void TestRecursion(int size, const char* pattern) { + // Fill up a string repeating the pattern given + std::string domain; + domain.resize(size); + size_t patlen = strlen(pattern); + for (int i = 0; i < size; i++) { + domain[i] = pattern[i % patlen]; + } + // Just make sure it doesn't crash due to too much recursion. + RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet); + RE2::FullMatch(domain, re); +} + +// A meta-quoted string, interpreted as a pattern, should always match +// the original unquoted string. +static void TestQuoteMeta(const std::string& unquoted, + const RE2::Options& options = RE2::DefaultOptions) { + std::string quoted = RE2::QuoteMeta(unquoted); + RE2 re(quoted, options); + EXPECT_TRUE(RE2::FullMatch(unquoted, re)) + << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; +} + +// A meta-quoted string, interpreted as a pattern, should always match +// the original unquoted string. +static void NegativeTestQuoteMeta( + const std::string& unquoted, const std::string& should_not_match, + const RE2::Options& options = RE2::DefaultOptions) { + std::string quoted = RE2::QuoteMeta(unquoted); + RE2 re(quoted, options); + EXPECT_FALSE(RE2::FullMatch(should_not_match, re)) + << "Unquoted='" << unquoted << "', quoted='" << quoted << "'."; +} + +// Tests that quoted meta characters match their original strings, +// and that a few things that shouldn't match indeed do not. +TEST(QuoteMeta, Simple) { + TestQuoteMeta("foo"); + TestQuoteMeta("foo.bar"); + TestQuoteMeta("foo\\.bar"); + TestQuoteMeta("[1-9]"); + TestQuoteMeta("1.5-2.0?"); + TestQuoteMeta("\\d"); + TestQuoteMeta("Who doesn't like ice cream?"); + TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); + TestQuoteMeta("((?!)xxx).*yyy"); + TestQuoteMeta("(["); +} +TEST(QuoteMeta, SimpleNegative) { + NegativeTestQuoteMeta("foo", "bar"); + NegativeTestQuoteMeta("...", "bar"); + NegativeTestQuoteMeta("\\.", "."); + NegativeTestQuoteMeta("\\.", ".."); + NegativeTestQuoteMeta("(a)", "a"); + NegativeTestQuoteMeta("(a|b)", "a"); + NegativeTestQuoteMeta("(a|b)", "(a)"); + NegativeTestQuoteMeta("(a|b)", "a|b"); + NegativeTestQuoteMeta("[0-9]", "0"); + NegativeTestQuoteMeta("[0-9]", "0-9"); + NegativeTestQuoteMeta("[0-9]", "[9]"); + NegativeTestQuoteMeta("((?!)xxx)", "xxx"); +} + +TEST(QuoteMeta, Latin1) { + TestQuoteMeta("3\xb2 = 9", RE2::Latin1); +} + +TEST(QuoteMeta, UTF8) { + TestQuoteMeta("Plácido Domingo"); + TestQuoteMeta("xyz"); // No fancy utf8. + TestQuoteMeta("\xc2\xb0"); // 2-byte utf8 -- a degree symbol. + TestQuoteMeta("27\xc2\xb0 degrees"); // As a middle character. + TestQuoteMeta("\xe2\x80\xb3"); // 3-byte utf8 -- a double prime. + TestQuoteMeta("\xf0\x9d\x85\x9f"); // 4-byte utf8 -- a music note. + TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, this should + // still work. + NegativeTestQuoteMeta("27\xc2\xb0", + "27\\\xc2\\\xb0"); // 2-byte utf8 -- a degree symbol. +} + +TEST(QuoteMeta, HasNull) { + std::string has_null; + + // string with one null character + has_null += '\0'; + TestQuoteMeta(has_null); + NegativeTestQuoteMeta(has_null, ""); + + // Don't want null-followed-by-'1' to be interpreted as '\01'. + has_null += '1'; + TestQuoteMeta(has_null); + NegativeTestQuoteMeta(has_null, "\1"); +} + +TEST(ProgramSize, BigProgram) { + RE2 re_simple("simple regexp"); + RE2 re_medium("medium.*regexp"); + RE2 re_complex("complex.{1,128}regexp"); + + ASSERT_GT(re_simple.ProgramSize(), 0); + ASSERT_GT(re_medium.ProgramSize(), re_simple.ProgramSize()); + ASSERT_GT(re_complex.ProgramSize(), re_medium.ProgramSize()); + + ASSERT_GT(re_simple.ReverseProgramSize(), 0); + ASSERT_GT(re_medium.ReverseProgramSize(), re_simple.ReverseProgramSize()); + ASSERT_GT(re_complex.ReverseProgramSize(), re_medium.ReverseProgramSize()); +} + +TEST(ProgramFanout, BigProgram) { + RE2 re1("(?:(?:(?:(?:(?:.)?){1})*)+)"); + RE2 re10("(?:(?:(?:(?:(?:.)?){10})*)+)"); + RE2 re100("(?:(?:(?:(?:(?:.)?){100})*)+)"); + RE2 re1000("(?:(?:(?:(?:(?:.)?){1000})*)+)"); + + std::vector<int> histogram; + + // 3 is the largest non-empty bucket and has 2 element. + ASSERT_EQ(3, re1.ProgramFanout(&histogram)); + ASSERT_EQ(2, histogram[3]); + + // 6 is the largest non-empty bucket and has 11 elements. + ASSERT_EQ(6, re10.ProgramFanout(&histogram)); + ASSERT_EQ(11, histogram[6]); + + // 9 is the largest non-empty bucket and has 101 elements. + ASSERT_EQ(9, re100.ProgramFanout(&histogram)); + ASSERT_EQ(101, histogram[9]); + + // 13 is the largest non-empty bucket and has 1001 elements. + ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); + ASSERT_EQ(1001, histogram[13]); + + // 2 is the largest non-empty bucket and has 2 element. + ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); + ASSERT_EQ(2, histogram[2]); + + // 5 is the largest non-empty bucket and has 11 elements. + ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); + ASSERT_EQ(11, histogram[5]); + + // 9 is the largest non-empty bucket and has 101 elements. + ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); + ASSERT_EQ(101, histogram[9]); + + // 12 is the largest non-empty bucket and has 1001 elements. + ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); + ASSERT_EQ(1001, histogram[12]); +} + +// Issue 956519: handling empty character sets was +// causing NULL dereference. This tests a few empty character sets. +// (The way to get an empty character set is to negate a full one.) +TEST(EmptyCharset, Fuzz) { + static const char *empties[] = { + "[^\\S\\s]", + "[^\\S[:space:]]", + "[^\\D\\d]", + "[^\\D[:digit:]]" + }; + for (size_t i = 0; i < arraysize(empties); i++) + ASSERT_FALSE(RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0)); +} + +// Bitstate assumes that kInstFail instructions in +// alternations or capture groups have been "compiled away". +TEST(EmptyCharset, BitstateAssumptions) { + // Captures trigger use of Bitstate. + static const char *nop_empties[] = { + "((((()))))" "[^\\S\\s]?", + "((((()))))" "([^\\S\\s])?", + "((((()))))" "([^\\S\\s]|[^\\S\\s])?", + "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)" + }; + StringPiece group[6]; + for (size_t i = 0; i < arraysize(nop_empties); i++) + ASSERT_TRUE(RE2(nop_empties[i]).Match("", 0, 0, RE2::UNANCHORED, group, 6)); +} + +// Test that named groups work correctly. +TEST(Capture, NamedGroups) { + { + RE2 re("(hello world)"); + ASSERT_EQ(re.NumberOfCapturingGroups(), 1); + const std::map<std::string, int>& m = re.NamedCapturingGroups(); + ASSERT_EQ(m.size(), 0); + } + + { + RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))"); + ASSERT_EQ(re.NumberOfCapturingGroups(), 6); + const std::map<std::string, int>& m = re.NamedCapturingGroups(); + ASSERT_EQ(m.size(), 4); + ASSERT_EQ(m.find("A")->second, 1); + ASSERT_EQ(m.find("B")->second, 2); + ASSERT_EQ(m.find("C")->second, 3); + ASSERT_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous + } +} + +TEST(RE2, CapturedGroupTest) { + RE2 re("directions from (?P<S>.*) to (?P<D>.*)"); + int num_groups = re.NumberOfCapturingGroups(); + EXPECT_EQ(2, num_groups); + std::string args[4]; + RE2::Arg arg0(&args[0]); + RE2::Arg arg1(&args[1]); + RE2::Arg arg2(&args[2]); + RE2::Arg arg3(&args[3]); + + const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3}; + EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose", + re, matches, num_groups)); + const std::map<std::string, int>& named_groups = re.NamedCapturingGroups(); + EXPECT_TRUE(named_groups.find("S") != named_groups.end()); + EXPECT_TRUE(named_groups.find("D") != named_groups.end()); + + // The named group index is 1-based. + int source_group_index = named_groups.find("S")->second; + int destination_group_index = named_groups.find("D")->second; + EXPECT_EQ(1, source_group_index); + EXPECT_EQ(2, destination_group_index); + + // The args is zero-based. + EXPECT_EQ("mountain view", args[source_group_index - 1]); + EXPECT_EQ("san jose", args[destination_group_index - 1]); +} + +TEST(RE2, FullMatchWithNoArgs) { + ASSERT_TRUE(RE2::FullMatch("h", "h")); + ASSERT_TRUE(RE2::FullMatch("hello", "hello")); + ASSERT_TRUE(RE2::FullMatch("hello", "h.*o")); + ASSERT_FALSE(RE2::FullMatch("othello", "h.*o")); // Must be anchored at front + ASSERT_FALSE(RE2::FullMatch("hello!", "h.*o")); // Must be anchored at end +} + +TEST(RE2, PartialMatch) { + ASSERT_TRUE(RE2::PartialMatch("x", "x")); + ASSERT_TRUE(RE2::PartialMatch("hello", "h.*o")); + ASSERT_TRUE(RE2::PartialMatch("othello", "h.*o")); + ASSERT_TRUE(RE2::PartialMatch("hello!", "h.*o")); + ASSERT_TRUE(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))")); +} + +TEST(RE2, PartialMatchN) { + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0)); + EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0)); + + // 1 arg + int i; + argv[0] = &i; + EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", args, 1)); + EXPECT_EQ(1001, i); + EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", args, 1)); + + // Multi-arg + std::string s; + argv[1] = &s; + EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2)); + EXPECT_EQ(42, i); + EXPECT_EQ("life", s); + EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2)); +} + +TEST(RE2, FullMatchZeroArg) { + // Zero-arg + ASSERT_TRUE(RE2::FullMatch("1001", "\\d+")); +} + +TEST(RE2, FullMatchOneArg) { + int i; + + // Single-arg + ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &i)); + ASSERT_EQ(i, 1001); + ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &i)); + ASSERT_EQ(i, -123); + ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &i)); + ASSERT_FALSE( + RE2::FullMatch("1234567890123456789012345678901234567890", "(\\d+)", &i)); +} + +TEST(RE2, FullMatchIntegerArg) { + int i; + + // Digits surrounding integer-arg + ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &i)); + ASSERT_EQ(i, 23); + ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &i)); + ASSERT_EQ(i, 1); + ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &i)); + ASSERT_EQ(i, -1); + ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &i)); + ASSERT_EQ(i, 1); + ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &i)); + ASSERT_EQ(i, -1); +} + +TEST(RE2, FullMatchStringArg) { + std::string s; + // String-arg + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &s)); + ASSERT_EQ(s, std::string("ell")); +} + +TEST(RE2, FullMatchStringPieceArg) { + int i; + // StringPiece-arg + StringPiece sp; + ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &sp, &i)); + ASSERT_EQ(sp.size(), 4); + ASSERT_TRUE(memcmp(sp.data(), "ruby", 4) == 0); + ASSERT_EQ(i, 1234); +} + +TEST(RE2, FullMatchMultiArg) { + int i; + std::string s; + // Multi-arg + ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i)); + ASSERT_EQ(s, std::string("ruby")); + ASSERT_EQ(i, 1234); +} + +TEST(RE2, FullMatchN) { + RE2::Arg argv[2]; + const RE2::Arg* const args[2] = { &argv[0], &argv[1] }; + + // 0 arg + EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", args, 0)); + EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", args, 0)); + + // 1 arg + int i; + argv[0] = &i; + EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", args, 1)); + EXPECT_EQ(1001, i); + EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", args, 1)); + + // Multi-arg + std::string s; + argv[1] = &s; + EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2)); + EXPECT_EQ(42, i); + EXPECT_EQ("life", s); + EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2)); +} + +TEST(RE2, FullMatchIgnoredArg) { + int i; + std::string s; + + // Old-school NULL should be ignored. + ASSERT_TRUE( + RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)", &s, (void*)NULL, &i)); + ASSERT_EQ(s, std::string("ruby")); + ASSERT_EQ(i, 1234); + + // C++11 nullptr should also be ignored. + ASSERT_TRUE(RE2::FullMatch("rubz:1235", "(\\w+)(:)(\\d+)", &s, nullptr, &i)); + ASSERT_EQ(s, std::string("rubz")); + ASSERT_EQ(i, 1235); +} + +TEST(RE2, FullMatchTypedNullArg) { + std::string s; + + // Ignore non-void* NULL arg + ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL)); + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL)); + ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL)); + ASSERT_TRUE(RE2::FullMatch("1234", "(.*)", (int*)NULL)); + ASSERT_TRUE(RE2::FullMatch("1234567890123456", "(.*)", (long long*)NULL)); + ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (double*)NULL)); + ASSERT_TRUE(RE2::FullMatch("123.4567890123456", "(.*)", (float*)NULL)); + + // Fail on non-void* NULL arg if the match doesn't parse for the given type. + ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &s, (char*)NULL)); + ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (int*)NULL)); + ASSERT_FALSE(RE2::FullMatch("1234567890123456", "(.*)", (int*)NULL)); + ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (double*)NULL)); + ASSERT_FALSE(RE2::FullMatch("hello", "(.*)", (float*)NULL)); +} + +// Check that numeric parsing code does not read past the end of +// the number being parsed. +// This implementation requires mmap(2) et al. and thus cannot +// be used unless they are available. +TEST(RE2, NULTerminated) { +#if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0 + char *v; + int x; + long pagesize = sysconf(_SC_PAGE_SIZE); + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0)); + ASSERT_TRUE(v != reinterpret_cast<char*>(-1)); + LOG(INFO) << "Memory at " << (void*)v; + ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno; + v[pagesize - 1] = '1'; + + x = 0; + ASSERT_TRUE(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x)); + ASSERT_EQ(x, 1); +#endif +} + +TEST(RE2, FullMatchTypeTests) { + // Type tests + std::string zeros(1000, '0'); + { + char c; + ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c)); + ASSERT_EQ(c, 'H'); + } + { + unsigned char c; + ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &c)); + ASSERT_EQ(c, static_cast<unsigned char>('H')); + } + { + int16_t v; + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("32767", "(-?\\d+)", &v)); ASSERT_EQ(v, 32767); + ASSERT_TRUE(RE2::FullMatch("-32768", "(-?\\d+)", &v)); ASSERT_EQ(v, -32768); + ASSERT_FALSE(RE2::FullMatch("-32769", "(-?\\d+)", &v)); + ASSERT_FALSE(RE2::FullMatch("32768", "(-?\\d+)", &v)); + } + { + uint16_t v; + ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("32767", "(\\d+)", &v)); ASSERT_EQ(v, 32767); + ASSERT_TRUE(RE2::FullMatch("65535", "(\\d+)", &v)); ASSERT_EQ(v, 65535); + ASSERT_FALSE(RE2::FullMatch("65536", "(\\d+)", &v)); + } + { + int32_t v; + static const int32_t max = INT32_C(0x7fffffff); + static const int32_t min = -max - 1; + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("2147483647", "(-?\\d+)", &v)); ASSERT_EQ(v, max); + ASSERT_TRUE(RE2::FullMatch("-2147483648", "(-?\\d+)", &v)); ASSERT_EQ(v, min); + ASSERT_FALSE(RE2::FullMatch("-2147483649", "(-?\\d+)", &v)); + ASSERT_FALSE(RE2::FullMatch("2147483648", "(-?\\d+)", &v)); + + ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v)); + ASSERT_EQ(v, max); + ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v)); + ASSERT_EQ(v, min); + + ASSERT_FALSE(RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v)); + ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v))); + ASSERT_EQ(v, max); + ASSERT_FALSE(RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v))); + } + { + uint32_t v; + static const uint32_t max = UINT32_C(0xffffffff); + ASSERT_TRUE(RE2::FullMatch("100", "(\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max); + ASSERT_FALSE(RE2::FullMatch("4294967296", "(\\d+)", &v)); + ASSERT_FALSE(RE2::FullMatch("-1", "(\\d+)", &v)); + + ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); ASSERT_EQ(v, max); + } + { + int64_t v; + static const int64_t max = INT64_C(0x7fffffffffffffff); + static const int64_t min = -max - 1; + std::string str; + + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v)); ASSERT_EQ(v, -100); + + str = std::to_string(max); + ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max); + + str = std::to_string(min); + ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, min); + + str = std::to_string(max); + ASSERT_NE(str.back(), '9'); + str.back()++; + ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); + + str = std::to_string(min); + ASSERT_NE(str.back(), '9'); + str.back()++; + ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); + } + { + uint64_t v; + int64_t v2; + static const uint64_t max = UINT64_C(0xffffffffffffffff); + std::string str; + + ASSERT_TRUE(RE2::FullMatch("100", "(-?\\d+)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100", "(-?\\d+)", &v2)); ASSERT_EQ(v2, -100); + + str = std::to_string(max); + ASSERT_TRUE(RE2::FullMatch(str, "(-?\\d+)", &v)); ASSERT_EQ(v, max); + + ASSERT_NE(str.back(), '9'); + str.back()++; + ASSERT_FALSE(RE2::FullMatch(str, "(-?\\d+)", &v)); + } +} + +TEST(RE2, FloatingPointFullMatchTypes) { + std::string zeros(1000, '0'); + { + float v; + ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, float(1e23)); + ASSERT_TRUE(RE2::FullMatch(" 100", "(.*)", &v)); ASSERT_EQ(v, 100); + + ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + ASSERT_EQ(v, float(1e23)); + + // 6700000000081920.1 is an edge case. + // 6700000000081920 is exactly halfway between + // two float32s, so the .1 should make it round up. + // However, the .1 is outside the precision possible with + // a float64: the nearest float64 is 6700000000081920. + // So if the code uses strtod and then converts to float32, + // round-to-even will make it round down instead of up. + // To pass the test, the parser must call strtof directly. + // This test case is carefully chosen to use only a 17-digit + // number, since C does not guarantee to get the correctly + // rounded answer for strtod and strtof unless the input is + // short. + // + // This is known to fail on Cygwin and MinGW due to a broken + // implementation of strtof(3). And apparently MSVC too. Sigh. +#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__) + ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); + ASSERT_EQ(v, 0.1f) << StringPrintf("%.8g != %.8g", v, 0.1f); + ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", "(.*)", &v)); + ASSERT_EQ(v, 6700000000081920.1f) + << StringPrintf("%.8g != %.8g", v, 6700000000081920.1f); +#endif + } + { + double v; + ASSERT_TRUE(RE2::FullMatch("100", "(.*)", &v)); ASSERT_EQ(v, 100); + ASSERT_TRUE(RE2::FullMatch("-100.", "(.*)", &v)); ASSERT_EQ(v, -100); + ASSERT_TRUE(RE2::FullMatch("1e23", "(.*)", &v)); ASSERT_EQ(v, 1e23); + ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", "(.*)", &v)); + ASSERT_EQ(v, double(1e23)); + + ASSERT_TRUE(RE2::FullMatch("0.1", "(.*)", &v)); + ASSERT_EQ(v, 0.1) << StringPrintf("%.17g != %.17g", v, 0.1); + ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", "(.*)", &v)); + ASSERT_EQ(v, 1.0000000596046448) + << StringPrintf("%.17g != %.17g", v, 1.0000000596046448); + } +} + +TEST(RE2, FullMatchAnchored) { + int i; + // Check that matching is fully anchored + ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &i)); + ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &i)); + ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &i)); ASSERT_EQ(i, 1001); + ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &i)); ASSERT_EQ(i, 1001); +} + +TEST(RE2, FullMatchBraces) { + // Braces + ASSERT_TRUE(RE2::FullMatch("0abcd", "[0-9a-f+.-]{5,}")); + ASSERT_TRUE(RE2::FullMatch("0abcde", "[0-9a-f+.-]{5,}")); + ASSERT_FALSE(RE2::FullMatch("0abc", "[0-9a-f+.-]{5,}")); +} + +TEST(RE2, Complicated) { + // Complicated RE2 + ASSERT_TRUE(RE2::FullMatch("foo", "foo|bar|[A-Z]")); + ASSERT_TRUE(RE2::FullMatch("bar", "foo|bar|[A-Z]")); + ASSERT_TRUE(RE2::FullMatch("X", "foo|bar|[A-Z]")); + ASSERT_FALSE(RE2::FullMatch("XY", "foo|bar|[A-Z]")); +} + +TEST(RE2, FullMatchEnd) { + // Check full-match handling (needs '$' tacked on internally) + ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo")); + ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo")); + ASSERT_TRUE(RE2::FullMatch("fo", "fo|foo$")); + ASSERT_TRUE(RE2::FullMatch("foo", "fo|foo$")); + ASSERT_TRUE(RE2::FullMatch("foo", "foo$")); + ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$")); + ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar")); + + // Uncomment the following if we change the handling of '$' to + // prevent it from matching a trailing newline + if (false) { + // Check that we don't get bitten by pcre's special handling of a + // '\n' at the end of the string matching '$' + ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$")); + } +} + +TEST(RE2, FullMatchArgCount) { + // Number of args + int a[16]; + ASSERT_TRUE(RE2::FullMatch("", "")); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0])); + ASSERT_EQ(a[0], 1); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], + &a[2], &a[3])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1], + &a[2], &a[3], &a[4])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], + &a[1], &a[2], &a[3], &a[4], &a[5])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + ASSERT_EQ(a[6], 7); + + memset(a, 0, sizeof(0)); + ASSERT_TRUE(RE2::FullMatch("1234567890123456", + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" + "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", + &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7], &a[8], &a[9], &a[10], &a[11], &a[12], + &a[13], &a[14], &a[15])); + ASSERT_EQ(a[0], 1); + ASSERT_EQ(a[1], 2); + ASSERT_EQ(a[2], 3); + ASSERT_EQ(a[3], 4); + ASSERT_EQ(a[4], 5); + ASSERT_EQ(a[5], 6); + ASSERT_EQ(a[6], 7); + ASSERT_EQ(a[7], 8); + ASSERT_EQ(a[8], 9); + ASSERT_EQ(a[9], 0); + ASSERT_EQ(a[10], 1); + ASSERT_EQ(a[11], 2); + ASSERT_EQ(a[12], 3); + ASSERT_EQ(a[13], 4); + ASSERT_EQ(a[14], 5); + ASSERT_EQ(a[15], 6); +} + +TEST(RE2, Accessors) { + // Check the pattern() accessor + { + const std::string kPattern = "http://([^/]+)/.*"; + const RE2 re(kPattern); + ASSERT_EQ(kPattern, re.pattern()); + } + + // Check RE2 error field. + { + RE2 re("foo"); + ASSERT_TRUE(re.error().empty()); // Must have no error + ASSERT_TRUE(re.ok()); + ASSERT_EQ(re.error_code(), RE2::NoError); + } +} + +TEST(RE2, UTF8) { + // Check UTF-8 handling + // Three Japanese characters (nihongo) + const char utf8_string[] = { + (char)0xe6, (char)0x97, (char)0xa5, // 65e5 + (char)0xe6, (char)0x9c, (char)0xac, // 627c + (char)0xe8, (char)0xaa, (char)0x9e, // 8a9e + 0 + }; + const char utf8_pattern[] = { + '.', + (char)0xe6, (char)0x9c, (char)0xac, // 627c + '.', + 0 + }; + + // Both should match in either mode, bytes or UTF-8 + RE2 re_test1(".........", RE2::Latin1); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test1)); + RE2 re_test2("..."); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test2)); + + // Check that '.' matches one byte or UTF-8 character + // according to the mode. + std::string s; + RE2 re_test3("(.)", RE2::Latin1); + ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test3, &s)); + ASSERT_EQ(s, std::string("\xe6")); + RE2 re_test4("(.)"); + ASSERT_TRUE(RE2::PartialMatch(utf8_string, re_test4, &s)); + ASSERT_EQ(s, std::string("\xe6\x97\xa5")); + + // Check that string matches itself in either mode + RE2 re_test5(utf8_string, RE2::Latin1); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test5)); + RE2 re_test6(utf8_string); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test6)); + + // Check that pattern matches string only in UTF8 mode + RE2 re_test7(utf8_pattern, RE2::Latin1); + ASSERT_FALSE(RE2::FullMatch(utf8_string, re_test7)); + RE2 re_test8(utf8_pattern); + ASSERT_TRUE(RE2::FullMatch(utf8_string, re_test8)); +} + +TEST(RE2, UngreedyUTF8) { + // Check that ungreedy, UTF8 regular expressions don't match when they + // oughtn't -- see bug 82246. + { + // This code always worked. + const char* pattern = "\\w+X"; + const std::string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + RE2 match_sentence_re(pattern); + + ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); + ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); + } + { + const char* pattern = "(?U)\\w+X"; + const std::string target = "a aX"; + RE2 match_sentence(pattern, RE2::Latin1); + ASSERT_EQ(match_sentence.error(), ""); + RE2 match_sentence_re(pattern); + + ASSERT_FALSE(RE2::FullMatch(target, match_sentence)); + ASSERT_FALSE(RE2::FullMatch(target, match_sentence_re)); + } +} + +TEST(RE2, Rejects) { + { + RE2 re("a\\1", RE2::Quiet); + ASSERT_FALSE(re.ok()); } + { + RE2 re("a[x", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a[z-a]", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a[[:foobar:]]", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a(b", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } + { + RE2 re("a\\", RE2::Quiet); + ASSERT_FALSE(re.ok()); + } +} + +TEST(RE2, NoCrash) { + // Test that using a bad regexp doesn't crash. + { + RE2 re("a\\", RE2::Quiet); + ASSERT_FALSE(re.ok()); + ASSERT_FALSE(RE2::PartialMatch("a\\b", re)); + } + + // Test that using an enormous regexp doesn't crash + { + RE2 re("(((.{100}){100}){100}){100}", RE2::Quiet); + ASSERT_FALSE(re.ok()); + ASSERT_FALSE(RE2::PartialMatch("aaa", re)); + } + + // Test that a crazy regexp still compiles and runs. + { + RE2 re(".{512}x", RE2::Quiet); + ASSERT_TRUE(re.ok()); + std::string s; + s.append(515, 'c'); + s.append("x"); + ASSERT_TRUE(RE2::PartialMatch(s, re)); + } +} + +TEST(RE2, Recursion) { + // Test that recursion is stopped. + // This test is PCRE-legacy -- there's no recursion in RE2. + int bytes = 15 * 1024; // enough to crash PCRE + TestRecursion(bytes, "."); + TestRecursion(bytes, "a"); + TestRecursion(bytes, "a."); + TestRecursion(bytes, "ab."); + TestRecursion(bytes, "abc."); +} + +TEST(RE2, BigCountedRepetition) { + // Test that counted repetition works, given tons of memory. + RE2::Options opt; + opt.set_max_mem(256<<20); + + RE2 re(".{512}x", opt); + ASSERT_TRUE(re.ok()); + std::string s; + s.append(515, 'c'); + s.append("x"); + ASSERT_TRUE(RE2::PartialMatch(s, re)); +} + +TEST(RE2, DeepRecursion) { + // Test for deep stack recursion. This would fail with a + // segmentation violation due to stack overflow before pcre was + // patched. + // Again, a PCRE legacy test. RE2 doesn't recurse. + std::string comment("x*"); + std::string a(131072, 'a'); + comment += a; + comment += "*x"; + RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)"); + ASSERT_TRUE(RE2::FullMatch(comment, re)); +} + +// Suggested by Josh Hyman. Failed when SearchOnePass was +// not implementing case-folding. +TEST(CaseInsensitive, MatchAndConsume) { + std::string text = "A fish named *Wanda*"; + StringPiece sp(text); + StringPiece result; + EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result)); + EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result)); +} + +// RE2 should permit implicit conversions from string, StringPiece, const char*, +// and C string literals. +TEST(RE2, ImplicitConversions) { + std::string re_string("."); + StringPiece re_stringpiece("."); + const char* re_cstring = "."; + EXPECT_TRUE(RE2::PartialMatch("e", re_string)); + EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece)); + EXPECT_TRUE(RE2::PartialMatch("e", re_cstring)); + EXPECT_TRUE(RE2::PartialMatch("e", ".")); +} + +// Bugs introduced by 8622304 +TEST(RE2, CL8622304) { + // reported by ingow + std::string dir; + EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok + EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails + + // reported by jacobsa + std::string key, val; + EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true", + "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?", + &key, + &val)); + EXPECT_EQ(key, "bar"); + EXPECT_EQ(val, "1,0x2F,030,4,5"); +} + +// Check that RE2 returns correct regexp pieces on error. +// In particular, make sure it returns whole runes +// and that it always reports invalid UTF-8. +// Also check that Perl error flag piece is big enough. +static struct ErrorTest { + const char *regexp; + RE2::ErrorCode error_code; + const char *error_arg; +} error_tests[] = { + { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, + { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, + { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, + { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, + { "kl\\x", RE2::ErrorBadEscape, "\\x" }, + { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, + { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, + // used to return (?s but the error is X + { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, + { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, + { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, + { "abc(def", RE2::ErrorMissingParen, "abc(def" }, + { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, + + // no argument string returned for invalid UTF-8 + { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, + { "op\377qr", RE2::ErrorBadUTF8, "" }, + { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, + { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, + { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, + { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" }, +}; +TEST(RE2, ErrorCodeAndArg) { + for (size_t i = 0; i < arraysize(error_tests); i++) { + RE2 re(error_tests[i].regexp, RE2::Quiet); + EXPECT_FALSE(re.ok()); + EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); + EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); + } +} + +// Check that "never match \n" mode never matches \n. +static struct NeverTest { + const char* regexp; + const char* text; + const char* match; +} never_tests[] = { + { "(.*)", "abc\ndef\nghi\n", "abc" }, + { "(?s)(abc.*def)", "abc\ndef\n", NULL }, + { "(abc(.|\n)*def)", "abc\ndef\n", NULL }, + { "(abc[^x]*def)", "abc\ndef\n", NULL }, + { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" }, +}; +TEST(RE2, NeverNewline) { + RE2::Options opt; + opt.set_never_nl(true); + for (size_t i = 0; i < arraysize(never_tests); i++) { + const NeverTest& t = never_tests[i]; + RE2 re(t.regexp, opt); + if (t.match == NULL) { + EXPECT_FALSE(re.PartialMatch(t.text, re)); + } else { + StringPiece m; + EXPECT_TRUE(re.PartialMatch(t.text, re, &m)); + EXPECT_EQ(m, t.match); + } + } +} + +// Check that dot_nl option works. +TEST(RE2, DotNL) { + RE2::Options opt; + opt.set_dot_nl(true); + EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt))); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt))); + opt.set_never_nl(true); + EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt))); +} + +// Check that there are no capturing groups in "never capture" mode. +TEST(RE2, NeverCapture) { + RE2::Options opt; + opt.set_never_capture(true); + RE2 re("(r)(e)", opt); + EXPECT_EQ(0, re.NumberOfCapturingGroups()); +} + +// Bitstate bug was looking at submatch[0] even if nsubmatch == 0. +// Triggered by a failed DFA search falling back to Bitstate when +// using Match with a NULL submatch set. Bitstate tried to read +// the submatch[0] entry even if nsubmatch was 0. +TEST(RE2, BitstateCaptureBug) { + RE2::Options opt; + opt.set_max_mem(20000); + RE2 re("(_________$)", opt); + StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x"; + EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0)); +} + +// C++ version of bug 609710. +TEST(RE2, UnicodeClasses) { + const std::string str = "ABCDEFGHI譚永鋒"; + std::string a, b, c; + + EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}")); + EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}")); + EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}")); + + EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}")); + EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}")); + EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}")); + EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}")); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}")); + + EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("A", a); + EXPECT_EQ("B", b); + EXPECT_EQ("C", c); + + EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]")); + + EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c)); + EXPECT_EQ("譚", a); + EXPECT_EQ("永", b); + EXPECT_EQ("鋒", c); +} + +TEST(RE2, LazyRE2) { + // Test with and without options. + static LazyRE2 a = {"a"}; + static LazyRE2 b = {"b", RE2::Latin1}; + + EXPECT_EQ("a", a->pattern()); + EXPECT_EQ(RE2::Options::EncodingUTF8, a->options().encoding()); + + EXPECT_EQ("b", b->pattern()); + EXPECT_EQ(RE2::Options::EncodingLatin1, b->options().encoding()); +} + +// Bug reported by saito. 2009/02/17 +TEST(RE2, NullVsEmptyString) { + RE2 re(".*"); + EXPECT_TRUE(re.ok()); + + StringPiece null; + EXPECT_TRUE(RE2::FullMatch(null, re)); + + StringPiece empty(""); + EXPECT_TRUE(RE2::FullMatch(empty, re)); +} + +// Similar to the previous test, check that the null string and the empty +// string both match, but also that the null string can only provide null +// submatches whereas the empty string can also provide empty submatches. +TEST(RE2, NullVsEmptyStringSubmatches) { + RE2 re("()|(foo)"); + EXPECT_TRUE(re.ok()); + + // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent. + StringPiece matches[4]; + + for (size_t i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece null; + EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + for (size_t i = 0; i < arraysize(matches); i++) { + EXPECT_TRUE(matches[i].data() == NULL); // always null + EXPECT_TRUE(matches[i].empty()); + } + + for (size_t i = 0; i < arraysize(matches); i++) + matches[i] = "bar"; + + StringPiece empty(""); + EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED, + matches, arraysize(matches))); + EXPECT_TRUE(matches[0].data() != NULL); // empty, not null + EXPECT_TRUE(matches[0].empty()); + EXPECT_TRUE(matches[1].data() != NULL); // empty, not null + EXPECT_TRUE(matches[1].empty()); + EXPECT_TRUE(matches[2].data() == NULL); + EXPECT_TRUE(matches[2].empty()); + EXPECT_TRUE(matches[3].data() == NULL); + EXPECT_TRUE(matches[3].empty()); +} + +// Issue 1816809 +TEST(RE2, Bug1816809) { + RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))"); + StringPiece piece("llx-3;llx4"); + std::string x; + EXPECT_TRUE(RE2::Consume(&piece, re, &x)); +} + +// Issue 3061120 +TEST(RE2, Bug3061120) { + RE2 re("(?i)\\W"); + EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked + EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin + EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s +} + +TEST(RE2, CapturingGroupNames) { + // Opening parentheses annotated with group IDs: + // 12 3 45 6 7 + RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))"); + EXPECT_TRUE(re.ok()); + const std::map<int, std::string>& have = re.CapturingGroupNames(); + std::map<int, std::string> want; + want[3] = "G2"; + want[6] = "G2"; + want[7] = "G1"; + EXPECT_EQ(want, have); +} + +TEST(RE2, RegexpToStringLossOfAnchor) { + EXPECT_EQ(RE2("^[a-c]at", RE2::POSIX).Regexp()->ToString(), "^[a-c]at"); + EXPECT_EQ(RE2("^[a-c]at").Regexp()->ToString(), "(?-m:^)[a-c]at"); + EXPECT_EQ(RE2("ca[t-z]$", RE2::POSIX).Regexp()->ToString(), "ca[t-z]$"); + EXPECT_EQ(RE2("ca[t-z]$").Regexp()->ToString(), "ca[t-z](?-m:$)"); +} + +// Issue 10131674 +TEST(RE2, Bug10131674) { + // Some of these escapes describe values that do not fit in a byte. + RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1); + EXPECT_FALSE(re.ok()); + EXPECT_FALSE(RE2::FullMatch("hello world", re)); +} + +TEST(RE2, Bug18391750) { + // Stray write past end of match_ in nfa.cc, caught by fuzzing + address sanitizer. + const char t[] = { + (char)0x28, (char)0x28, (char)0xfc, (char)0xfc, (char)0x08, (char)0x08, + (char)0x26, (char)0x26, (char)0x28, (char)0xc2, (char)0x9b, (char)0xc5, + (char)0xc5, (char)0xd4, (char)0x8f, (char)0x8f, (char)0x69, (char)0x69, + (char)0xe7, (char)0x29, (char)0x7b, (char)0x37, (char)0x31, (char)0x31, + (char)0x7d, (char)0xae, (char)0x7c, (char)0x7c, (char)0xf3, (char)0x29, + (char)0xae, (char)0xae, (char)0x2e, (char)0x2a, (char)0x29, (char)0x00, + }; + RE2::Options opt; + opt.set_encoding(RE2::Options::EncodingLatin1); + opt.set_longest_match(true); + opt.set_dot_nl(true); + opt.set_case_sensitive(false); + RE2 re(t, opt); + ASSERT_TRUE(re.ok()); + RE2::PartialMatch(t, re); +} + +TEST(RE2, Bug18458852) { + // Bug in parser accepting invalid (too large) rune, + // causing compiler to fail in DCHECK in UTF-8 + // character class code. + const char b[] = { + (char)0x28, (char)0x05, (char)0x05, (char)0x41, (char)0x41, (char)0x28, + (char)0x24, (char)0x5b, (char)0x5e, (char)0xf5, (char)0x87, (char)0x87, + (char)0x90, (char)0x29, (char)0x5d, (char)0x29, (char)0x29, (char)0x00, + }; + RE2 re(b); + ASSERT_FALSE(re.ok()); +} + +TEST(RE2, Bug18523943) { + // Bug in BitState: case kFailInst failed the match entirely. + + RE2::Options opt; + const char a[] = { + (char)0x29, (char)0x29, (char)0x24, (char)0x00, + }; + const char b[] = { + (char)0x28, (char)0x0a, (char)0x2a, (char)0x2a, (char)0x29, (char)0x00, + }; + opt.set_log_errors(false); + opt.set_encoding(RE2::Options::EncodingLatin1); + opt.set_posix_syntax(true); + opt.set_longest_match(true); + opt.set_literal(false); + opt.set_never_nl(true); + + RE2 re((const char*)b, opt); + ASSERT_TRUE(re.ok()); + std::string s1; + ASSERT_TRUE(RE2::PartialMatch((const char*)a, re, &s1)); +} + +TEST(RE2, Bug21371806) { + // Bug in parser accepting Unicode groups in Latin-1 mode, + // causing compiler to fail in DCHECK in prog.cc. + + RE2::Options opt; + opt.set_encoding(RE2::Options::EncodingLatin1); + + RE2 re("g\\p{Zl}]", opt); + ASSERT_TRUE(re.ok()); +} + +TEST(RE2, Bug26356109) { + // Bug in parser caused by factoring of common prefixes in alternations. + + // In the past, this was factored to "a\\C*?[bc]". Thus, the automaton would + // consume "ab" and then stop (when unanchored) whereas it should consume all + // of "abc" as per first-match semantics. + RE2 re("a\\C*?c|a\\C*?b"); + ASSERT_TRUE(re.ok()); + + std::string s = "abc"; + StringPiece m; + + ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, s) << " (UNANCHORED) got m='" << m << "', want '" << s << "'"; + + ASSERT_TRUE(re.Match(s, 0, s.size(), RE2::ANCHOR_BOTH, &m, 1)); + ASSERT_EQ(m, s) << " (ANCHOR_BOTH) got m='" << m << "', want '" << s << "'"; +} + +TEST(RE2, Issue104) { + // RE2::GlobalReplace always advanced by one byte when the empty string was + // matched, which would clobber any rune that is longer than one byte. + + std::string s = "bc"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "a*", "d")); + ASSERT_EQ("dbdcd", s); + + s = "ąć"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "Ć*", "Ĉ")); + ASSERT_EQ("ĈąĈćĈ", s); + + s = "人类"; + ASSERT_EQ(3, RE2::GlobalReplace(&s, "大*", "小")); + ASSERT_EQ("小人小类小", s); +} + +TEST(RE2, Issue310) { + // (?:|a)* matched more text than (?:|a)+ did. + + std::string s = "aaa"; + StringPiece m; + + RE2 star("(?:|a)*"); + ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; + + RE2 plus("(?:|a)+"); + ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; +} + +} // namespace re2 |