diff options
author | Jacob Evans <jacob.evans@10gen.com> | 2020-10-12 20:51:23 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-10-15 00:59:10 +0000 |
commit | b0ef26c639112b50648a02d969298650fbd402a4 (patch) | |
tree | 7e60b84b601536b56c6e3f699dce7451350b671b | |
parent | d881b2b32dda7389e99efd40e4a96e34de082281 (diff) | |
download | mongo-b0ef26c639112b50648a02d969298650fbd402a4.tar.gz |
SERVER-51083 Reject invalid UTF-8 from $regex match expressions
-rw-r--r-- | src/mongo/db/matcher/expression_leaf.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/matcher/expression_leaf_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/query/query_planner_test.cpp | 15 |
3 files changed, 29 insertions, 0 deletions
```diff
diff --git a/src/mongo/db/matcher/expression_leaf.cpp b/src/mongo/db/matcher/expression_leaf.cpp
index ffe3a660a28..b1ca9d30342 100644
--- a/src/mongo/db/matcher/expression_leaf.cpp
+++ b/src/mongo/db/matcher/expression_leaf.cpp
@@ -46,6 +46,7 @@
 #include "mongo/db/query/collation/collator_interface.h"
 #include "mongo/stdx/memory.h"
 #include "mongo/util/mongoutils/str.h"
+#include "mongo/util/text.h"
 
 namespace mongo {
 
@@ -226,6 +227,13 @@ Status RegexMatchExpression::init(StringData path, StringData regex, StringData
 
     _regex = regex.toString();
     _flags = options.toString();
+
+    // isValidUTF8() checks for UTF-8 which does not map to a series of codepoints but does not
+    // check the validity of the code points themselves. These situations do not cause problems
+    // downstream so we do not do additional work to enforce that the code points are valid.
+    uassert(
+        5108300, "Regular expression is invalid UTF-8", isValidUTF8(_regex) && isValidUTF8(_flags));
+
     _re.reset(new pcrecpp::RE(_regex.c_str(), flags2options(_flags.c_str())));
 
     return setPath(path);
diff --git a/src/mongo/db/matcher/expression_leaf_test.cpp b/src/mongo/db/matcher/expression_leaf_test.cpp
index 9d655f3734d..af60b5f05d4 100644
--- a/src/mongo/db/matcher/expression_leaf_test.cpp
+++ b/src/mongo/db/matcher/expression_leaf_test.cpp
@@ -917,6 +917,12 @@ TEST(RegexMatchExpression, TooLargePattern) {
     ASSERT(!regex.init("a", tooLargePattern, "").isOK());
 }
 
+TEST(RegexMatchExpression, RegexCannotBeInvalidUTF8) {
+    ASSERT_THROWS_CODE(RegexMatchExpression("path", "^\xff\xff", ""), AssertionException, 5108300);
+    ASSERT_THROWS_CODE(
+        RegexMatchExpression("path", "^42", "\xff\xff"), AssertionException, 5108300);
+}
+
 TEST(RegexMatchExpression, MatchesElementSimplePrefix) {
     BSONObj match = BSON("x"
                          << "abc");
diff --git a/src/mongo/db/query/query_planner_test.cpp b/src/mongo/db/query/query_planner_test.cpp
index 6f69c90e520..417081795a6 100644
--- a/src/mongo/db/query/query_planner_test.cpp
+++ b/src/mongo/db/query/query_planner_test.cpp
@@ -6076,4 +6076,19 @@ TEST_F(QueryPlannerTest, LockstepOrEnumerationApplysToEachOrInTree) {
         "]}}");
 }
 
+TEST_F(QueryPlannerTest, InvalidUtf8CodePointDoesNotLeadToInvalidIndexBoundsInvariantFailure) {
+    params.options &= ~QueryPlannerParams::INCLUDE_COLLSCAN;
+    addIndex(BSON("a" << 1));
+
+    // This UTF-8 is encoded correctly in the sense that it maps to a sequence of code points. The
+    // code point 0x110000 is considered invalid. This does not result in an error because it does
+    // not trigger a bounds building invariant.
+    auto invalidCodePoint = std::string{"\xf4\x90\x80\x80"};
+    auto findCommandWithInvalidCodepoint = BSON("find"
+                                                << "testns"
+                                                << "filter"
+                                                << BSON("a" << BSON("$regex" << invalidCodePoint)));
+    runQueryAsCommand(findCommandWithInvalidCodepoint);
+}
+
 } // namespace
```