summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jacob Evans <jacob.evans@10gen.com> 2020-10-12 20:51:23 -0400
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-10-15 00:59:10 +0000
commit b0ef26c639112b50648a02d969298650fbd402a4 (patch)
tree 7e60b84b601536b56c6e3f699dce7451350b671b
parent d881b2b32dda7389e99efd40e4a96e34de082281 (diff)
download mongo-b0ef26c639112b50648a02d969298650fbd402a4.tar.gz
SERVER-51083 Reject invalid UTF-8 from $regex match expressions
-rw-r--r-- src/mongo/db/matcher/expression_leaf.cpp 8
-rw-r--r-- src/mongo/db/matcher/expression_leaf_test.cpp 6
-rw-r--r-- src/mongo/db/query/query_planner_test.cpp 15
3 files changed, 29 insertions, 0 deletions
diff --git a/src/mongo/db/matcher/expression_leaf.cpp b/src/mongo/db/matcher/expression_leaf.cpp
index ffe3a660a28..b1ca9d30342 100644
--- a/src/mongo/db/matcher/expression_leaf.cpp
+++ b/src/mongo/db/matcher/expression_leaf.cpp
@@ -46,6 +46,7 @@
#include "mongo/db/query/collation/collator_interface.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/text.h"
namespace mongo {
@@ -226,6 +227,13 @@ Status RegexMatchExpression::init(StringData path, StringData regex, StringData
_regex = regex.toString();
_flags = options.toString();
+
+ // isValidUTF8() rejects byte sequences that cannot be decoded into a series of code points, but
+ // it does not check the validity of the code points themselves. Invalid code points do not cause
+ // problems downstream, so we do not do additional work to enforce that they are valid.
+ uassert(
+ 5108300, "Regular expression is invalid UTF-8", isValidUTF8(_regex) && isValidUTF8(_flags));
+
_re.reset(new pcrecpp::RE(_regex.c_str(), flags2options(_flags.c_str())));
return setPath(path);
diff --git a/src/mongo/db/matcher/expression_leaf_test.cpp b/src/mongo/db/matcher/expression_leaf_test.cpp
index 9d655f3734d..af60b5f05d4 100644
--- a/src/mongo/db/matcher/expression_leaf_test.cpp
+++ b/src/mongo/db/matcher/expression_leaf_test.cpp
@@ -917,6 +917,12 @@ TEST(RegexMatchExpression, TooLargePattern) {
ASSERT(!regex.init("a", tooLargePattern, "").isOK());
}
+TEST(RegexMatchExpression, RegexCannotBeInvalidUTF8) {  // byte 0xFF never occurs in well-formed UTF-8
+ ASSERT_THROWS_CODE(RegexMatchExpression("path", "^\xff\xff", ""), AssertionException, 5108300);  // malformed bytes in the second (pattern) argument
+ ASSERT_THROWS_CODE(
+ RegexMatchExpression("path", "^42", "\xff\xff"), AssertionException, 5108300);  // malformed bytes in the third (options) argument
+}
+
TEST(RegexMatchExpression, MatchesElementSimplePrefix) {
BSONObj match = BSON("x"
<< "abc");
diff --git a/src/mongo/db/query/query_planner_test.cpp b/src/mongo/db/query/query_planner_test.cpp
index 6f69c90e520..417081795a6 100644
--- a/src/mongo/db/query/query_planner_test.cpp
+++ b/src/mongo/db/query/query_planner_test.cpp
@@ -6076,4 +6076,19 @@ TEST_F(QueryPlannerTest, LockstepOrEnumerationApplysToEachOrInTree) {
"]}}");
}
+TEST_F(QueryPlannerTest, InvalidUtf8CodePointDoesNotLeadToInvalidIndexBoundsInvariantFailure) {
+ params.options &= ~QueryPlannerParams::INCLUDE_COLLSCAN;  // clear the INCLUDE_COLLSCAN option bit
+ addIndex(BSON("a" << 1));
+
+ // These bytes are structurally well-formed UTF-8 (they decode to a code point), but the decoded
+ // value, 0x110000, lies above the Unicode maximum of 0x10FFFF and is therefore invalid. This does
+ // not result in an error because it does not trigger a bounds building invariant.
+ auto invalidCodePoint = std::string{"\xf4\x90\x80\x80"};
+ auto findCommandWithInvalidCodepoint = BSON("find"
+ << "testns"
+ << "filter"
+ << BSON("a" << BSON("$regex" << invalidCodePoint)));
+ runQueryAsCommand(findCommandWithInvalidCodepoint);
+}
+
} // namespace