From 95b038c4078a30a797f369b9df1b22ddb427de5f Mon Sep 17 00:00:00 2001
From: Ian Boros <ian.boros@mongodb.com>
Date: Wed, 9 Oct 2019 18:12:41 +0000
Subject: SERVER-42433 use ProjectionAST in canonical query encoding

---
 src/mongo/db/query/canonical_query_encoder.cpp     | 74 +++++++++++-----------
 .../db/query/canonical_query_encoder_test.cpp      | 68 ++++++++++++--------
 2 files changed, 80 insertions(+), 62 deletions(-)

diff --git a/src/mongo/db/query/canonical_query_encoder.cpp b/src/mongo/db/query/canonical_query_encoder.cpp
index 6698e56766c..18ba78b5476 100644
--- a/src/mongo/db/query/canonical_query_encoder.cpp
+++ b/src/mongo/db/query/canonical_query_encoder.cpp
@@ -38,6 +38,7 @@
 #include "mongo/base/simple_string_data_comparator.h"
 #include "mongo/db/matcher/expression_array.h"
 #include "mongo/db/matcher/expression_geo.h"
+#include "mongo/db/query/projection.h"
 #include "mongo/util/log.h"
 
 namespace mongo {
@@ -49,6 +50,7 @@ const char kEncodeChildrenEnd = ']';
 const char kEncodeChildrenSeparator = ',';
 const char kEncodeCollationSection = '#';
 const char kEncodeProjectionSection = '|';
+const char kEncodeProjectionRequirementSeparator = '-';
 const char kEncodeRegexFlagsSeparator = '/';
 const char kEncodeSortSection = '~';
 
@@ -65,6 +67,7 @@ void encodeUserString(StringData s, StringBuilder* keyBuilder) {
             case kEncodeChildrenSeparator:
             case kEncodeCollationSection:
             case kEncodeProjectionSection:
+            case kEncodeProjectionRequirementSeparator:
             case kEncodeRegexFlagsSeparator:
             case kEncodeSortSection:
             case '\\':
@@ -463,51 +466,48 @@ void encodeKeyForSort(const BSONObj& sortObj, StringBuilder* keyBuilder) {
 }
 
 /**
- * Encodes parsed projection into cache key.
- * Does a simple toString() on each projected field
- * in the BSON object.
- * Orders the encoded elements in the projection by field name.
- * This handles all the special projection types ($meta, $elemMatch, etc.)
+ * Encodes projection AST into a cache key.
+ *
+ * For projections which have a finite set of required fields (inclusion-only projections), encodes
+ * those field names in order.
+ *
+ * For projections which require the entire document (exclusion projections, projections with
+ * expressions), the projection section is empty.
  */
-void encodeKeyForProj(const BSONObj& projObj, StringBuilder* keyBuilder) {
-    // Sorts the BSON elements by field name using a map.
-    std::map<StringData, BSONElement> elements;
+void encodeKeyForProj(const projection_ast::Projection* proj, StringBuilder* keyBuilder) {
+    if (!proj || proj->requiresDocument()) {
+        // Don't encode anything for the projection section to indicate the entire document is
+        // required.
+        return;
+    }
 
-    BSONObjIterator it(projObj);
-    while (it.more()) {
-        BSONElement elt = it.next();
-        StringData fieldName = elt.fieldNameStringData();
+    std::vector<std::string> requiredFields = proj->getRequiredFields();
+    invariant(!requiredFields.empty());
 
-        // Internal callers may add $-prefixed fields to the projection. These are not part of a
-        // user query, and therefore are not considered part of the cache key.
-        if (fieldName[0] == '$') {
-            continue;
-        }
+    // Keep track of whether we appended the character marking the beginning of the projection
+    // section. We may not have to if the only required field is the internally added "$sortKey".
+    bool appendedStart = false;
 
-        elements[fieldName] = elt;
-    }
+    // Encode the fields required by the projection in order.
+    std::sort(requiredFields.begin(), requiredFields.end());
+    for (auto&& requiredField : requiredFields) {
+        invariant(!requiredField.empty());
 
-    if (!elements.empty()) {
-        *keyBuilder << kEncodeProjectionSection;
-    }
+        // Internal callers (e.g., from mongos) may add "$sortKey" to the projection. This is not
+        // part of the user query, and therefore is not considered part of the cache key.
+        if (requiredField == "$sortKey") {
+            continue;
+        }
 
-    // Read elements in order of field name
-    for (std::map<StringData, BSONElement>::const_iterator i = elements.begin();
-         i != elements.end();
-         ++i) {
-        const BSONElement& elt = (*i).second;
+        const bool isFirst = !appendedStart;
 
-        if (elt.type() != BSONType::Object) {
-            // For inclusion/exclusion projections, we encode as "i" or "e".
-            *keyBuilder << (elt.trueValue() ? "i" : "e");
+        if (isFirst) {
+            *keyBuilder << kEncodeProjectionSection;
+            appendedStart = true;
         } else {
-            // For projection operators, we use the verbatim string encoding of the element.
-            encodeUserString(elt.toString(false,   // includeFieldName
-                                          false),  // full
-                             keyBuilder);
+            *keyBuilder << kEncodeProjectionRequirementSeparator;
         }
-
-        encodeUserString(elt.fieldName(), keyBuilder);
+        encodeUserString(requiredField, keyBuilder);
     }
 }
 }  // namespace
@@ -518,7 +518,7 @@ CanonicalQuery::QueryShapeString encode(const CanonicalQuery& cq) {
     StringBuilder keyBuilder;
     encodeKeyForMatch(cq.root(), &keyBuilder);
     encodeKeyForSort(cq.getQueryRequest().getSort(), &keyBuilder);
-    encodeKeyForProj(cq.getQueryRequest().getProj(), &keyBuilder);
+    encodeKeyForProj(cq.getProj(), &keyBuilder);
     encodeCollation(cq.getCollator(), &keyBuilder);
 
     return keyBuilder.str();
diff --git a/src/mongo/db/query/canonical_query_encoder_test.cpp b/src/mongo/db/query/canonical_query_encoder_test.cpp
index 4086f64402f..1d647910ce8 100644
--- a/src/mongo/db/query/canonical_query_encoder_test.cpp
+++ b/src/mongo/db/query/canonical_query_encoder_test.cpp
@@ -123,50 +123,68 @@ TEST(CanonicalQueryEncoderTest, ComputeKey) {
     // With sort
     testComputeKey("{}", "{a: 1}", "{}", "an~aa");
     testComputeKey("{}", "{a: -1}", "{}", "an~da");
-    testComputeKey("{}",
-                   "{a: {$meta: 'textScore'}}",
-                   "{a: {$meta: 'textScore'}}",
-                   "an~ta|{ $meta: \"textScore\" }a");
+    testComputeKey("{}", "{a: {$meta: 'textScore'}}", "{a: {$meta: 'textScore'}}", "an~ta");
     testComputeKey("{a: 1}", "{b: 1}", "{}", "eqa~ab");
 
     // With projection
-    testComputeKey("{}", "{}", "{a: 1}", "an|ia");
-    testComputeKey("{}", "{}", "{a: -1}", "an|ia");
-    testComputeKey("{}", "{}", "{a: -1.0}", "an|ia");
-    testComputeKey("{}", "{}", "{a: true}", "an|ia");
-    testComputeKey("{}", "{}", "{a: 0}", "an|ea");
-    testComputeKey("{}", "{}", "{a: false}", "an|ea");
-    testComputeKey("{}", "{}", "{a: 99}", "an|ia");
-    testComputeKey("{}", "{}", "{a: 'foo'}", "an|ia");
-    testComputeKey("{}", "{}", "{a: {$slice: [3, 5]}}", "an|{ $slice: \\[ 3\\, 5 \\] }a");
-    testComputeKey("{}", "{}", "{a: {$elemMatch: {x: 2}}}", "an|{ $elemMatch: { x: 2 } }a");
-    testComputeKey("{}", "{}", "{a: ObjectId('507f191e810c19729de860ea')}", "an|ia");
-    testComputeKey("{a: 1}", "{}", "{'a.$': 1}", "eqa|ia.$");
-    testComputeKey("{a: 1}", "{}", "{a: 1}", "eqa|ia");
+    testComputeKey("{}", "{}", "{a: 1}", "an|_id-a");
+    testComputeKey("{}", "{}", "{a: -1}", "an|_id-a");
+    testComputeKey("{}", "{}", "{a: -1.0}", "an|_id-a");
+    testComputeKey("{}", "{}", "{a: true}", "an|_id-a");
+    testComputeKey("{}", "{}", "{a: 0}", "an");
+    testComputeKey("{}", "{}", "{a: false}", "an");
+    testComputeKey("{}", "{}", "{a: 99}", "an|_id-a");
+    testComputeKey("{}", "{}", "{a: 'foo'}", "an|_id-a");
+    // $slice defaults to exclusion.
+    testComputeKey("{}", "{}", "{a: {$slice: [3, 5]}}", "an");
+    testComputeKey("{}", "{}", "{a: {$slice: [3, 5]}, b: 0}", "an");
+
+    // But even when using $slice in an inclusion, the entire document is needed.
+    testComputeKey("{}", "{}", "{a: {$slice: [3, 5]}, b: 1}", "an");
+
+    testComputeKey("{}", "{}", "{a: {$elemMatch: {x: 2}}}", "an");
+    testComputeKey("{}", "{}", "{a: {$elemMatch: {x: 2}}, b: 0}", "an");
+    testComputeKey("{}", "{}", "{a: {$elemMatch: {x: 2}}, b: 1}", "an");
+
+    testComputeKey("{}", "{}", "{a: {$slice: [3, 5]}, b: {$elemMatch: {x: 2}}}", "an");
+
+    testComputeKey("{}", "{}", "{a: ObjectId('507f191e810c19729de860ea')}", "an|_id-a");
+    testComputeKey("{a: 1}", "{}", "{'a.$': 1}", "eqa");
+    testComputeKey("{a: 1}", "{}", "{a: 1}", "eqa|_id-a");
 
     // Projection should be order-insensitive
-    testComputeKey("{}", "{}", "{a: 1, b: 1}", "an|iaib");
-    testComputeKey("{}", "{}", "{b: 1, a: 1}", "an|iaib");
+    testComputeKey("{}", "{}", "{a: 1, b: 1}", "an|_id-a-b");
+    testComputeKey("{}", "{}", "{b: 1, a: 1}", "an|_id-a-b");
+
+    // And should escape the separation character.
+    testComputeKey("{}", "{}", "{'b-1': 1, 'a-2': 1}", "an|_id-a\\-2-b\\-1");
+
+    // And should exclude the "$sortKey" field, which can be added internally.
+    testComputeKey("{}", "{x: 1}", "{$sortKey: {$meta: 'sortKey'}}", "an~ax");
+    testComputeKey("{}", "{}", "{}", "an");
+
+    testComputeKey("{}", "{x: 1}", "{a: 1, $sortKey: {$meta: 'sortKey'}}", "an~ax|_id-a");
+    testComputeKey("{}", "{}", "{a: 1}", "an|_id-a");
 
     // With or-elimination and projection
-    testComputeKey("{$or: [{a: 1}]}", "{}", "{_id: 0, a: 1}", "eqa|e_idia");
-    testComputeKey("{$or: [{a: 1}]}", "{}", "{'a.$': 1}", "eqa|ia.$");
+    testComputeKey("{$or: [{a: 1}]}", "{}", "{_id: 0, a: 1}", "eqa|a");
+    testComputeKey("{$or: [{a: 1}]}", "{}", "{'a.$': 1}", "eqa");
 }
 
 // Delimiters found in user field names or non-standard projection field values
 // must be escaped.
 TEST(CanonicalQueryEncoderTest, ComputeKeyEscaped) {
     // Field name in query.
-    testComputeKey("{'a,[]~|<>': 1}", "{}", "{}", "eqa\\,\\[\\]\\~\\|<>");
+    testComputeKey("{'a,[]~|-<>': 1}", "{}", "{}", "eqa\\,\\[\\]\\~\\|\\-<>");
 
     // Field name in sort.
-    testComputeKey("{}", "{'a,[]~|<>': 1}", "{}", "an~aa\\,\\[\\]\\~\\|<>");
+    testComputeKey("{}", "{'a,[]~|-<>': 1}", "{}", "an~aa\\,\\[\\]\\~\\|\\-<>");
 
     // Field name in projection.
-    testComputeKey("{}", "{}", "{'a,[]~|<>': 1}", "an|ia\\,\\[\\]\\~\\|<>");
+    testComputeKey("{}", "{}", "{'a,[]~|-<>': 1}", "an|_id-a\\,\\[\\]\\~\\|\\-<>");
 
     // Value in projection.
-    testComputeKey("{}", "{}", "{a: 'foo,[]~|<>'}", "an|ia");
+    testComputeKey("{}", "{}", "{a: 'foo,[]~|-<>'}", "an|_id-a");
 }
 
 // Cache keys for $geoWithin queries with legacy and GeoJSON coordinates should
-- 
cgit v1.2.1