SERVER-62499 Augment DocumentStorage APIs to accept pre-computed hashes

author: Dan Larkin-York <dan.larkin-york@mongodb.com> 2022-01-13 13:53:05 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-01-13 14:38:25 +0000
commit: c67e0f941ae7278478e250c7692ef76b4a65953d (patch)
tree: 9d97f6033cf6a38bdcef7a5d747572c8f3f402c2
parent: 429faa23081c2f97e953ca878368dca189df9986 (diff)
download: mongo-c67e0f941ae7278478e250c7692ef76b4a65953d.tar.gz
11 files changed, 356 insertions, 92 deletions
diff --git a/src/mongo/bson/util/bsoncolumn.cpp b/src/mongo/bson/util/bsoncolumn.cpp
index c681ba3ddae..e13d1fd80ae 100644
--- a/src/mongo/bson/util/bsoncolumn.cpp
+++ b/src/mongo/bson/util/bsoncolumn.cpp
@@ -28,12 +28,14 @@
  */
 
 #include "mongo/bson/util/bsoncolumn.h"
+
+#include <algorithm>
+#include <third_party/murmurhash3/MurmurHash3.h>
+
 #include "mongo/bson/bsonobj.h"
 #include "mongo/bson/util/bsoncolumn_util.h"
 #include "mongo/bson/util/simple8b_type_util.h"
 
-#include <algorithm>
-
 namespace mongo {
 using namespace bsoncolumn;
 
@@ -102,6 +104,13 @@ private:
     ElementFunc _elemFunc;
 };
 
+std::size_t hashName(StringData sd) {
+    // Keep in sync with DocumentStorageHasher
+    unsigned out;
+    MurmurHash3_x86_32(sd.rawData(), sd.size(), 0, &out);
+    return out;
+}
+
 }  // namespace
 
 BSONColumn::ElementStorage::Element::Element(char* buffer, int nameSize, int valueSize)
@@ -761,6 +770,7 @@ BSONColumn::BSONColumn(BSONElement bin) {
 
     _binary = bin.binData(_size);
     _name = bin.fieldNameStringData().toString();
+    _nameHash = hashName(_name);
     _init();
 }
 
@@ -769,6 +779,7 @@ BSONColumn::BSONColumn(BSONBinData bin, StringData name) {
     _binary = static_cast<const char*>(bin.data);
     _size = bin.length;
     _name = name.toString();
+    _nameHash = hashName(_name);
     _init();
 }
 
diff --git a/src/mongo/bson/util/bsoncolumn.h b/src/mongo/bson/util/bsoncolumn.h
index f69b52c2f92..511090c3b3e 100644
--- a/src/mongo/bson/util/bsoncolumn.h
+++ b/src/mongo/bson/util/bsoncolumn.h
@@ -263,6 +263,15 @@ public:
         return _name;
     }
 
+    /**
+     * Field name that this BSONColumn represents.
+     *
+     * O(1) time complexity
+     */
+    std::pair<StringData, std::size_t> nameHashed() const {
+        return {_name, _nameHash};
+    }
+
 private:
     /**
      * BSONElement storage, owns materialised BSONElement returned by BSONColumn.
@@ -405,5 +414,6 @@ private:
     bool _fullyDecompressed = false;
 
     std::string _name;
+    std::size_t _nameHash;
 };
 }  // namespace mongo
diff --git a/src/mongo/db/exec/bucket_unpacker.cpp b/src/mongo/db/exec/bucket_unpacker.cpp
index 7f479ce763e..782d2ff5235 100644
--- a/src/mongo/db/exec/bucket_unpacker.cpp
+++ b/src/mongo/db/exec/bucket_unpacker.cpp
@@ -27,10 +27,9 @@
  *    it in the license file.
  */
 
-#include "mongo/platform/basic.h"
+#include "mongo/db/exec/bucket_unpacker.h"
 
 #include "mongo/bson/util/bsoncolumn.h"
-#include "mongo/db/exec/bucket_unpacker.h"
 #include "mongo/db/timeseries/timeseries_constants.h"
 
 namespace mongo {
@@ -163,12 +162,12 @@ bool BucketUnpackerV1::getNext(MutableDocument& measurement,
                                bool includeMetaField) {
     auto&& timeElem = _timeFieldIter.next();
     if (includeTimeField) {
-        measurement.addField(spec.timeField, Value{timeElem});
+        measurement.addField(spec.timeFieldHashed(), Value{timeElem});
     }
 
     // Includes metaField when we're instructed to do so and metaField value exists.
     if (includeMetaField && !metaValue.missing()) {
-        measurement.addField(*spec.metaField, metaValue);
+        measurement.addField(*spec.metaFieldHashed(), metaValue);
     }
 
     auto& currentIdx = timeElem.fieldNameStringData();
@@ -195,7 +194,7 @@ void BucketUnpackerV1::extractSingleMeasurement(MutableDocument& measurement,
     auto&& dataRegion = bucket.getField(timeseries::kBucketDataFieldName).Obj();
 
     if (includeMetaField && !metaValue.missing()) {
-        measurement.addField(*spec.metaField, metaValue);
+        measurement.addField(*spec.metaFieldHashed(), metaValue);
     }
 
     for (auto&& dataElem : dataRegion) {
@@ -283,13 +282,13 @@ bool BucketUnpackerV2::getNext(MutableDocument& measurement,
     // Get element and increment iterator
     const auto& timeElem = *_timeColumn.it;
     if (includeTimeField) {
-        measurement.addField(spec.timeField, Value{timeElem});
+        measurement.addField(spec.timeFieldHashed(), Value{timeElem});
     }
     ++_timeColumn.it;
 
     // Includes metaField when we're instructed to do so and metaField value exists.
     if (includeMetaField && !metaValue.missing()) {
-        measurement.addField(*spec.metaField, metaValue);
+        measurement.addField(*spec.metaFieldHashed(), metaValue);
     }
 
     for (auto& fieldColumn : _fieldColumns) {
@@ -299,7 +298,7 @@ bool BucketUnpackerV2::getNext(MutableDocument& measurement,
         const BSONElement& elem = *fieldColumn.it;
         // EOO represents missing field
         if (!elem.eoo()) {
-            measurement.addField(fieldColumn.column.name(), Value{elem});
+            measurement.addField(HashedFieldName{fieldColumn.column.nameHashed()}, Value{elem});
         }
         ++fieldColumn.it;
     }
@@ -319,18 +318,18 @@ void BucketUnpackerV2::extractSingleMeasurement(MutableDocument& measurement,
         auto val = _timeColumn.column[j];
         uassert(
             6067500, "Bucket unexpectedly contained fewer values than count", val && !val->eoo());
-        measurement.addField(_timeColumn.column.name(), Value{*val});
+        measurement.addField(spec.timeFieldHashed(), Value{*val});
     }
 
     if (includeMetaField && !metaValue.missing()) {
-        measurement.addField(*spec.metaField, metaValue);
+        measurement.addField(*spec.metaFieldHashed(), metaValue);
     }
 
     if (includeTimeField) {
         for (auto& fieldColumn : _fieldColumns) {
             auto val = fieldColumn.column[j];
             uassert(6067600, "Bucket unexpectedly contained fewer values than count", val);
-            measurement.addField(fieldColumn.column.name(), Value{*val});
+            measurement.addField(HashedFieldName{fieldColumn.column.nameHashed()}, Value{*val});
         }
     }
 }
@@ -361,6 +360,88 @@ void eraseExcludedComputedMetaProjFields(BucketUnpacker::Behavior unpackerBehavi
 
 }  // namespace
 
+BucketSpec::BucketSpec(const std::string& timeField,
+                       const boost::optional<std::string>& metaField,
+                       const std::set<std::string>& fields,
+                       const std::vector<std::string>& computedProjections)
+    : fieldSet(fields),
+      computedMetaProjFields(computedProjections),
+      _timeField(timeField),
+      _timeFieldHashed(FieldNameHasher().hashedFieldName(_timeField)),
+      _metaField(metaField) {
+    if (_metaField) {
+        _metaFieldHashed = FieldNameHasher().hashedFieldName(*_metaField);
+    }
+}
+
+BucketSpec::BucketSpec(const BucketSpec& other)
+    : fieldSet(other.fieldSet),
+      computedMetaProjFields(other.computedMetaProjFields),
+      _timeField(other._timeField),
+      _timeFieldHashed(HashedFieldName{_timeField, other._timeFieldHashed->hash()}),
+      _metaField(other._metaField) {
+    if (_metaField) {
+        _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()};
+    }
+}
+
+BucketSpec::BucketSpec(BucketSpec&& other)
+    : fieldSet(std::move(other.fieldSet)),
+      computedMetaProjFields(std::move(other.computedMetaProjFields)),
+      _timeField(std::move(other._timeField)),
+      _timeFieldHashed(HashedFieldName{_timeField, other._timeFieldHashed->hash()}),
+      _metaField(std::move(other._metaField)) {
+    if (_metaField) {
+        _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()};
+    }
+}
+
+BucketSpec& BucketSpec::operator=(const BucketSpec& other) {
+    if (&other != this) {
+        fieldSet = other.fieldSet;
+        computedMetaProjFields = other.computedMetaProjFields;
+        _timeField = other._timeField;
+        _timeFieldHashed = HashedFieldName{_timeField, other._timeFieldHashed->hash()};
+        _metaField = other._metaField;
+        if (_metaField) {
+            _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()};
+        }
+    }
+    return *this;
+}
+
+void BucketSpec::setTimeField(std::string&& name) {
+    _timeField = std::move(name);
+    _timeFieldHashed = FieldNameHasher().hashedFieldName(_timeField);
+}
+
+const std::string& BucketSpec::timeField() const {
+    return _timeField;
+}
+
+HashedFieldName BucketSpec::timeFieldHashed() const {
+    invariant(_timeFieldHashed->key().rawData() == _timeField.data());
+    invariant(_timeFieldHashed->key() == _timeField);
+    return *_timeFieldHashed;
+}
+
+void BucketSpec::setMetaField(boost::optional<std::string>&& name) {
+    _metaField = std::move(name);
+    if (_metaField) {
+        _metaFieldHashed = FieldNameHasher().hashedFieldName(*_metaField);
+    } else {
+        _metaFieldHashed = boost::none;
+    }
+}
+
+const boost::optional<std::string>& BucketSpec::metaField() const {
+    return _metaField;
+}
+
+boost::optional<HashedFieldName> BucketSpec::metaFieldHashed() const {
+    return _metaFieldHashed;
+}
+
 BucketUnpacker::BucketUnpacker() = default;
 BucketUnpacker::BucketUnpacker(BucketUnpacker&& other) = default;
 BucketUnpacker::~BucketUnpacker() = default;
@@ -437,13 +518,13 @@ void BucketUnpacker::reset(BSONObj&& bucket) {
         return;
     }
 
-    auto&& timeFieldElem = dataRegion.getField(_spec.timeField);
+    auto&& timeFieldElem = dataRegion.getField(_spec.timeField());
     uassert(5346700,
             "The $_internalUnpackBucket stage requires the data region to have a timeField object",
             timeFieldElem);
 
     _metaValue = Value{_bucket[timeseries::kBucketMetaFieldName]};
-    if (_spec.metaField) {
+    if (_spec.metaField()) {
         // The spec indicates that there might be a metadata region. Missing metadata in
         // measurements is expressed with missing metadata in a bucket. But we disallow undefined
         // since the undefined BSON type is deprecated.
@@ -489,7 +570,7 @@ void BucketUnpacker::reset(BSONObj&& bucket) {
     // include or exclude case.
     for (auto&& elem : dataRegion) {
         auto& colName = elem.fieldNameStringData();
-        if (colName == _spec.timeField) {
+        if (colName == _spec.timeField()) {
             // Skip adding a FieldIterator for the timeField since the timestamp value from
             // _timeFieldIter can be placed accordingly in the materialized measurement.
             continue;
diff --git a/src/mongo/db/exec/bucket_unpacker.h b/src/mongo/db/exec/bucket_unpacker.h
index da057735691..2f5fd70b785 100644
--- a/src/mongo/db/exec/bucket_unpacker.h
+++ b/src/mongo/db/exec/bucket_unpacker.h
@@ -29,6 +29,7 @@
 
 #pragma once
 
+
 #include <algorithm>
 #include <set>
 
@@ -39,14 +40,29 @@ namespace mongo {
 /**
  * Carries parameters for unpacking a bucket.
  */
-struct BucketSpec {
+class BucketSpec {
+public:
+    BucketSpec() = default;
+    BucketSpec(const std::string& timeField,
+               const boost::optional<std::string>& metaField,
+               const std::set<std::string>& fields = {},
+               const std::vector<std::string>& computedProjections = {});
+    BucketSpec(const BucketSpec&);
+    BucketSpec(BucketSpec&&);
+
+    BucketSpec& operator=(const BucketSpec&);
+
     // The user-supplied timestamp field name specified during time-series collection creation.
-    std::string timeField;
+    void setTimeField(std::string&& field);
+    const std::string& timeField() const;
+    HashedFieldName timeFieldHashed() const;
 
     // An optional user-supplied metadata field name specified during time-series collection
     // creation. This field name is used during materialization of metadata fields of a measurement
     // after unpacking.
-    boost::optional<std::string> metaField;
+    void setMetaField(boost::optional<std::string>&& field);
+    const boost::optional<std::string>& metaField() const;
+    boost::optional<HashedFieldName> metaFieldHashed() const;
 
     // The set of field names in the data region that should be included or excluded.
     std::set<std::string> fieldSet;
@@ -54,6 +70,13 @@ struct BucketSpec {
     // Vector of computed meta field projection names. Added at the end of materialized
     // measurements.
     std::vector<std::string> computedMetaProjFields;
+
+private:
+    std::string _timeField;
+    boost::optional<HashedFieldName> _timeFieldHashed;
+
+    boost::optional<std::string> _metaField = boost::none;
+    boost::optional<HashedFieldName> _metaFieldHashed = boost::none;
 };
 
 /**
@@ -185,12 +208,12 @@ private:
  */
 inline bool eraseMetaFromFieldSetAndDetermineIncludeMeta(BucketUnpacker::Behavior unpackerBehavior,
                                                          BucketSpec* bucketSpec) {
-    if (!bucketSpec->metaField ||
+    if (!bucketSpec->metaField() ||
         std::find(bucketSpec->computedMetaProjFields.cbegin(),
                   bucketSpec->computedMetaProjFields.cend(),
-                  *bucketSpec->metaField) != bucketSpec->computedMetaProjFields.cend()) {
+                  *bucketSpec->metaField()) != bucketSpec->computedMetaProjFields.cend()) {
         return false;
-    } else if (auto itr = bucketSpec->fieldSet.find(*bucketSpec->metaField);
+    } else if (auto itr = bucketSpec->fieldSet.find(*bucketSpec->metaField());
                itr != bucketSpec->fieldSet.end()) {
         bucketSpec->fieldSet.erase(itr);
         return unpackerBehavior == BucketUnpacker::Behavior::kInclude;
@@ -205,7 +228,7 @@ inline bool eraseMetaFromFieldSetAndDetermineIncludeMeta(BucketUnpacker::Behavio
 inline bool determineIncludeTimeField(BucketUnpacker::Behavior unpackerBehavior,
                                       BucketSpec* bucketSpec) {
     return (unpackerBehavior == BucketUnpacker::Behavior::kInclude) ==
-        (bucketSpec->fieldSet.find(bucketSpec->timeField) != bucketSpec->fieldSet.end());
+        (bucketSpec->fieldSet.find(bucketSpec->timeField()) != bucketSpec->fieldSet.end());
 }
 
 /**
diff --git a/src/mongo/db/exec/document_value/document.cpp b/src/mongo/db/exec/document_value/document.cpp
index f47eff8b300..dd833b3c28f 100644
--- a/src/mongo/db/exec/document_value/document.cpp
+++ b/src/mongo/db/exec/document_value/document.cpp
@@ -27,8 +27,6 @@
  *    it in the license file.
  */
 
-#include "mongo/platform/basic.h"
-
 #include "mongo/db/exec/document_value/document.h"
 
 #include <boost/functional/hash.hpp>
@@ -164,7 +162,8 @@ bool DocumentStorageIterator::shouldSkipDeleted() {
     return false;
 }
 
-Position DocumentStorage::findFieldInCache(StringData requested) const {
+template <typename T>
+Position DocumentStorage::findFieldInCache(T requested) const {
     int reqSize = requested.size();  // get size calculation out of the way if needed
 
     if (_numFields >= HASH_TAB_MIN) {  // hash lookup
@@ -192,14 +191,17 @@ Position DocumentStorage::findFieldInCache(StringData requested) const {
     // if we got here, there's no such field
     return Position();
 }
+template Position DocumentStorage::findFieldInCache<StringData>(StringData field) const;
+template Position DocumentStorage::findFieldInCache<HashedFieldName>(HashedFieldName field) const;
 
-Position DocumentStorage::findField(StringData requested, LookupPolicy policy) const {
-    if (auto pos = findFieldInCache(requested); pos.found() || policy == LookupPolicy::kCacheOnly) {
+template <typename T>
+Position DocumentStorage::findField(T field, LookupPolicy policy) const {
+    if (auto pos = findFieldInCache(field); pos.found() || policy == LookupPolicy::kCacheOnly) {
         return pos;
     }
 
     for (auto&& bsonElement : _bson) {
-        if (requested == bsonElement.fieldNameStringData()) {
+        if (field == bsonElement.fieldNameStringData()) {
             return const_cast<DocumentStorage*>(this)->constructInCache(bsonElement);
         }
     }
@@ -207,6 +209,10 @@ Position DocumentStorage::findField(StringData requested, LookupPolicy policy) c
     // if we got here, there's no such field
     return Position();
 }
+template Position DocumentStorage::findField<StringData>(StringData field,
+                                                         LookupPolicy policy) const;
+template Position DocumentStorage::findField<HashedFieldName>(HashedFieldName field,
+                                                              LookupPolicy policy) const;
 
 Position DocumentStorage::constructInCache(const BSONElement& elem) {
     auto savedModified = _modified;
@@ -222,9 +228,10 @@ Position DocumentStorage::constructInCache(const BSONElement& elem) {
     return pos;
 }
 
-Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
+template <typename T>
+Value& DocumentStorage::appendField(T field, ValueElement::Kind kind) {
     Position pos = getNextPosition();
-    const int nameSize = name.size();
+    const int nameSize = field.size();
 
     // these are the same for everyone
     const Position nextCollision;
@@ -245,7 +252,7 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
     append(nextCollision);
     append(nameSize);
     append(kind);
-    name.copyTo(dest, true);
+    field.copyTo(dest, true);
 // Padding for alignment handled above
 #undef append
 
@@ -255,7 +262,7 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
     _numFields++;
 
     if (_numFields > HASH_TAB_MIN) {
-        addFieldToHashTable(pos);
+        addFieldToHashTable(field, pos);
     } else if (_numFields == HASH_TAB_MIN) {
         // adds all fields to hash table (including the one we just added)
         rehash();
@@ -263,13 +270,16 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
 
     return getField(pos).val;
 }
+template Value& DocumentStorage::appendField<StringData>(StringData, ValueElement::Kind);
+template Value& DocumentStorage::appendField<HashedFieldName>(HashedFieldName, ValueElement::Kind);
 
 // Call after adding field to _fields and increasing _numFields
-void DocumentStorage::addFieldToHashTable(Position pos) {
+template <typename T>
+void DocumentStorage::addFieldToHashTable(T field, Position pos) {
     ValueElement& elem = getField(pos);
     elem.nextCollision = Position();
 
-    const unsigned bucket = bucketForKey(elem.nameSD());
+    const unsigned bucket = bucketForKey(field);
 
     Position* posPtr = &_hashTab[bucket];
     while (posPtr->found()) {
diff --git a/src/mongo/db/exec/document_value/document.h b/src/mongo/db/exec/document_value/document.h
index 36c4106357d..8fcfb28dd8b 100644
--- a/src/mongo/db/exec/document_value/document.h
+++ b/src/mongo/db/exec/document_value/document.h
@@ -136,17 +136,16 @@ public:
      * Note that this method does *not* traverse nested documents and arrays, use getNestedField()
      * instead.
      */
-    const Value operator[](StringData key) const {
+    template <typename T>
+    const Value operator[](T key) const {
         return getField(key);
     }
-    const Value getField(StringData key) const {
+    template <typename T>
+    const Value getField(T key) const {
         return storage().getField(key);
     }
 
     /// Look up a field by Position. See positionOf and getNestedField.
-    const Value operator[](Position pos) const {
-        return getField(pos);
-    }
     const Value getField(Position pos) const {
         return storage().getField(pos).val;
     }
@@ -535,12 +534,18 @@ public:
      *        Decide what level of support is needed for duplicate fields.
      *        If duplicates are not allowed, consider removing this method.
      */
-    void addField(StringData fieldName, const Value& val) {
-        storage().appendField(fieldName, ValueElement::Kind::kInserted) = val;
+    void addField(StringData name, const Value& val) {
+        storage().appendField(name, ValueElement::Kind::kInserted) = val;
+    }
+    void addField(HashedFieldName field, const Value& val) {
+        storage().appendField(field, ValueElement::Kind::kInserted) = val;
     }
 
-    void addField(StringData fieldName, Value&& val) {
-        storage().appendField(fieldName, ValueElement::Kind::kInserted) = std::move(val);
+    void addField(StringData name, Value&& val) {
+        storage().appendField(name, ValueElement::Kind::kInserted) = std::move(val);
+    }
+    void addField(HashedFieldName field, Value&& val) {
+        storage().appendField(field, ValueElement::Kind::kInserted) = std::move(val);
     }
 
     /** Update field by key. If there is no field with that key, add one.
diff --git a/src/mongo/db/exec/document_value/document_internal.h b/src/mongo/db/exec/document_value/document_internal.h
index 9be057fb522..062b7a4da00 100644
--- a/src/mongo/db/exec/document_value/document_internal.h
+++ b/src/mongo/db/exec/document_value/document_internal.h
@@ -29,10 +29,9 @@
 
 #pragma once
 
-#include <third_party/murmurhash3/MurmurHash3.h>
-
 #include <bitset>
 #include <boost/intrusive_ptr.hpp>
+#include <third_party/murmurhash3/MurmurHash3.h>
 
 #include "mongo/base/static_assert.h"
 #include "mongo/db/exec/document_value/document_metadata_fields.h"
@@ -275,6 +274,85 @@ private:
     const ValueElement* _end;
 };
 
+/**
+ * Type that bundles the hashed field name along with the actual string so that hashing can be done
+ * outside of inserts and lookups and re-used across calls.
+ */
+class HashedFieldName {
+public:
+    explicit HashedFieldName(StringData sd, std::size_t hash) : _sd(sd), _hash(hash) {}
+    explicit HashedFieldName(std::pair<StringData, std::size_t> pair)
+        : _sd(pair.first), _hash(pair.second) {}
+
+    StringData key() const {
+        return _sd;
+    }
+
+    std::size_t hash() const {
+        return _hash;
+    }
+
+    std::size_t size() const {
+        return _sd.size();
+    }
+
+    inline void copyTo(char* dest, bool includeEndingNull) const {
+        return _sd.copyTo(dest, includeEndingNull);
+    }
+
+    constexpr const char* rawData() const noexcept {
+        return _sd.rawData();
+    }
+
+private:
+    StringData _sd;
+    std::size_t _hash;
+};
+
+inline bool operator==(HashedFieldName lhs, StringData rhs) {
+    return lhs.key() == rhs;
+}
+
+inline bool operator==(StringData lhs, HashedFieldName rhs) {
+    return lhs == rhs.key();
+}
+
+inline bool operator==(HashedFieldName lhs, HashedFieldName rhs) {
+    return lhs.key() == rhs.key();
+}
+
+/**
+ * Hasher to support heterogeneous lookup for StringData and string-like elements.
+ */
+struct FieldNameHasher {
+    // This using directive activates heterogeneous lookup in the hash table
+    using is_transparent = void;
+
+    std::size_t operator()(StringData sd) const {
+        // TODO consider FNV-1a once we have a better benchmark corpus
+        // Keep in sync with 'hashName' in BSONColumn implementation.
+        unsigned out;
+        MurmurHash3_x86_32(sd.rawData(), sd.size(), 0, &out);
+        return out;
+    }
+
+    std::size_t operator()(const std::string& s) const {
+        return operator()(StringData(s));
+    }
+
+    std::size_t operator()(const char* s) const {
+        return operator()(StringData(s));
+    }
+
+    std::size_t operator()(HashedFieldName key) const {
+        return key.hash();
+    }
+
+    HashedFieldName hashedFieldName(StringData sd) {
+        return HashedFieldName(sd, operator()(sd));
+    }
+};
+
 /// Storage class used by both Document and MutableDocument
 class DocumentStorage : public RefCountable {
 public:
@@ -334,13 +412,15 @@ public:
     };
 
     /// Returns the position of the named field or Position()
-    Position findField(StringData name, LookupPolicy policy) const;
+    template <typename T>
+    Position findField(T field, LookupPolicy policy) const;
 
     // Document uses these
     const ValueElement& getField(Position pos) const {
         verify(pos.found());
         return *(_firstElement->plusBytes(pos.index));
     }
+
     Value getField(StringData name) const {
         Position pos = findField(name, LookupPolicy::kCacheAndBSON);
         if (!pos.found())
@@ -348,12 +428,20 @@ public:
         return getField(pos).val;
     }
 
+    Value getField(HashedFieldName field) const {
+        Position pos = findField(field, LookupPolicy::kCacheAndBSON);
+        if (!pos.found())
+            return Value();
+        return getField(pos).val;
+    }
+
     // MutableDocument uses these
     ValueElement& getField(Position pos) {
         _modified = true;
         verify(pos.found());
         return *(_firstElement->plusBytes(pos.index));
     }
+
     Value& getField(StringData name, LookupPolicy policy) {
         _modified = true;
         Position pos = findField(name, policy);
@@ -383,7 +471,8 @@ public:
     }
 
     /// Adds a new field with missing Value at the end of the document
-    Value& appendField(StringData name, ValueElement::Kind kind);
+    template <typename T>
+    Value& appendField(T field, ValueElement::Kind kind);
 
     /** Preallocates space for fields. Use this to attempt to prevent buffer growth.
      *  This is only valid to call before anything is added to the document.
@@ -487,11 +576,9 @@ public:
         _metadataFields = std::move(metadata);
     }
 
-    static unsigned hashKey(StringData name) {
-        // TODO consider FNV-1a once we have a better benchmark corpus
-        unsigned out;
-        MurmurHash3_x86_32(name.rawData(), name.size(), 0, &out);
-        return out;
+    template <typename T>
+    static unsigned hashKey(T name) {
+        return FieldNameHasher()(name);
     }
 
     const ValueElement* begin() const {
@@ -519,13 +606,15 @@ public:
 
 private:
     /// Returns the position of the named field in the cache or Position()
-    Position findFieldInCache(StringData name) const;
+    template <typename T>
+    Position findFieldInCache(T name) const;
 
     /// Allocates space in _cache. Copies existing data if there is any.
     void alloc(unsigned newSize);
 
     /// Call after adding field to _cache and increasing _numFields
-    void addFieldToHashTable(Position pos);
+    template <typename T>
+    void addFieldToHashTable(T field, Position pos);
 
     // assumes _hashTabMask is (power of two) - 1
     unsigned hashTabBuckets() const {
@@ -545,15 +634,16 @@ private:
         memset(static_cast<void*>(_hashTab), -1, hashTabBytes());
     }
 
-    unsigned bucketForKey(StringData name) const {
-        return hashKey(name) & _hashTabMask;
+    template <typename T>
+    unsigned bucketForKey(T field) const {
+        return hashKey(field) & _hashTabMask;
     }
 
     /// Adds all fields to the hash table
     void rehash() {
         hashTabInit();
         for (auto it = iteratorCacheOnly(); !it.atEnd(); it.advance())
-            addFieldToHashTable(it.position());
+            addFieldToHashTable(getField(it.position()).nameSD(), it.position());
     }
 
     void loadLazyMetadata() const;
diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
index 8e0999becaa..d49267b7858 100644
--- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
+++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
@@ -280,7 +280,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             uassert(5346504,
                     str::stream() << "timeField field must be a string, got: " << elem.type(),
                     elem.type() == BSONType::String);
-            bucketSpec.timeField = elem.str();
+            bucketSpec.setTimeField(elem.str());
             hasTimeField = true;
         } else if (fieldName == timeseries::kMetaFieldName) {
             uassert(5346505,
@@ -290,7 +290,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             uassert(5545700,
                     str::stream() << "metaField field must be a single-element field path",
                     metaField.find('.') == std::string::npos);
-            bucketSpec.metaField = std::move(metaField);
+            bucketSpec.setMetaField(std::move(metaField));
         } else if (fieldName == kBucketMaxSpanSeconds) {
             uassert(5510600,
                     str::stream() << "bucketMaxSpanSeconds field must be an integer, got: "
@@ -357,7 +357,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             uassert(5612401,
                     str::stream() << "timeField field must be a string, got: " << elem.type(),
                     elem.type() == BSONType::String);
-            bucketSpec.timeField = elem.str();
+            bucketSpec.setTimeField(elem.str());
             hasTimeField = true;
         } else if (fieldName == timeseries::kMetaFieldName) {
             uassert(5612402,
@@ -367,7 +367,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             uassert(5612403,
                     str::stream() << "metaField field must be a single-element field path",
                     metaField.find('.') == std::string::npos);
-            bucketSpec.metaField = std::move(metaField);
+            bucketSpec.setMetaField(std::move(metaField));
         } else if (fieldName == kAssumeNoMixedSchemaData) {
             uassert(6067203,
                     str::stream() << "assumeClean field must be a bool, got: " << elem.type(),
@@ -402,16 +402,16 @@ void DocumentSourceInternalUnpackBucket::serializeToArray(
     if (((_bucketUnpacker.includeMetaField() &&
           _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kInclude) ||
          (!_bucketUnpacker.includeMetaField() &&
-          _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kExclude && spec.metaField)) &&
+          _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kExclude && spec.metaField())) &&
         std::find(spec.computedMetaProjFields.cbegin(),
                   spec.computedMetaProjFields.cend(),
-                  *spec.metaField) == spec.computedMetaProjFields.cend())
-        fields.emplace_back(*spec.metaField);
+                  *spec.metaField()) == spec.computedMetaProjFields.cend())
+        fields.emplace_back(*spec.metaField());
 
     out.addField(behavior, Value{std::move(fields)});
-    out.addField(timeseries::kTimeFieldName, Value{spec.timeField});
-    if (spec.metaField) {
-        out.addField(timeseries::kMetaFieldName, Value{*spec.metaField});
+    out.addField(timeseries::kTimeFieldName, Value{spec.timeField()});
+    if (spec.metaField()) {
+        out.addField(timeseries::kMetaFieldName, Value{*spec.metaField()});
     }
     out.addField(kBucketMaxSpanSeconds, Value{_bucketMaxSpanSeconds});
     if (_assumeNoMixedSchemaData)
@@ -472,7 +472,7 @@ bool DocumentSourceInternalUnpackBucket::pushDownComputedMetaProjection(
     if (std::next(itr) == container->end()) {
         return nextStageWasRemoved;
     }
-    if (!_bucketUnpacker.bucketSpec().metaField) {
+    if (!_bucketUnpacker.bucketSpec().metaField()) {
         return nextStageWasRemoved;
     }
 
@@ -482,7 +482,7 @@ bool DocumentSourceInternalUnpackBucket::pushDownComputedMetaProjection(
         (nextTransform->getType() == TransformerInterface::TransformerType::kInclusionProjection ||
          nextTransform->getType() == TransformerInterface::TransformerType::kComputedProjection)) {
 
-        auto& metaName = _bucketUnpacker.bucketSpec().metaField.get();
+        auto& metaName = _bucketUnpacker.bucketSpec().metaField().get();
         auto [addFieldsSpec, deleteStage] =
             nextTransform->extractComputedProjections(metaName,
                                                       timeseries::kBucketMetaFieldName.toString(),
@@ -655,9 +655,9 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
     }
 
     // We must avoid mapping predicates on the meta field onto the control field.
-    if (bucketSpec.metaField &&
-        (matchExprPath == bucketSpec.metaField.get() ||
-         expression::isPathPrefixOf(bucketSpec.metaField.get(), matchExprPath)))
+    if (bucketSpec.metaField() &&
+        (matchExprPath == bucketSpec.metaField().get() ||
+         expression::isPathPrefixOf(bucketSpec.metaField().get(), matchExprPath)))
         return nullptr;
 
     // We must avoid mapping predicates on fields computed via $addFields or a computed $project.
@@ -665,7 +665,7 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
         return nullptr;
     }
 
-    const auto isTimeField = (matchExprPath == bucketSpec.timeField);
+    const auto isTimeField = (matchExprPath == bucketSpec.timeField());
     if (isTimeField && matchExprData.type() != BSONType::Date) {
         // Users are not allowed to insert non-date measurements into time field. So this query
         // would not match anything. We do not need to optimize for this case.
@@ -874,7 +874,7 @@ DocumentSourceInternalUnpackBucket::createPredicatesOnBucketLevelField(
 std::pair<boost::intrusive_ptr<DocumentSourceMatch>, boost::intrusive_ptr<DocumentSourceMatch>>
 DocumentSourceInternalUnpackBucket::splitMatchOnMetaAndRename(
     boost::intrusive_ptr<DocumentSourceMatch> match) {
-    if (auto&& metaField = _bucketUnpacker.bucketSpec().metaField) {
+    if (auto&& metaField = _bucketUnpacker.bucketSpec().metaField()) {
         return std::move(*match).extractMatchOnFieldsAndRemainder(
             {*metaField}, {{*metaField, timeseries::kBucketMetaFieldName.toString()}});
     }
@@ -884,10 +884,10 @@ DocumentSourceInternalUnpackBucket::splitMatchOnMetaAndRename(
 std::pair<BSONObj, bool> DocumentSourceInternalUnpackBucket::extractProjectForPushDown(
     DocumentSource* src) const {
     if (auto nextProject = dynamic_cast<DocumentSourceSingleDocumentTransformation*>(src);
-        _bucketUnpacker.bucketSpec().metaField && nextProject &&
+        _bucketUnpacker.bucketSpec().metaField() && nextProject &&
         nextProject->getType() == TransformerInterface::TransformerType::kExclusionProjection) {
         return nextProject->extractProjectOnFieldAndRename(
-            _bucketUnpacker.bucketSpec().metaField.get(), timeseries::kBucketMetaFieldName);
+            _bucketUnpacker.bucketSpec().metaField().get(), timeseries::kBucketMetaFieldName);
     }
 
     return {BSONObj{}, false};
@@ -902,7 +902,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain
     }
 
     const auto& idFields = groupPtr->getIdFields();
-    if (idFields.size() != 1 || !_bucketUnpacker.bucketSpec().metaField.has_value()) {
+    if (idFields.size() != 1 || !_bucketUnpacker.bucketSpec().metaField().has_value()) {
         return {};
     }
 
@@ -914,7 +914,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain
 
     const auto& idPath = exprIdPath->getFieldPath();
     if (idPath.getPathLength() < 2 ||
-        idPath.getFieldName(1) != _bucketUnpacker.bucketSpec().metaField.get()) {
+        idPath.getFieldName(1) != _bucketUnpacker.bucketSpec().metaField().get()) {
         return {};
     }
 
@@ -935,7 +935,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain
         if (const auto* exprArgPath = dynamic_cast<const ExpressionFieldPath*>(exprArg)) {
             const auto& path = exprArgPath->getFieldPath();
             if (path.getPathLength() <= 1 ||
-                path.getFieldName(1) == _bucketUnpacker.bucketSpec().timeField) {
+                path.getFieldName(1) == _bucketUnpacker.bucketSpec().timeField()) {
                 // Rewrite not valid for time field. We want to eliminate the bucket
                 // unpack stage here.
                 suitable = false;
@@ -1007,12 +1007,13 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
 
     // Some optimizations may not be safe to do if we have computed the metaField via an $addFields
     // or a computed $project. We won't do those optimizations if 'haveComputedMetaField' is true.
-    bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField &&
-        fieldIsComputed(_bucketUnpacker.bucketSpec(), _bucketUnpacker.bucketSpec().metaField.get());
+    bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField() &&
+        fieldIsComputed(_bucketUnpacker.bucketSpec(),
+                        _bucketUnpacker.bucketSpec().metaField().get());
 
     // Before any other rewrites for the current stage, consider reordering with $sort.
     if (auto sortPtr = dynamic_cast<DocumentSourceSort*>(std::next(itr)->get())) {
-        if (auto metaField = _bucketUnpacker.bucketSpec().metaField;
+        if (auto metaField = _bucketUnpacker.bucketSpec().metaField();
             metaField && !haveComputedMetaField) {
             if (checkMetadataSortReorder(sortPtr->getSortKeyPattern(), metaField.get())) {
                 // We have a sort on metadata field following this stage. Reorder the two stages
@@ -1056,7 +1057,7 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
             "Must not specify 'query' for $geoNear on a time-series collection; use $match instead",
             nextNear->getQuery().binaryEqual(BSONObj()));
 
-        auto metaField = _bucketUnpacker.bucketSpec().metaField;
+        auto metaField = _bucketUnpacker.bucketSpec().metaField();
         if (metaField && *metaField == keyField->front()) {
             // Make sure we actually re-write the key field for the buckets collection so we can
             // locate the index.
@@ -1111,8 +1112,8 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
         auto deps = Pipeline::getDependenciesForContainer(
             pExpCtx, Pipeline::SourceContainer{std::next(itr), container->end()}, boost::none);
         if (deps.hasNoRequirements()) {
-            _bucketUnpacker.setBucketSpecAndBehavior({_bucketUnpacker.bucketSpec().timeField,
-                                                      _bucketUnpacker.bucketSpec().metaField,
+            _bucketUnpacker.setBucketSpecAndBehavior({_bucketUnpacker.bucketSpec().timeField(),
+                                                      _bucketUnpacker.bucketSpec().metaField(),
                                                       {}},
                                                      BucketUnpacker::Behavior::kInclude);
 
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index 5e24100be98..d7afc3680bf 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -2426,10 +2426,10 @@ Value ExpressionFieldPath::evaluatePath(size_t index, const Document& input) con
 
     /* if we've hit the end of the path, stop */
     if (index == _fieldPath.getPathLength() - 1)
-        return input[_fieldPath.getFieldName(index)];
+        return input[_fieldPath.getFieldNameHashed(index)];
 
     // Try to dive deeper
-    const Value val = input[_fieldPath.getFieldName(index)];
+    const Value val = input[_fieldPath.getFieldNameHashed(index)];
     switch (val.getType()) {
         case Object:
             return evaluatePath(index + 1, val.getDocument());
diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp
index 208f60a515b..ab00617bbd3 100644
--- a/src/mongo/db/pipeline/field_path.cpp
+++ b/src/mongo/db/pipeline/field_path.cpp
@@ -70,7 +70,9 @@ string FieldPath::getFullyQualifiedPath(StringData prefix, StringData suffix) {
 }
 
 FieldPath::FieldPath(std::string inputPath)
-    : _fieldPath(std::move(inputPath)), _fieldPathDotPosition{string::npos} {
+    : _fieldPath(std::move(inputPath)),
+      _fieldPathDotPosition{string::npos},
+      _fieldHash{kHashUninitialized} {
     uassert(40352, "FieldPath cannot be constructed with empty string", !_fieldPath.empty());
     uassert(40353, "FieldPath must not end with a '.'.", _fieldPath[_fieldPath.size() - 1] != '.');
 
@@ -79,6 +81,7 @@ FieldPath::FieldPath(std::string inputPath)
     size_t startPos = 0;
     while (string::npos != (dotPos = _fieldPath.find('.', startPos))) {
         _fieldPathDotPosition.push_back(dotPos);
+        _fieldHash.push_back(kHashUninitialized);
         startPos = dotPos + 1;
     }
 
@@ -139,19 +142,26 @@ FieldPath FieldPath::concat(const FieldPath& tail) const {
         head._fieldPathDotPosition.size() + tail._fieldPathDotPosition.size() - 2 + 1;
     newDots.reserve(expectedDotSize);
 
+    std::vector<size_t> newHashes;
+    // We don't need the extra entry in hashes.
+    newHashes.reserve(expectedDotSize - 1);
+
     // The first one in head._fieldPathDotPosition is npos. The last one, is, conveniently, the
     // size of head fieldPath, which also happens to be the index at which we added a new dot.
     newDots.insert(
         newDots.begin(), head._fieldPathDotPosition.begin(), head._fieldPathDotPosition.end());
+    newHashes.insert(newHashes.begin(), head._fieldHash.begin(), head._fieldHash.end());
 
     invariant(tail._fieldPathDotPosition.size() >= 2);
     for (size_t i = 1; i < tail._fieldPathDotPosition.size(); ++i) {
         // Move each index back by size of the first field path, plus one, for the newly added dot.
         newDots.push_back(tail._fieldPathDotPosition[i] + head._fieldPath.size() + 1);
+        newHashes.push_back(tail._fieldHash[i - 1]);
     }
     invariant(newDots.back() == concat.size());
     invariant(newDots.size() == expectedDotSize);
+    invariant(newHashes.size() == expectedDotSize - 1);
 
-    return FieldPath(std::move(concat), std::move(newDots));
+    return FieldPath(std::move(concat), std::move(newDots), std::move(newHashes));
 }
 }  // namespace mongo
diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h
index 994e23554b3..d2ee93734e7 100644
--- a/src/mongo/db/pipeline/field_path.h
+++ b/src/mongo/db/pipeline/field_path.h
@@ -29,11 +29,13 @@
 
 #pragma once
 
+#include <limits>
 #include <string>
 #include <vector>
 
 #include "mongo/base/string_data.h"
 #include "mongo/bson/bson_depth.h"
+#include "mongo/db/exec/document_value/document_internal.h"
 #include "mongo/util/assert_util.h"
 
 namespace mongo {
@@ -111,6 +113,20 @@ public:
     }
 
     /**
+     * Return the ith field name from this path using zero-based indexes, with pre-computed hash.
+     */
+    HashedFieldName getFieldNameHashed(size_t i) const {
+        dassert(i < getPathLength());
+        const auto begin = _fieldPathDotPosition[i] + 1;
+        const auto end = _fieldPathDotPosition[i + 1];
+        StringData fieldName{&_fieldPath[begin], end - begin};
+        if (_fieldHash[i] == kHashUninitialized) {
+            _fieldHash[i] = FieldNameHasher()(fieldName);
+        }
+        return HashedFieldName{fieldName, _fieldHash[i]};
+    }
+
+    /**
      * Returns the full path, not including the prefix 'FieldPath::prefix'.
      */
     const std::string& fullPath() const {
@@ -142,8 +158,10 @@ public:
     FieldPath concat(const FieldPath& tail) const;
 
 private:
-    FieldPath(std::string string, std::vector<size_t> dots)
-        : _fieldPath(std::move(string)), _fieldPathDotPosition(std::move(dots)) {
+    FieldPath(std::string string, std::vector<size_t> dots, std::vector<size_t> hashes)
+        : _fieldPath(std::move(string)),
+          _fieldPathDotPosition(std::move(dots)),
+          _fieldHash(std::move(hashes)) {
         uassert(ErrorCodes::Overflow,
                 "FieldPath is too long",
                 _fieldPathDotPosition.size() <= BSONDepth::getMaxAllowableDepth());
@@ -158,6 +176,11 @@ private:
     // string::npos (which evaluates to -1) and the last contains _fieldPath.size() to facilitate
     // lookup.
     std::vector<size_t> _fieldPathDotPosition;
+
+    // Contains the cached hash value for the field. Will initially be set to 'kHashUninitialized',
+    // and only generated when it is first retrieved via 'getFieldNameHashed'.
+    mutable std::vector<size_t> _fieldHash;
+    static constexpr std::size_t kHashUninitialized = std::numeric_limits<std::size_t>::max();
 };
 
 inline bool operator<(const FieldPath& lhs, const FieldPath& rhs) {
author	Dan Larkin-York <dan.larkin-york@mongodb.com>	2022-01-13 13:53:05 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-01-13 14:38:25 +0000
commit	c67e0f941ae7278478e250c7692ef76b4a65953d (patch)
tree	9d97f6033cf6a38bdcef7a5d747572c8f3f402c2
parent	429faa23081c2f97e953ca878368dca189df9986 (diff)
download	mongo-c67e0f941ae7278478e250c7692ef76b4a65953d.tar.gz