diff options
author | Dan Larkin-York <dan.larkin-york@mongodb.com> | 2022-01-13 13:53:05 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-13 14:38:25 +0000 |
commit | c67e0f941ae7278478e250c7692ef76b4a65953d (patch) | |
tree | 9d97f6033cf6a38bdcef7a5d747572c8f3f402c2 | |
parent | 429faa23081c2f97e953ca878368dca189df9986 (diff) | |
download | mongo-c67e0f941ae7278478e250c7692ef76b4a65953d.tar.gz |
SERVER-62499 Augment DocumentStorage APIs to accept pre-computed hashes
-rw-r--r-- | src/mongo/bson/util/bsoncolumn.cpp | 15 | ||||
-rw-r--r-- | src/mongo/bson/util/bsoncolumn.h | 10 | ||||
-rw-r--r-- | src/mongo/db/exec/bucket_unpacker.cpp | 109 | ||||
-rw-r--r-- | src/mongo/db/exec/bucket_unpacker.h | 37 | ||||
-rw-r--r-- | src/mongo/db/exec/document_value/document.cpp | 34 | ||||
-rw-r--r-- | src/mongo/db/exec/document_value/document.h | 23 | ||||
-rw-r--r-- | src/mongo/db/exec/document_value/document_internal.h | 118 | ||||
-rw-r--r-- | src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp | 57 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/pipeline/field_path.cpp | 14 | ||||
-rw-r--r-- | src/mongo/db/pipeline/field_path.h | 27 |
11 files changed, 356 insertions, 92 deletions
diff --git a/src/mongo/bson/util/bsoncolumn.cpp b/src/mongo/bson/util/bsoncolumn.cpp index c681ba3ddae..e13d1fd80ae 100644 --- a/src/mongo/bson/util/bsoncolumn.cpp +++ b/src/mongo/bson/util/bsoncolumn.cpp @@ -28,12 +28,14 @@ */ #include "mongo/bson/util/bsoncolumn.h" + +#include <algorithm> +#include <third_party/murmurhash3/MurmurHash3.h> + #include "mongo/bson/bsonobj.h" #include "mongo/bson/util/bsoncolumn_util.h" #include "mongo/bson/util/simple8b_type_util.h" -#include <algorithm> - namespace mongo { using namespace bsoncolumn; @@ -102,6 +104,13 @@ private: ElementFunc _elemFunc; }; +std::size_t hashName(StringData sd) { + // Keep in sync with DocumentStorageHasher + unsigned out; + MurmurHash3_x86_32(sd.rawData(), sd.size(), 0, &out); + return out; +} + } // namespace BSONColumn::ElementStorage::Element::Element(char* buffer, int nameSize, int valueSize) @@ -761,6 +770,7 @@ BSONColumn::BSONColumn(BSONElement bin) { _binary = bin.binData(_size); _name = bin.fieldNameStringData().toString(); + _nameHash = hashName(_name); _init(); } @@ -769,6 +779,7 @@ BSONColumn::BSONColumn(BSONBinData bin, StringData name) { _binary = static_cast<const char*>(bin.data); _size = bin.length; _name = name.toString(); + _nameHash = hashName(_name); _init(); } diff --git a/src/mongo/bson/util/bsoncolumn.h b/src/mongo/bson/util/bsoncolumn.h index f69b52c2f92..511090c3b3e 100644 --- a/src/mongo/bson/util/bsoncolumn.h +++ b/src/mongo/bson/util/bsoncolumn.h @@ -263,6 +263,15 @@ public: return _name; } + /** + * Field name that this BSONColumn represents. + * + * O(1) time complexity + */ + std::pair<StringData, std::size_t> nameHashed() const { + return {_name, _nameHash}; + } + private: /** * BSONElement storage, owns materialised BSONElement returned by BSONColumn. @@ -405,5 +414,6 @@ private: bool _fullyDecompressed = false; std::string _name; + std::size_t _nameHash; }; } // namespace mongo diff --git a/src/mongo/db/exec/bucket_unpacker.cpp b/src/mongo/db/exec/bucket_unpacker.cpp index 7f479ce763e..782d2ff5235 100644 --- a/src/mongo/db/exec/bucket_unpacker.cpp +++ b/src/mongo/db/exec/bucket_unpacker.cpp @@ -27,10 +27,9 @@ * it in the license file. */ -#include "mongo/platform/basic.h" +#include "mongo/db/exec/bucket_unpacker.h" #include "mongo/bson/util/bsoncolumn.h" -#include "mongo/db/exec/bucket_unpacker.h" #include "mongo/db/timeseries/timeseries_constants.h" namespace mongo { @@ -163,12 +162,12 @@ bool BucketUnpackerV1::getNext(MutableDocument& measurement, bool includeMetaField) { auto&& timeElem = _timeFieldIter.next(); if (includeTimeField) { - measurement.addField(spec.timeField, Value{timeElem}); + measurement.addField(spec.timeFieldHashed(), Value{timeElem}); } // Includes metaField when we're instructed to do so and metaField value exists. if (includeMetaField && !metaValue.missing()) { - measurement.addField(*spec.metaField, metaValue); + measurement.addField(*spec.metaFieldHashed(), metaValue); } auto& currentIdx = timeElem.fieldNameStringData(); @@ -195,7 +194,7 @@ void BucketUnpackerV1::extractSingleMeasurement(MutableDocument& measurement, auto&& dataRegion = bucket.getField(timeseries::kBucketDataFieldName).Obj(); if (includeMetaField && !metaValue.missing()) { - measurement.addField(*spec.metaField, metaValue); + measurement.addField(*spec.metaFieldHashed(), metaValue); } for (auto&& dataElem : dataRegion) { @@ -283,13 +282,13 @@ bool BucketUnpackerV2::getNext(MutableDocument& measurement, // Get element and increment iterator const auto& timeElem = *_timeColumn.it; if (includeTimeField) { - measurement.addField(spec.timeField, Value{timeElem}); + measurement.addField(spec.timeFieldHashed(), Value{timeElem}); } ++_timeColumn.it; // Includes metaField when we're instructed to do so and metaField value exists. if (includeMetaField && !metaValue.missing()) { - measurement.addField(*spec.metaField, metaValue); + measurement.addField(*spec.metaFieldHashed(), metaValue); } for (auto& fieldColumn : _fieldColumns) { @@ -299,7 +298,7 @@ bool BucketUnpackerV2::getNext(MutableDocument& measurement, const BSONElement& elem = *fieldColumn.it; // EOO represents missing field if (!elem.eoo()) { - measurement.addField(fieldColumn.column.name(), Value{elem}); + measurement.addField(HashedFieldName{fieldColumn.column.nameHashed()}, Value{elem}); } ++fieldColumn.it; } @@ -319,18 +318,18 @@ void BucketUnpackerV2::extractSingleMeasurement(MutableDocument& measurement, auto val = _timeColumn.column[j]; uassert( 6067500, "Bucket unexpectedly contained fewer values than count", val && !val->eoo()); - measurement.addField(_timeColumn.column.name(), Value{*val}); + measurement.addField(spec.timeFieldHashed(), Value{*val}); } if (includeMetaField && !metaValue.missing()) { - measurement.addField(*spec.metaField, metaValue); + measurement.addField(*spec.metaFieldHashed(), metaValue); } if (includeTimeField) { for (auto& fieldColumn : _fieldColumns) { auto val = fieldColumn.column[j]; uassert(6067600, "Bucket unexpectedly contained fewer values than count", val); - measurement.addField(fieldColumn.column.name(), Value{*val}); + measurement.addField(HashedFieldName{fieldColumn.column.nameHashed()}, Value{*val}); } } } @@ -361,6 +360,88 @@ void eraseExcludedComputedMetaProjFields(BucketUnpacker::Behavior unpackerBehavi } // namespace +BucketSpec::BucketSpec(const std::string& timeField, + const boost::optional<std::string>& metaField, + const std::set<std::string>& fields, + const std::vector<std::string>& computedProjections) + : fieldSet(fields), + computedMetaProjFields(computedProjections), + _timeField(timeField), + _timeFieldHashed(FieldNameHasher().hashedFieldName(_timeField)), + _metaField(metaField) { + if (_metaField) { + _metaFieldHashed = FieldNameHasher().hashedFieldName(*_metaField); + } +} + +BucketSpec::BucketSpec(const BucketSpec& other) + : fieldSet(other.fieldSet), + computedMetaProjFields(other.computedMetaProjFields), + _timeField(other._timeField), + _timeFieldHashed(HashedFieldName{_timeField, other._timeFieldHashed->hash()}), + _metaField(other._metaField) { + if (_metaField) { + _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()}; + } +} + +BucketSpec::BucketSpec(BucketSpec&& other) + : fieldSet(std::move(other.fieldSet)), + computedMetaProjFields(std::move(other.computedMetaProjFields)), + _timeField(std::move(other._timeField)), + _timeFieldHashed(HashedFieldName{_timeField, other._timeFieldHashed->hash()}), + _metaField(std::move(other._metaField)) { + if (_metaField) { + _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()}; + } +} + +BucketSpec& BucketSpec::operator=(const BucketSpec& other) { + if (&other != this) { + fieldSet = other.fieldSet; + computedMetaProjFields = other.computedMetaProjFields; + _timeField = other._timeField; + _timeFieldHashed = HashedFieldName{_timeField, other._timeFieldHashed->hash()}; + _metaField = other._metaField; + if (_metaField) { + _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()}; + } + } + return *this; +} + +void BucketSpec::setTimeField(std::string&& name) { + _timeField = std::move(name); + _timeFieldHashed = FieldNameHasher().hashedFieldName(_timeField); +} + +const std::string& BucketSpec::timeField() const { + return _timeField; +} + +HashedFieldName BucketSpec::timeFieldHashed() const { + invariant(_timeFieldHashed->key().rawData() == _timeField.data()); + invariant(_timeFieldHashed->key() == _timeField); + return *_timeFieldHashed; +} + +void BucketSpec::setMetaField(boost::optional<std::string>&& name) { + _metaField = std::move(name); + if (_metaField) { + _metaFieldHashed = FieldNameHasher().hashedFieldName(*_metaField); + } else { + _metaFieldHashed = boost::none; + } +} + +const boost::optional<std::string>& BucketSpec::metaField() const { + return _metaField; +} + +boost::optional<HashedFieldName> BucketSpec::metaFieldHashed() const { + return _metaFieldHashed; +} + BucketUnpacker::BucketUnpacker() = default; BucketUnpacker::BucketUnpacker(BucketUnpacker&& other) = default; BucketUnpacker::~BucketUnpacker() = default; @@ -437,13 +518,13 @@ void BucketUnpacker::reset(BSONObj&& bucket) { return; } - auto&& timeFieldElem = dataRegion.getField(_spec.timeField); + auto&& timeFieldElem = dataRegion.getField(_spec.timeField()); uassert(5346700, "The $_internalUnpackBucket stage requires the data region to have a timeField object", timeFieldElem); _metaValue = Value{_bucket[timeseries::kBucketMetaFieldName]}; - if (_spec.metaField) { + if (_spec.metaField()) { // The spec indicates that there might be a metadata region. Missing metadata in // measurements is expressed with missing metadata in a bucket. But we disallow undefined // since the undefined BSON type is deprecated. @@ -489,7 +570,7 @@ void BucketUnpacker::reset(BSONObj&& bucket) { // include or exclude case. for (auto&& elem : dataRegion) { auto& colName = elem.fieldNameStringData(); - if (colName == _spec.timeField) { + if (colName == _spec.timeField()) { // Skip adding a FieldIterator for the timeField since the timestamp value from // _timeFieldIter can be placed accordingly in the materialized measurement. continue; diff --git a/src/mongo/db/exec/bucket_unpacker.h b/src/mongo/db/exec/bucket_unpacker.h index da057735691..2f5fd70b785 100644 --- a/src/mongo/db/exec/bucket_unpacker.h +++ b/src/mongo/db/exec/bucket_unpacker.h @@ -29,6 +29,7 @@ #pragma once + #include <algorithm> #include <set> @@ -39,14 +40,29 @@ namespace mongo { /** * Carries parameters for unpacking a bucket. */ -struct BucketSpec { +class BucketSpec { +public: + BucketSpec() = default; + BucketSpec(const std::string& timeField, + const boost::optional<std::string>& metaField, + const std::set<std::string>& fields = {}, + const std::vector<std::string>& computedProjections = {}); + BucketSpec(const BucketSpec&); + BucketSpec(BucketSpec&&); + + BucketSpec& operator=(const BucketSpec&); + // The user-supplied timestamp field name specified during time-series collection creation. - std::string timeField; + void setTimeField(std::string&& field); + const std::string& timeField() const; + HashedFieldName timeFieldHashed() const; // An optional user-supplied metadata field name specified during time-series collection // creation. This field name is used during materialization of metadata fields of a measurement // after unpacking. - boost::optional<std::string> metaField; + void setMetaField(boost::optional<std::string>&& field); + const boost::optional<std::string>& metaField() const; + boost::optional<HashedFieldName> metaFieldHashed() const; // The set of field names in the data region that should be included or excluded. std::set<std::string> fieldSet; @@ -54,6 +70,13 @@ struct BucketSpec { // Vector of computed meta field projection names. Added at the end of materialized // measurements. std::vector<std::string> computedMetaProjFields; + +private: + std::string _timeField; + boost::optional<HashedFieldName> _timeFieldHashed; + + boost::optional<std::string> _metaField = boost::none; + boost::optional<HashedFieldName> _metaFieldHashed = boost::none; }; /** @@ -185,12 +208,12 @@ private: */ inline bool eraseMetaFromFieldSetAndDetermineIncludeMeta(BucketUnpacker::Behavior unpackerBehavior, BucketSpec* bucketSpec) { - if (!bucketSpec->metaField || + if (!bucketSpec->metaField() || std::find(bucketSpec->computedMetaProjFields.cbegin(), bucketSpec->computedMetaProjFields.cend(), - *bucketSpec->metaField) != bucketSpec->computedMetaProjFields.cend()) { + *bucketSpec->metaField()) != bucketSpec->computedMetaProjFields.cend()) { return false; - } else if (auto itr = bucketSpec->fieldSet.find(*bucketSpec->metaField); + } else if (auto itr = bucketSpec->fieldSet.find(*bucketSpec->metaField()); itr != bucketSpec->fieldSet.end()) { bucketSpec->fieldSet.erase(itr); return unpackerBehavior == BucketUnpacker::Behavior::kInclude; @@ -205,7 +228,7 @@ inline bool eraseMetaFromFieldSetAndDetermineIncludeMeta(BucketUnpacker::Behavio inline bool determineIncludeTimeField(BucketUnpacker::Behavior unpackerBehavior, BucketSpec* bucketSpec) { return (unpackerBehavior == BucketUnpacker::Behavior::kInclude) == - (bucketSpec->fieldSet.find(bucketSpec->timeField) != bucketSpec->fieldSet.end()); + (bucketSpec->fieldSet.find(bucketSpec->timeField()) != bucketSpec->fieldSet.end()); } /** diff --git a/src/mongo/db/exec/document_value/document.cpp b/src/mongo/db/exec/document_value/document.cpp index f47eff8b300..dd833b3c28f 100644 --- a/src/mongo/db/exec/document_value/document.cpp +++ b/src/mongo/db/exec/document_value/document.cpp @@ -27,8 +27,6 @@ * it in the license file. */ -#include "mongo/platform/basic.h" - #include "mongo/db/exec/document_value/document.h" #include <boost/functional/hash.hpp> @@ -164,7 +162,8 @@ bool DocumentStorageIterator::shouldSkipDeleted() { return false; } -Position DocumentStorage::findFieldInCache(StringData requested) const { +template <typename T> +Position DocumentStorage::findFieldInCache(T requested) const { int reqSize = requested.size(); // get size calculation out of the way if needed if (_numFields >= HASH_TAB_MIN) { // hash lookup @@ -192,14 +191,17 @@ Position DocumentStorage::findFieldInCache(StringData requested) const { // if we got here, there's no such field return Position(); } +template Position DocumentStorage::findFieldInCache<StringData>(StringData field) const; +template Position DocumentStorage::findFieldInCache<HashedFieldName>(HashedFieldName field) const; -Position DocumentStorage::findField(StringData requested, LookupPolicy policy) const { - if (auto pos = findFieldInCache(requested); pos.found() || policy == LookupPolicy::kCacheOnly) { +template <typename T> +Position DocumentStorage::findField(T field, LookupPolicy policy) const { + if (auto pos = findFieldInCache(field); pos.found() || policy == LookupPolicy::kCacheOnly) { return pos; } for (auto&& bsonElement : _bson) { - if (requested == bsonElement.fieldNameStringData()) { + if (field == bsonElement.fieldNameStringData()) { return const_cast<DocumentStorage*>(this)->constructInCache(bsonElement); } } @@ -207,6 +209,10 @@ Position DocumentStorage::findField(StringData requested, LookupPolicy policy) c // if we got here, there's no such field return Position(); } +template Position DocumentStorage::findField<StringData>(StringData field, + LookupPolicy policy) const; +template Position DocumentStorage::findField<HashedFieldName>(HashedFieldName field, + LookupPolicy policy) const; Position DocumentStorage::constructInCache(const BSONElement& elem) { auto savedModified = _modified; @@ -222,9 +228,10 @@ Position DocumentStorage::constructInCache(const BSONElement& elem) { return pos; } -Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) { +template <typename T> +Value& DocumentStorage::appendField(T field, ValueElement::Kind kind) { Position pos = getNextPosition(); - const int nameSize = name.size(); + const int nameSize = field.size(); // these are the same for everyone const Position nextCollision; @@ -245,7 +252,7 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) { append(nextCollision); append(nameSize); append(kind); - name.copyTo(dest, true); + field.copyTo(dest, true); // Padding for alignment handled above #undef append @@ -255,7 +262,7 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) { _numFields++; if (_numFields > HASH_TAB_MIN) { - addFieldToHashTable(pos); + addFieldToHashTable(field, pos); } else if (_numFields == HASH_TAB_MIN) { // adds all fields to hash table (including the one we just added) rehash(); @@ -263,13 +270,16 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) { return getField(pos).val; } +template Value& DocumentStorage::appendField<StringData>(StringData, ValueElement::Kind); +template Value& DocumentStorage::appendField<HashedFieldName>(HashedFieldName, ValueElement::Kind); // Call after adding field to _fields and increasing _numFields -void DocumentStorage::addFieldToHashTable(Position pos) { +template <typename T> +void DocumentStorage::addFieldToHashTable(T field, Position pos) { ValueElement& elem = getField(pos); elem.nextCollision = Position(); - const unsigned bucket = bucketForKey(elem.nameSD()); + const unsigned bucket = bucketForKey(field); Position* posPtr = &_hashTab[bucket]; while (posPtr->found()) { diff --git a/src/mongo/db/exec/document_value/document.h b/src/mongo/db/exec/document_value/document.h index 36c4106357d..8fcfb28dd8b 100644 --- a/src/mongo/db/exec/document_value/document.h +++ b/src/mongo/db/exec/document_value/document.h @@ -136,17 +136,16 @@ public: * Note that this method does *not* traverse nested documents and arrays, use getNestedField() * instead. */ - const Value operator[](StringData key) const { + template <typename T> + const Value operator[](T key) const { return getField(key); } - const Value getField(StringData key) const { + template <typename T> + const Value getField(T key) const { return storage().getField(key); } /// Look up a field by Position. See positionOf and getNestedField. - const Value operator[](Position pos) const { - return getField(pos); - } const Value getField(Position pos) const { return storage().getField(pos).val; } @@ -535,12 +534,18 @@ public: * Decide what level of support is needed for duplicate fields. * If duplicates are not allowed, consider removing this method. */ - void addField(StringData fieldName, const Value& val) { - storage().appendField(fieldName, ValueElement::Kind::kInserted) = val; + void addField(StringData name, const Value& val) { + storage().appendField(name, ValueElement::Kind::kInserted) = val; + } + void addField(HashedFieldName field, const Value& val) { + storage().appendField(field, ValueElement::Kind::kInserted) = val; } - void addField(StringData fieldName, Value&& val) { - storage().appendField(fieldName, ValueElement::Kind::kInserted) = std::move(val); + void addField(StringData name, Value&& val) { + storage().appendField(name, ValueElement::Kind::kInserted) = std::move(val); + } + void addField(HashedFieldName field, Value&& val) { + storage().appendField(field, ValueElement::Kind::kInserted) = std::move(val); } /** Update field by key. If there is no field with that key, add one. diff --git a/src/mongo/db/exec/document_value/document_internal.h b/src/mongo/db/exec/document_value/document_internal.h index 9be057fb522..062b7a4da00 100644 --- a/src/mongo/db/exec/document_value/document_internal.h +++ b/src/mongo/db/exec/document_value/document_internal.h @@ -29,10 +29,9 @@ #pragma once -#include <third_party/murmurhash3/MurmurHash3.h> - #include <bitset> #include <boost/intrusive_ptr.hpp> +#include <third_party/murmurhash3/MurmurHash3.h> #include "mongo/base/static_assert.h" #include "mongo/db/exec/document_value/document_metadata_fields.h" @@ -275,6 +274,85 @@ private: const ValueElement* _end; }; +/** + * Type that bundles the hashed field name along with the actual string so that hashing can be done + * outside of inserts and lookups and re-used across calls. + */ +class HashedFieldName { +public: + explicit HashedFieldName(StringData sd, std::size_t hash) : _sd(sd), _hash(hash) {} + explicit HashedFieldName(std::pair<StringData, std::size_t> pair) + : _sd(pair.first), _hash(pair.second) {} + + StringData key() const { + return _sd; + } + + std::size_t hash() const { + return _hash; + } + + std::size_t size() const { + return _sd.size(); + } + + inline void copyTo(char* dest, bool includeEndingNull) const { + return _sd.copyTo(dest, includeEndingNull); + } + + constexpr const char* rawData() const noexcept { + return _sd.rawData(); + } + +private: + StringData _sd; + std::size_t _hash; +}; + +inline bool operator==(HashedFieldName lhs, StringData rhs) { + return lhs.key() == rhs; +} + +inline bool operator==(StringData lhs, HashedFieldName rhs) { + return lhs == rhs.key(); +} + +inline bool operator==(HashedFieldName lhs, HashedFieldName rhs) { + return lhs.key() == rhs.key(); +} + +/** + * Hasher to support heterogeneous lookup for StringData and string-like elements. + */ +struct FieldNameHasher { + // This using directive activates heterogeneous lookup in the hash table + using is_transparent = void; + + std::size_t operator()(StringData sd) const { + // TODO consider FNV-1a once we have a better benchmark corpus + // Keep in sync with 'hashName' in BSONColumn implementation. + unsigned out; + MurmurHash3_x86_32(sd.rawData(), sd.size(), 0, &out); + return out; + } + + std::size_t operator()(const std::string& s) const { + return operator()(StringData(s)); + } + + std::size_t operator()(const char* s) const { + return operator()(StringData(s)); + } + + std::size_t operator()(HashedFieldName key) const { + return key.hash(); + } + + HashedFieldName hashedFieldName(StringData sd) { + return HashedFieldName(sd, operator()(sd)); + } +}; + /// Storage class used by both Document and MutableDocument class DocumentStorage : public RefCountable { public: @@ -334,13 +412,15 @@ public: }; /// Returns the position of the named field or Position() - Position findField(StringData name, LookupPolicy policy) const; + template <typename T> + Position findField(T field, LookupPolicy policy) const; // Document uses these const ValueElement& getField(Position pos) const { verify(pos.found()); return *(_firstElement->plusBytes(pos.index)); } + Value getField(StringData name) const { Position pos = findField(name, LookupPolicy::kCacheAndBSON); if (!pos.found()) @@ -348,12 +428,20 @@ public: return getField(pos).val; } + Value getField(HashedFieldName field) const { + Position pos = findField(field, LookupPolicy::kCacheAndBSON); + if (!pos.found()) + return Value(); + return getField(pos).val; + } + // MutableDocument uses these ValueElement& getField(Position pos) { _modified = true; verify(pos.found()); return *(_firstElement->plusBytes(pos.index)); } + Value& getField(StringData name, LookupPolicy policy) { _modified = true; Position pos = findField(name, policy); @@ -383,7 +471,8 @@ public: } /// Adds a new field with missing Value at the end of the document - Value& appendField(StringData name, ValueElement::Kind kind); + template <typename T> + Value& appendField(T field, ValueElement::Kind kind); /** Preallocates space for fields. Use this to attempt to prevent buffer growth. * This is only valid to call before anything is added to the document. @@ -487,11 +576,9 @@ public: _metadataFields = std::move(metadata); } - static unsigned hashKey(StringData name) { - // TODO consider FNV-1a once we have a better benchmark corpus - unsigned out; - MurmurHash3_x86_32(name.rawData(), name.size(), 0, &out); - return out; + template <typename T> + static unsigned hashKey(T name) { + return FieldNameHasher()(name); } const ValueElement* begin() const { @@ -519,13 +606,15 @@ public: private: /// Returns the position of the named field in the cache or Position() - Position findFieldInCache(StringData name) const; + template <typename T> + Position findFieldInCache(T name) const; /// Allocates space in _cache. Copies existing data if there is any. void alloc(unsigned newSize); /// Call after adding field to _cache and increasing _numFields - void addFieldToHashTable(Position pos); + template <typename T> + void addFieldToHashTable(T field, Position pos); // assumes _hashTabMask is (power of two) - 1 unsigned hashTabBuckets() const { @@ -545,15 +634,16 @@ private: memset(static_cast<void*>(_hashTab), -1, hashTabBytes()); } - unsigned bucketForKey(StringData name) const { - return hashKey(name) & _hashTabMask; + template <typename T> + unsigned bucketForKey(T field) const { + return hashKey(field) & _hashTabMask; } /// Adds all fields to the hash table void rehash() { hashTabInit(); for (auto it = iteratorCacheOnly(); !it.atEnd(); it.advance()) - addFieldToHashTable(it.position()); + addFieldToHashTable(getField(it.position()).nameSD(), it.position()); } void loadLazyMetadata() const; diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp index 8e0999becaa..d49267b7858 100644 --- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp +++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp @@ -280,7 +280,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF uassert(5346504, str::stream() << "timeField field must be a string, got: " << elem.type(), elem.type() == BSONType::String); - bucketSpec.timeField = elem.str(); + bucketSpec.setTimeField(elem.str()); hasTimeField = true; } else if (fieldName == timeseries::kMetaFieldName) { uassert(5346505, @@ -290,7 +290,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF uassert(5545700, str::stream() << "metaField field must be a single-element field path", metaField.find('.') == std::string::npos); - bucketSpec.metaField = std::move(metaField); + bucketSpec.setMetaField(std::move(metaField)); } else if (fieldName == kBucketMaxSpanSeconds) { uassert(5510600, str::stream() << "bucketMaxSpanSeconds field must be an integer, got: " @@ -357,7 +357,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF uassert(5612401, str::stream() << "timeField field must be a string, got: " << elem.type(), elem.type() == BSONType::String); - bucketSpec.timeField = elem.str(); + bucketSpec.setTimeField(elem.str()); hasTimeField = true; } else if (fieldName == timeseries::kMetaFieldName) { uassert(5612402, @@ -367,7 +367,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF uassert(5612403, str::stream() << "metaField field must be a single-element field path", metaField.find('.') == std::string::npos); - bucketSpec.metaField = std::move(metaField); + bucketSpec.setMetaField(std::move(metaField)); } else if (fieldName == kAssumeNoMixedSchemaData) { uassert(6067203, str::stream() << "assumeClean field must be a bool, got: " << elem.type(), @@ -402,16 +402,16 @@ void DocumentSourceInternalUnpackBucket::serializeToArray( if (((_bucketUnpacker.includeMetaField() && _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kInclude) || (!_bucketUnpacker.includeMetaField() && - _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kExclude && spec.metaField)) && + _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kExclude && spec.metaField())) && std::find(spec.computedMetaProjFields.cbegin(), spec.computedMetaProjFields.cend(), - *spec.metaField) == spec.computedMetaProjFields.cend()) - fields.emplace_back(*spec.metaField); + *spec.metaField()) == spec.computedMetaProjFields.cend()) + fields.emplace_back(*spec.metaField()); out.addField(behavior, Value{std::move(fields)}); - out.addField(timeseries::kTimeFieldName, Value{spec.timeField}); - if (spec.metaField) { - out.addField(timeseries::kMetaFieldName, Value{*spec.metaField}); + out.addField(timeseries::kTimeFieldName, Value{spec.timeField()}); + if (spec.metaField()) { + out.addField(timeseries::kMetaFieldName, Value{*spec.metaField()}); } out.addField(kBucketMaxSpanSeconds, Value{_bucketMaxSpanSeconds}); if (_assumeNoMixedSchemaData) @@ -472,7 +472,7 @@ bool DocumentSourceInternalUnpackBucket::pushDownComputedMetaProjection( if (std::next(itr) == container->end()) { return nextStageWasRemoved; } - if (!_bucketUnpacker.bucketSpec().metaField) { + if (!_bucketUnpacker.bucketSpec().metaField()) { return nextStageWasRemoved; } @@ -482,7 +482,7 @@ bool DocumentSourceInternalUnpackBucket::pushDownComputedMetaProjection( (nextTransform->getType() == TransformerInterface::TransformerType::kInclusionProjection || nextTransform->getType() == TransformerInterface::TransformerType::kComputedProjection)) { - auto& metaName = _bucketUnpacker.bucketSpec().metaField.get(); + auto& metaName = _bucketUnpacker.bucketSpec().metaField().get(); auto [addFieldsSpec, deleteStage] = nextTransform->extractComputedProjections(metaName, timeseries::kBucketMetaFieldName.toString(), @@ -655,9 +655,9 @@ std::unique_ptr<MatchExpression> createComparisonPredicate( } // We must avoid mapping predicates on the meta field onto the control field. - if (bucketSpec.metaField && - (matchExprPath == bucketSpec.metaField.get() || - expression::isPathPrefixOf(bucketSpec.metaField.get(), matchExprPath))) + if (bucketSpec.metaField() && + (matchExprPath == bucketSpec.metaField().get() || + expression::isPathPrefixOf(bucketSpec.metaField().get(), matchExprPath))) return nullptr; // We must avoid mapping predicates on fields computed via $addFields or a computed $project. @@ -665,7 +665,7 @@ std::unique_ptr<MatchExpression> createComparisonPredicate( return nullptr; } - const auto isTimeField = (matchExprPath == bucketSpec.timeField); + const auto isTimeField = (matchExprPath == bucketSpec.timeField()); if (isTimeField && matchExprData.type() != BSONType::Date) { // Users are not allowed to insert non-date measurements into time field. So this query // would not match anything. We do not need to optimize for this case. @@ -874,7 +874,7 @@ DocumentSourceInternalUnpackBucket::createPredicatesOnBucketLevelField( std::pair<boost::intrusive_ptr<DocumentSourceMatch>, boost::intrusive_ptr<DocumentSourceMatch>> DocumentSourceInternalUnpackBucket::splitMatchOnMetaAndRename( boost::intrusive_ptr<DocumentSourceMatch> match) { - if (auto&& metaField = _bucketUnpacker.bucketSpec().metaField) { + if (auto&& metaField = _bucketUnpacker.bucketSpec().metaField()) { return std::move(*match).extractMatchOnFieldsAndRemainder( {*metaField}, {{*metaField, timeseries::kBucketMetaFieldName.toString()}}); } @@ -884,10 +884,10 @@ DocumentSourceInternalUnpackBucket::splitMatchOnMetaAndRename( std::pair<BSONObj, bool> DocumentSourceInternalUnpackBucket::extractProjectForPushDown( DocumentSource* src) const { if (auto nextProject = dynamic_cast<DocumentSourceSingleDocumentTransformation*>(src); - _bucketUnpacker.bucketSpec().metaField && nextProject && + _bucketUnpacker.bucketSpec().metaField() && nextProject && nextProject->getType() == TransformerInterface::TransformerType::kExclusionProjection) { return nextProject->extractProjectOnFieldAndRename( - _bucketUnpacker.bucketSpec().metaField.get(), timeseries::kBucketMetaFieldName); + _bucketUnpacker.bucketSpec().metaField().get(), timeseries::kBucketMetaFieldName); } return {BSONObj{}, false}; @@ -902,7 +902,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain } const auto& idFields = groupPtr->getIdFields(); - if (idFields.size() != 1 || !_bucketUnpacker.bucketSpec().metaField.has_value()) { + if (idFields.size() != 1 || !_bucketUnpacker.bucketSpec().metaField().has_value()) { return {}; } @@ -914,7 +914,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain const auto& idPath = exprIdPath->getFieldPath(); if (idPath.getPathLength() < 2 || - idPath.getFieldName(1) != _bucketUnpacker.bucketSpec().metaField.get()) { + idPath.getFieldName(1) != _bucketUnpacker.bucketSpec().metaField().get()) { return {}; } @@ -935,7 +935,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain if (const auto* exprArgPath = dynamic_cast<const ExpressionFieldPath*>(exprArg)) { const auto& path = exprArgPath->getFieldPath(); if (path.getPathLength() <= 1 || - path.getFieldName(1) == _bucketUnpacker.bucketSpec().timeField) { + path.getFieldName(1) == _bucketUnpacker.bucketSpec().timeField()) { // Rewrite not valid for time field. We want to eliminate the bucket // unpack stage here. suitable = false; @@ -1007,12 +1007,13 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi // Some optimizations may not be safe to do if we have computed the metaField via an $addFields // or a computed $project. We won't do those optimizations if 'haveComputedMetaField' is true. - bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField && - fieldIsComputed(_bucketUnpacker.bucketSpec(), _bucketUnpacker.bucketSpec().metaField.get()); + bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField() && + fieldIsComputed(_bucketUnpacker.bucketSpec(), + _bucketUnpacker.bucketSpec().metaField().get()); // Before any other rewrites for the current stage, consider reordering with $sort. if (auto sortPtr = dynamic_cast<DocumentSourceSort*>(std::next(itr)->get())) { - if (auto metaField = _bucketUnpacker.bucketSpec().metaField; + if (auto metaField = _bucketUnpacker.bucketSpec().metaField(); metaField && !haveComputedMetaField) { if (checkMetadataSortReorder(sortPtr->getSortKeyPattern(), metaField.get())) { // We have a sort on metadata field following this stage. Reorder the two stages @@ -1056,7 +1057,7 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi "Must not specify 'query' for $geoNear on a time-series collection; use $match instead", nextNear->getQuery().binaryEqual(BSONObj())); - auto metaField = _bucketUnpacker.bucketSpec().metaField; + auto metaField = _bucketUnpacker.bucketSpec().metaField(); if (metaField && *metaField == keyField->front()) { // Make sure we actually re-write the key field for the buckets collection so we can // locate the index. @@ -1111,8 +1112,8 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi auto deps = Pipeline::getDependenciesForContainer( pExpCtx, Pipeline::SourceContainer{std::next(itr), container->end()}, boost::none); if (deps.hasNoRequirements()) { - _bucketUnpacker.setBucketSpecAndBehavior({_bucketUnpacker.bucketSpec().timeField, - _bucketUnpacker.bucketSpec().metaField, + _bucketUnpacker.setBucketSpecAndBehavior({_bucketUnpacker.bucketSpec().timeField(), + _bucketUnpacker.bucketSpec().metaField(), {}}, BucketUnpacker::Behavior::kInclude); diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index 5e24100be98..d7afc3680bf 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -2426,10 +2426,10 @@ Value ExpressionFieldPath::evaluatePath(size_t index, const Document& input) con /* if we've hit the end of the path, stop */ if (index == _fieldPath.getPathLength() - 1) - return input[_fieldPath.getFieldName(index)]; + return input[_fieldPath.getFieldNameHashed(index)]; // Try to dive deeper - const Value val = input[_fieldPath.getFieldName(index)]; + const Value val = input[_fieldPath.getFieldNameHashed(index)]; switch (val.getType()) { case Object: return evaluatePath(index + 1, val.getDocument()); diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp index 208f60a515b..ab00617bbd3 100644 --- a/src/mongo/db/pipeline/field_path.cpp +++ b/src/mongo/db/pipeline/field_path.cpp @@ -70,7 +70,9 @@ string FieldPath::getFullyQualifiedPath(StringData prefix, StringData suffix) { } FieldPath::FieldPath(std::string inputPath) - : _fieldPath(std::move(inputPath)), _fieldPathDotPosition{string::npos} { + : _fieldPath(std::move(inputPath)), + _fieldPathDotPosition{string::npos}, + _fieldHash{kHashUninitialized} { uassert(40352, "FieldPath cannot be constructed with empty string", !_fieldPath.empty()); uassert(40353, "FieldPath must not end with a '.'.", _fieldPath[_fieldPath.size() - 1] != '.'); @@ -79,6 +81,7 @@ FieldPath::FieldPath(std::string inputPath) size_t startPos = 0; while (string::npos != (dotPos = _fieldPath.find('.', startPos))) { _fieldPathDotPosition.push_back(dotPos); + _fieldHash.push_back(kHashUninitialized); startPos = dotPos + 1; } @@ -139,19 +142,26 @@ FieldPath FieldPath::concat(const FieldPath& tail) const { head._fieldPathDotPosition.size() + tail._fieldPathDotPosition.size() - 2 + 1; newDots.reserve(expectedDotSize); + std::vector<size_t> newHashes; + // We don't need the extra entry in hashes. + newHashes.reserve(expectedDotSize - 1); + // The first one in head._fieldPathDotPosition is npos. The last one, is, conveniently, the // size of head fieldPath, which also happens to be the index at which we added a new dot. newDots.insert( newDots.begin(), head._fieldPathDotPosition.begin(), head._fieldPathDotPosition.end()); + newHashes.insert(newHashes.begin(), head._fieldHash.begin(), head._fieldHash.end()); invariant(tail._fieldPathDotPosition.size() >= 2); for (size_t i = 1; i < tail._fieldPathDotPosition.size(); ++i) { // Move each index back by size of the first field path, plus one, for the newly added dot. newDots.push_back(tail._fieldPathDotPosition[i] + head._fieldPath.size() + 1); + newHashes.push_back(tail._fieldHash[i - 1]); } invariant(newDots.back() == concat.size()); invariant(newDots.size() == expectedDotSize); + invariant(newHashes.size() == expectedDotSize - 1); - return FieldPath(std::move(concat), std::move(newDots)); + return FieldPath(std::move(concat), std::move(newDots), std::move(newHashes)); } } // namespace mongo diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h index 994e23554b3..d2ee93734e7 100644 --- a/src/mongo/db/pipeline/field_path.h +++ b/src/mongo/db/pipeline/field_path.h @@ -29,11 +29,13 @@ #pragma once +#include <limits> #include <string> #include <vector> #include "mongo/base/string_data.h" #include "mongo/bson/bson_depth.h" +#include "mongo/db/exec/document_value/document_internal.h" #include "mongo/util/assert_util.h" namespace mongo { @@ -111,6 +113,20 @@ public: } /** + * Return the ith field name from this path using zero-based indexes, with pre-computed hash. + */ + HashedFieldName getFieldNameHashed(size_t i) const { + dassert(i < getPathLength()); + const auto begin = _fieldPathDotPosition[i] + 1; + const auto end = _fieldPathDotPosition[i + 1]; + StringData fieldName{&_fieldPath[begin], end - begin}; + if (_fieldHash[i] == kHashUninitialized) { + _fieldHash[i] = FieldNameHasher()(fieldName); + } + return HashedFieldName{fieldName, _fieldHash[i]}; + } + + /** * Returns the full path, not including the prefix 'FieldPath::prefix'. */ const std::string& fullPath() const { @@ -142,8 +158,10 @@ public: FieldPath concat(const FieldPath& tail) const; private: - FieldPath(std::string string, std::vector<size_t> dots) - : _fieldPath(std::move(string)), _fieldPathDotPosition(std::move(dots)) { + FieldPath(std::string string, std::vector<size_t> dots, std::vector<size_t> hashes) + : _fieldPath(std::move(string)), + _fieldPathDotPosition(std::move(dots)), + _fieldHash(std::move(hashes)) { uassert(ErrorCodes::Overflow, "FieldPath is too long", _fieldPathDotPosition.size() <= BSONDepth::getMaxAllowableDepth()); @@ -158,6 +176,11 @@ private: // string::npos (which evaluates to -1) and the last contains _fieldPath.size() to facilitate // lookup. std::vector<size_t> _fieldPathDotPosition; + + // Contains the cached hash value for the field. Will initially be set to 'kHashUninitialized', + // and only generated when it is first retrieved via 'getFieldNameHashed'. + mutable std::vector<size_t> _fieldHash; + static constexpr std::size_t kHashUninitialized = std::numeric_limits<std::size_t>::max(); }; inline bool operator<(const FieldPath& lhs, const FieldPath& rhs) { |