author     Dan Larkin-York <dan.larkin-york@mongodb.com>      2022-01-13 13:53:05 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>   2022-01-13 14:38:25 +0000
commit     c67e0f941ae7278478e250c7692ef76b4a65953d (patch)
tree       9d97f6033cf6a38bdcef7a5d747572c8f3f402c2
parent     429faa23081c2f97e953ca878368dca189df9986 (diff)
download   mongo-c67e0f941ae7278478e250c7692ef76b4a65953d.tar.gz
SERVER-62499 Augment DocumentStorage APIs to accept pre-computed hashes
-rw-r--r--  src/mongo/bson/util/bsoncolumn.cpp                                 15
-rw-r--r--  src/mongo/bson/util/bsoncolumn.h                                   10
-rw-r--r--  src/mongo/db/exec/bucket_unpacker.cpp                             109
-rw-r--r--  src/mongo/db/exec/bucket_unpacker.h                                37
-rw-r--r--  src/mongo/db/exec/document_value/document.cpp                      34
-rw-r--r--  src/mongo/db/exec/document_value/document.h                        23
-rw-r--r--  src/mongo/db/exec/document_value/document_internal.h              118
-rw-r--r--  src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp   57
-rw-r--r--  src/mongo/db/pipeline/expression.cpp                                4
-rw-r--r--  src/mongo/db/pipeline/field_path.cpp                               14
-rw-r--r--  src/mongo/db/pipeline/field_path.h                                 27
11 files changed, 356 insertions, 92 deletions
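
The change has one theme: hash a field name once, then reuse that hash for every subsequent insert and lookup, instead of re-hashing the name for each of the potentially thousands of measurements unpacked from a time-series bucket. A minimal standalone sketch of the pattern follows (illustrative names only; hashName, Row, and the std::hash stand-in for MurmurHash3 are not MongoDB APIs):

    #include <cstddef>
    #include <functional>
    #include <string_view>
    #include <vector>

    // Stand-in for the MurmurHash3-based FieldNameHasher this commit adds.
    std::size_t hashName(std::string_view name) {
        return std::hash<std::string_view>{}(name);
    }

    struct Row {
        std::size_t nameHash;  // pre-computed, passed alongside the name
        std::string_view name;
        int value;
    };

    int main() {
        std::string_view field = "temperature";
        std::vector<Row> unpacked;

        // After this commit: hash once per bucket and reuse it for every
        // measurement, rather than re-hashing inside each addField() call.
        const std::size_t h = hashName(field);
        for (int i = 0; i < 1000; ++i)
            unpacked.push_back({h, field, i});
    }
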
diff --git a/src/mongo/bson/util/bsoncolumn.cpp b/src/mongo/bson/util/bsoncolumn.cpp
index c681ba3ddae..e13d1fd80ae 100644
--- a/src/mongo/bson/util/bsoncolumn.cpp
+++ b/src/mongo/bson/util/bsoncolumn.cpp
@@ -28,12 +28,14 @@
*/
#include "mongo/bson/util/bsoncolumn.h"
+
+#include <algorithm>
+#include <third_party/murmurhash3/MurmurHash3.h>
+
#include "mongo/bson/bsonobj.h"
#include "mongo/bson/util/bsoncolumn_util.h"
#include "mongo/bson/util/simple8b_type_util.h"
-#include <algorithm>
-
namespace mongo {
using namespace bsoncolumn;
@@ -102,6 +104,13 @@ private:
ElementFunc _elemFunc;
};
+std::size_t hashName(StringData sd) {
+ // Keep in sync with FieldNameHasher in document_internal.h
+ unsigned out;
+ MurmurHash3_x86_32(sd.rawData(), sd.size(), 0, &out);
+ return out;
+}
+
} // namespace
BSONColumn::ElementStorage::Element::Element(char* buffer, int nameSize, int valueSize)
@@ -761,6 +770,7 @@ BSONColumn::BSONColumn(BSONElement bin) {
_binary = bin.binData(_size);
_name = bin.fieldNameStringData().toString();
+ _nameHash = hashName(_name);
_init();
}
@@ -769,6 +779,7 @@ BSONColumn::BSONColumn(BSONBinData bin, StringData name) {
_binary = static_cast<const char*>(bin.data);
_size = bin.length;
_name = name.toString();
+ _nameHash = hashName(_name);
_init();
}
diff --git a/src/mongo/bson/util/bsoncolumn.h b/src/mongo/bson/util/bsoncolumn.h
index f69b52c2f92..511090c3b3e 100644
--- a/src/mongo/bson/util/bsoncolumn.h
+++ b/src/mongo/bson/util/bsoncolumn.h
@@ -263,6 +263,15 @@ public:
return _name;
}
+ /**
+ * Field name that this BSONColumn represents, paired with its pre-computed hash.
+ *
+ * O(1) time complexity
+ */
+ std::pair<StringData, std::size_t> nameHashed() const {
+ return {_name, _nameHash};
+ }
+
private:
/**
* BSONElement storage, owns materialised BSONElement returned by BSONColumn.
@@ -405,5 +414,6 @@ private:
bool _fullyDecompressed = false;
std::string _name;
+ std::size_t _nameHash;
};
} // namespace mongo
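
The new nameHashed() accessor returns the name paired with its hash so call sites can build a HashedFieldName without re-hashing; bucket_unpacker.cpp consumes it as HashedFieldName{fieldColumn.column.nameHashed()}. A simplified sketch of the accessor shape (Column here is illustrative, not the real BSONColumn):

    #include <cstddef>
    #include <string>
    #include <string_view>
    #include <utility>

    class Column {
    public:
        // Pair the cached name with its hash; both were computed once at
        // construction, so this accessor is O(1).
        std::pair<std::string_view, std::size_t> nameHashed() const {
            return {_name, _nameHash};
        }

    private:
        std::string _name;
        std::size_t _nameHash = 0;
    };
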
diff --git a/src/mongo/db/exec/bucket_unpacker.cpp b/src/mongo/db/exec/bucket_unpacker.cpp
index 7f479ce763e..782d2ff5235 100644
--- a/src/mongo/db/exec/bucket_unpacker.cpp
+++ b/src/mongo/db/exec/bucket_unpacker.cpp
@@ -27,10 +27,9 @@
* it in the license file.
*/
-#include "mongo/platform/basic.h"
+#include "mongo/db/exec/bucket_unpacker.h"
#include "mongo/bson/util/bsoncolumn.h"
-#include "mongo/db/exec/bucket_unpacker.h"
#include "mongo/db/timeseries/timeseries_constants.h"
namespace mongo {
@@ -163,12 +162,12 @@ bool BucketUnpackerV1::getNext(MutableDocument& measurement,
bool includeMetaField) {
auto&& timeElem = _timeFieldIter.next();
if (includeTimeField) {
- measurement.addField(spec.timeField, Value{timeElem});
+ measurement.addField(spec.timeFieldHashed(), Value{timeElem});
}
// Includes metaField when we're instructed to do so and metaField value exists.
if (includeMetaField && !metaValue.missing()) {
- measurement.addField(*spec.metaField, metaValue);
+ measurement.addField(*spec.metaFieldHashed(), metaValue);
}
auto& currentIdx = timeElem.fieldNameStringData();
@@ -195,7 +194,7 @@ void BucketUnpackerV1::extractSingleMeasurement(MutableDocument& measurement,
auto&& dataRegion = bucket.getField(timeseries::kBucketDataFieldName).Obj();
if (includeMetaField && !metaValue.missing()) {
- measurement.addField(*spec.metaField, metaValue);
+ measurement.addField(*spec.metaFieldHashed(), metaValue);
}
for (auto&& dataElem : dataRegion) {
@@ -283,13 +282,13 @@ bool BucketUnpackerV2::getNext(MutableDocument& measurement,
// Get element and increment iterator
const auto& timeElem = *_timeColumn.it;
if (includeTimeField) {
- measurement.addField(spec.timeField, Value{timeElem});
+ measurement.addField(spec.timeFieldHashed(), Value{timeElem});
}
++_timeColumn.it;
// Includes metaField when we're instructed to do so and metaField value exists.
if (includeMetaField && !metaValue.missing()) {
- measurement.addField(*spec.metaField, metaValue);
+ measurement.addField(*spec.metaFieldHashed(), metaValue);
}
for (auto& fieldColumn : _fieldColumns) {
@@ -299,7 +298,7 @@ bool BucketUnpackerV2::getNext(MutableDocument& measurement,
const BSONElement& elem = *fieldColumn.it;
// EOO represents missing field
if (!elem.eoo()) {
- measurement.addField(fieldColumn.column.name(), Value{elem});
+ measurement.addField(HashedFieldName{fieldColumn.column.nameHashed()}, Value{elem});
}
++fieldColumn.it;
}
@@ -319,18 +318,18 @@ void BucketUnpackerV2::extractSingleMeasurement(MutableDocument& measurement,
auto val = _timeColumn.column[j];
uassert(
6067500, "Bucket unexpectedly contained fewer values than count", val && !val->eoo());
- measurement.addField(_timeColumn.column.name(), Value{*val});
+ measurement.addField(spec.timeFieldHashed(), Value{*val});
}
if (includeMetaField && !metaValue.missing()) {
- measurement.addField(*spec.metaField, metaValue);
+ measurement.addField(*spec.metaFieldHashed(), metaValue);
}
if (includeTimeField) {
for (auto& fieldColumn : _fieldColumns) {
auto val = fieldColumn.column[j];
uassert(6067600, "Bucket unexpectedly contained fewer values than count", val);
- measurement.addField(fieldColumn.column.name(), Value{*val});
+ measurement.addField(HashedFieldName{fieldColumn.column.nameHashed()}, Value{*val});
}
}
}
@@ -361,6 +360,88 @@ void eraseExcludedComputedMetaProjFields(BucketUnpacker::Behavior unpackerBehavi
} // namespace
+BucketSpec::BucketSpec(const std::string& timeField,
+ const boost::optional<std::string>& metaField,
+ const std::set<std::string>& fields,
+ const std::vector<std::string>& computedProjections)
+ : fieldSet(fields),
+ computedMetaProjFields(computedProjections),
+ _timeField(timeField),
+ _timeFieldHashed(FieldNameHasher().hashedFieldName(_timeField)),
+ _metaField(metaField) {
+ if (_metaField) {
+ _metaFieldHashed = FieldNameHasher().hashedFieldName(*_metaField);
+ }
+}
+
+BucketSpec::BucketSpec(const BucketSpec& other)
+ : fieldSet(other.fieldSet),
+ computedMetaProjFields(other.computedMetaProjFields),
+ _timeField(other._timeField),
+ _timeFieldHashed(HashedFieldName{_timeField, other._timeFieldHashed->hash()}),
+ _metaField(other._metaField) {
+ if (_metaField) {
+ _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()};
+ }
+}
+
+BucketSpec::BucketSpec(BucketSpec&& other)
+ : fieldSet(std::move(other.fieldSet)),
+ computedMetaProjFields(std::move(other.computedMetaProjFields)),
+ _timeField(std::move(other._timeField)),
+ _timeFieldHashed(HashedFieldName{_timeField, other._timeFieldHashed->hash()}),
+ _metaField(std::move(other._metaField)) {
+ if (_metaField) {
+ _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()};
+ }
+}
+
+BucketSpec& BucketSpec::operator=(const BucketSpec& other) {
+ if (&other != this) {
+ fieldSet = other.fieldSet;
+ computedMetaProjFields = other.computedMetaProjFields;
+ _timeField = other._timeField;
+ _timeFieldHashed = HashedFieldName{_timeField, other._timeFieldHashed->hash()};
+ _metaField = other._metaField;
+ if (_metaField) {
+ _metaFieldHashed = HashedFieldName{*_metaField, other._metaFieldHashed->hash()};
+ }
+ }
+ return *this;
+}
+
+void BucketSpec::setTimeField(std::string&& name) {
+ _timeField = std::move(name);
+ _timeFieldHashed = FieldNameHasher().hashedFieldName(_timeField);
+}
+
+const std::string& BucketSpec::timeField() const {
+ return _timeField;
+}
+
+HashedFieldName BucketSpec::timeFieldHashed() const {
+ invariant(_timeFieldHashed->key().rawData() == _timeField.data());
+ invariant(_timeFieldHashed->key() == _timeField);
+ return *_timeFieldHashed;
+}
+
+void BucketSpec::setMetaField(boost::optional<std::string>&& name) {
+ _metaField = std::move(name);
+ if (_metaField) {
+ _metaFieldHashed = FieldNameHasher().hashedFieldName(*_metaField);
+ } else {
+ _metaFieldHashed = boost::none;
+ }
+}
+
+const boost::optional<std::string>& BucketSpec::metaField() const {
+ return _metaField;
+}
+
+boost::optional<HashedFieldName> BucketSpec::metaFieldHashed() const {
+ return _metaFieldHashed;
+}
+
BucketUnpacker::BucketUnpacker() = default;
BucketUnpacker::BucketUnpacker(BucketUnpacker&& other) = default;
BucketUnpacker::~BucketUnpacker() = default;
@@ -437,13 +518,13 @@ void BucketUnpacker::reset(BSONObj&& bucket) {
return;
}
- auto&& timeFieldElem = dataRegion.getField(_spec.timeField);
+ auto&& timeFieldElem = dataRegion.getField(_spec.timeField());
uassert(5346700,
"The $_internalUnpackBucket stage requires the data region to have a timeField object",
timeFieldElem);
_metaValue = Value{_bucket[timeseries::kBucketMetaFieldName]};
- if (_spec.metaField) {
+ if (_spec.metaField()) {
// The spec indicates that there might be a metadata region. Missing metadata in
// measurements is expressed with missing metadata in a bucket. But we disallow undefined
// since the undefined BSON type is deprecated.
@@ -489,7 +570,7 @@ void BucketUnpacker::reset(BSONObj&& bucket) {
// include or exclude case.
for (auto&& elem : dataRegion) {
auto& colName = elem.fieldNameStringData();
- if (colName == _spec.timeField) {
+ if (colName == _spec.timeField()) {
// Skip adding a FieldIterator for the timeField since the timestamp value from
// _timeFieldIter can be placed accordingly in the materialized measurement.
continue;
diff --git a/src/mongo/db/exec/bucket_unpacker.h b/src/mongo/db/exec/bucket_unpacker.h
index da057735691..2f5fd70b785 100644
--- a/src/mongo/db/exec/bucket_unpacker.h
+++ b/src/mongo/db/exec/bucket_unpacker.h
@@ -29,6 +29,7 @@
#pragma once
+
#include <algorithm>
#include <set>
@@ -39,14 +40,29 @@ namespace mongo {
/**
* Carries parameters for unpacking a bucket.
*/
-struct BucketSpec {
+class BucketSpec {
+public:
+ BucketSpec() = default;
+ BucketSpec(const std::string& timeField,
+ const boost::optional<std::string>& metaField,
+ const std::set<std::string>& fields = {},
+ const std::vector<std::string>& computedProjections = {});
+ BucketSpec(const BucketSpec&);
+ BucketSpec(BucketSpec&&);
+
+ BucketSpec& operator=(const BucketSpec&);
+
// The user-supplied timestamp field name specified during time-series collection creation.
- std::string timeField;
+ void setTimeField(std::string&& field);
+ const std::string& timeField() const;
+ HashedFieldName timeFieldHashed() const;
// An optional user-supplied metadata field name specified during time-series collection
// creation. This field name is used during materialization of metadata fields of a measurement
// after unpacking.
- boost::optional<std::string> metaField;
+ void setMetaField(boost::optional<std::string>&& field);
+ const boost::optional<std::string>& metaField() const;
+ boost::optional<HashedFieldName> metaFieldHashed() const;
// The set of field names in the data region that should be included or excluded.
std::set<std::string> fieldSet;
@@ -54,6 +70,13 @@ struct BucketSpec {
// Vector of computed meta field projection names. Added at the end of materialized
// measurements.
std::vector<std::string> computedMetaProjFields;
+
+private:
+ std::string _timeField;
+ boost::optional<HashedFieldName> _timeFieldHashed;
+
+ boost::optional<std::string> _metaField = boost::none;
+ boost::optional<HashedFieldName> _metaFieldHashed = boost::none;
};
/**
@@ -185,12 +208,12 @@ private:
*/
inline bool eraseMetaFromFieldSetAndDetermineIncludeMeta(BucketUnpacker::Behavior unpackerBehavior,
BucketSpec* bucketSpec) {
- if (!bucketSpec->metaField ||
+ if (!bucketSpec->metaField() ||
std::find(bucketSpec->computedMetaProjFields.cbegin(),
bucketSpec->computedMetaProjFields.cend(),
- *bucketSpec->metaField) != bucketSpec->computedMetaProjFields.cend()) {
+ *bucketSpec->metaField()) != bucketSpec->computedMetaProjFields.cend()) {
return false;
- } else if (auto itr = bucketSpec->fieldSet.find(*bucketSpec->metaField);
+ } else if (auto itr = bucketSpec->fieldSet.find(*bucketSpec->metaField());
itr != bucketSpec->fieldSet.end()) {
bucketSpec->fieldSet.erase(itr);
return unpackerBehavior == BucketUnpacker::Behavior::kInclude;
@@ -205,7 +228,7 @@ inline bool eraseMetaFromFieldSetAndDetermineIncludeMeta(BucketUnpacker::Behavio
inline bool determineIncludeTimeField(BucketUnpacker::Behavior unpackerBehavior,
BucketSpec* bucketSpec) {
return (unpackerBehavior == BucketUnpacker::Behavior::kInclude) ==
- (bucketSpec->fieldSet.find(bucketSpec->timeField) != bucketSpec->fieldSet.end());
+ (bucketSpec->fieldSet.find(bucketSpec->timeField()) != bucketSpec->fieldSet.end());
}
/**
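
BucketSpec keeps each field name and its hash in lock-step by funneling every write through a setter that eagerly recomputes the hash; the copy and move operations likewise rebuild the HashedFieldName so its StringData always points into this object's own string rather than the source's. A simplified sketch of that eager-setter invariant (Spec and Hashed are illustrative stand-ins, with std::hash in place of FieldNameHasher):

    #include <cstddef>
    #include <functional>
    #include <string>
    #include <string_view>

    struct Hashed {
        std::string_view key;  // must view the owning object's buffer
        std::size_t hash = 0;
    };

    class Spec {
    public:
        Spec() = default;
        Spec(const Spec& other) : _timeField(other._timeField) {
            // Rebuild rather than copy: reuse the hash, but re-point the
            // view at *this* object's string.
            _timeFieldHashed = Hashed{_timeField, other._timeFieldHashed.hash};
        }

        void setTimeField(std::string&& name) {
            _timeField = std::move(name);
            // Recompute eagerly so the accessor below stays O(1) and valid.
            _timeFieldHashed =
                Hashed{_timeField, std::hash<std::string>{}(_timeField)};
        }

        Hashed timeFieldHashed() const {
            return _timeFieldHashed;
        }

    private:
        std::string _timeField;
        Hashed _timeFieldHashed;
    };
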
diff --git a/src/mongo/db/exec/document_value/document.cpp b/src/mongo/db/exec/document_value/document.cpp
index f47eff8b300..dd833b3c28f 100644
--- a/src/mongo/db/exec/document_value/document.cpp
+++ b/src/mongo/db/exec/document_value/document.cpp
@@ -27,8 +27,6 @@
* it in the license file.
*/
-#include "mongo/platform/basic.h"
-
#include "mongo/db/exec/document_value/document.h"
#include <boost/functional/hash.hpp>
@@ -164,7 +162,8 @@ bool DocumentStorageIterator::shouldSkipDeleted() {
return false;
}
-Position DocumentStorage::findFieldInCache(StringData requested) const {
+template <typename T>
+Position DocumentStorage::findFieldInCache(T requested) const {
int reqSize = requested.size(); // get size calculation out of the way if needed
if (_numFields >= HASH_TAB_MIN) { // hash lookup
@@ -192,14 +191,17 @@ Position DocumentStorage::findFieldInCache(StringData requested) const {
// if we got here, there's no such field
return Position();
}
+template Position DocumentStorage::findFieldInCache<StringData>(StringData field) const;
+template Position DocumentStorage::findFieldInCache<HashedFieldName>(HashedFieldName field) const;
-Position DocumentStorage::findField(StringData requested, LookupPolicy policy) const {
- if (auto pos = findFieldInCache(requested); pos.found() || policy == LookupPolicy::kCacheOnly) {
+template <typename T>
+Position DocumentStorage::findField(T field, LookupPolicy policy) const {
+ if (auto pos = findFieldInCache(field); pos.found() || policy == LookupPolicy::kCacheOnly) {
return pos;
}
for (auto&& bsonElement : _bson) {
- if (requested == bsonElement.fieldNameStringData()) {
+ if (field == bsonElement.fieldNameStringData()) {
return const_cast<DocumentStorage*>(this)->constructInCache(bsonElement);
}
}
@@ -207,6 +209,10 @@ Position DocumentStorage::findField(StringData requested, LookupPolicy policy) c
// if we got here, there's no such field
return Position();
}
+template Position DocumentStorage::findField<StringData>(StringData field,
+ LookupPolicy policy) const;
+template Position DocumentStorage::findField<HashedFieldName>(HashedFieldName field,
+ LookupPolicy policy) const;
Position DocumentStorage::constructInCache(const BSONElement& elem) {
auto savedModified = _modified;
@@ -222,9 +228,10 @@ Position DocumentStorage::constructInCache(const BSONElement& elem) {
return pos;
}
-Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
+template <typename T>
+Value& DocumentStorage::appendField(T field, ValueElement::Kind kind) {
Position pos = getNextPosition();
- const int nameSize = name.size();
+ const int nameSize = field.size();
// these are the same for everyone
const Position nextCollision;
@@ -245,7 +252,7 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
append(nextCollision);
append(nameSize);
append(kind);
- name.copyTo(dest, true);
+ field.copyTo(dest, true);
// Padding for alignment handled above
#undef append
@@ -255,7 +262,7 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
_numFields++;
if (_numFields > HASH_TAB_MIN) {
- addFieldToHashTable(pos);
+ addFieldToHashTable(field, pos);
} else if (_numFields == HASH_TAB_MIN) {
// adds all fields to hash table (including the one we just added)
rehash();
@@ -263,13 +270,16 @@ Value& DocumentStorage::appendField(StringData name, ValueElement::Kind kind) {
return getField(pos).val;
}
+template Value& DocumentStorage::appendField<StringData>(StringData, ValueElement::Kind);
+template Value& DocumentStorage::appendField<HashedFieldName>(HashedFieldName, ValueElement::Kind);
// Call after adding field to _fields and increasing _numFields
-void DocumentStorage::addFieldToHashTable(Position pos) {
+template <typename T>
+void DocumentStorage::addFieldToHashTable(T field, Position pos) {
ValueElement& elem = getField(pos);
elem.nextCollision = Position();
- const unsigned bucket = bucketForKey(elem.nameSD());
+ const unsigned bucket = bucketForKey(field);
Position* posPtr = &_hashTab[bucket];
while (posPtr->found()) {
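
document.cpp turns findFieldInCache, findField, appendField, and addFieldToHashTable into templates while keeping their definitions out of the header; the explicit instantiations for StringData and HashedFieldName after each definition are what let other translation units link against them. A minimal sketch of that idiom, using hypothetical names:

    // lookup.h -- declaration only; the template body lives in the .cpp.
    #include <cstddef>
    template <typename T>
    std::size_t keySize(T key);

    // lookup.cpp -- definition plus explicit instantiations. Only these two
    // specializations are emitted; calling keySize() with any other type
    // compiles but fails at link time.
    #include <string>
    #include <string_view>

    template <typename T>
    std::size_t keySize(T key) {
        return key.size();
    }
    template std::size_t keySize<std::string_view>(std::string_view);
    template std::size_t keySize<std::string>(std::string);
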
diff --git a/src/mongo/db/exec/document_value/document.h b/src/mongo/db/exec/document_value/document.h
index 36c4106357d..8fcfb28dd8b 100644
--- a/src/mongo/db/exec/document_value/document.h
+++ b/src/mongo/db/exec/document_value/document.h
@@ -136,17 +136,16 @@ public:
* Note that this method does *not* traverse nested documents and arrays, use getNestedField()
* instead.
*/
- const Value operator[](StringData key) const {
+ template <typename T>
+ const Value operator[](T key) const {
return getField(key);
}
- const Value getField(StringData key) const {
+ template <typename T>
+ const Value getField(T key) const {
return storage().getField(key);
}
/// Look up a field by Position. See positionOf and getNestedField.
- const Value operator[](Position pos) const {
- return getField(pos);
- }
const Value getField(Position pos) const {
return storage().getField(pos).val;
}
@@ -535,12 +534,18 @@ public:
* Decide what level of support is needed for duplicate fields.
* If duplicates are not allowed, consider removing this method.
*/
- void addField(StringData fieldName, const Value& val) {
- storage().appendField(fieldName, ValueElement::Kind::kInserted) = val;
+ void addField(StringData name, const Value& val) {
+ storage().appendField(name, ValueElement::Kind::kInserted) = val;
+ }
+ void addField(HashedFieldName field, const Value& val) {
+ storage().appendField(field, ValueElement::Kind::kInserted) = val;
}
- void addField(StringData fieldName, Value&& val) {
- storage().appendField(fieldName, ValueElement::Kind::kInserted) = std::move(val);
+ void addField(StringData name, Value&& val) {
+ storage().appendField(name, ValueElement::Kind::kInserted) = std::move(val);
+ }
+ void addField(HashedFieldName field, Value&& val) {
+ storage().appendField(field, ValueElement::Kind::kInserted) = std::move(val);
}
/** Update field by key. If there is no field with that key, add one.
diff --git a/src/mongo/db/exec/document_value/document_internal.h b/src/mongo/db/exec/document_value/document_internal.h
index 9be057fb522..062b7a4da00 100644
--- a/src/mongo/db/exec/document_value/document_internal.h
+++ b/src/mongo/db/exec/document_value/document_internal.h
@@ -29,10 +29,9 @@
#pragma once
-#include <third_party/murmurhash3/MurmurHash3.h>
-
#include <bitset>
#include <boost/intrusive_ptr.hpp>
+#include <third_party/murmurhash3/MurmurHash3.h>
#include "mongo/base/static_assert.h"
#include "mongo/db/exec/document_value/document_metadata_fields.h"
@@ -275,6 +274,85 @@ private:
const ValueElement* _end;
};
+/**
+ * Type that bundles a field name's pre-computed hash along with the actual string so that
+ * hashing can be done outside of inserts and lookups and the result reused across calls.
+ */
+class HashedFieldName {
+public:
+ explicit HashedFieldName(StringData sd, std::size_t hash) : _sd(sd), _hash(hash) {}
+ explicit HashedFieldName(std::pair<StringData, std::size_t> pair)
+ : _sd(pair.first), _hash(pair.second) {}
+
+ StringData key() const {
+ return _sd;
+ }
+
+ std::size_t hash() const {
+ return _hash;
+ }
+
+ std::size_t size() const {
+ return _sd.size();
+ }
+
+ inline void copyTo(char* dest, bool includeEndingNull) const {
+ return _sd.copyTo(dest, includeEndingNull);
+ }
+
+ constexpr const char* rawData() const noexcept {
+ return _sd.rawData();
+ }
+
+private:
+ StringData _sd;
+ std::size_t _hash;
+};
+
+inline bool operator==(HashedFieldName lhs, StringData rhs) {
+ return lhs.key() == rhs;
+}
+
+inline bool operator==(StringData lhs, HashedFieldName rhs) {
+ return lhs == rhs.key();
+}
+
+inline bool operator==(HashedFieldName lhs, HashedFieldName rhs) {
+ return lhs.key() == rhs.key();
+}
+
+/**
+ * Hasher to support heterogeneous lookup for StringData and string-like elements.
+ */
+struct FieldNameHasher {
+ // This type alias enables heterogeneous lookup in the hash table
+ using is_transparent = void;
+
+ std::size_t operator()(StringData sd) const {
+ // TODO consider FNV-1a once we have a better benchmark corpus
+ // Keep in sync with 'hashName' in BSONColumn implementation.
+ unsigned out;
+ MurmurHash3_x86_32(sd.rawData(), sd.size(), 0, &out);
+ return out;
+ }
+
+ std::size_t operator()(const std::string& s) const {
+ return operator()(StringData(s));
+ }
+
+ std::size_t operator()(const char* s) const {
+ return operator()(StringData(s));
+ }
+
+ std::size_t operator()(HashedFieldName key) const {
+ return key.hash();
+ }
+
+ HashedFieldName hashedFieldName(StringData sd) {
+ return HashedFieldName(sd, operator()(sd));
+ }
+};
+
/// Storage class used by both Document and MutableDocument
class DocumentStorage : public RefCountable {
public:
@@ -334,13 +412,15 @@ public:
};
/// Returns the position of the named field or Position()
- Position findField(StringData name, LookupPolicy policy) const;
+ template <typename T>
+ Position findField(T field, LookupPolicy policy) const;
// Document uses these
const ValueElement& getField(Position pos) const {
verify(pos.found());
return *(_firstElement->plusBytes(pos.index));
}
+
Value getField(StringData name) const {
Position pos = findField(name, LookupPolicy::kCacheAndBSON);
if (!pos.found())
@@ -348,12 +428,20 @@ public:
return getField(pos).val;
}
+ Value getField(HashedFieldName field) const {
+ Position pos = findField(field, LookupPolicy::kCacheAndBSON);
+ if (!pos.found())
+ return Value();
+ return getField(pos).val;
+ }
+
// MutableDocument uses these
ValueElement& getField(Position pos) {
_modified = true;
verify(pos.found());
return *(_firstElement->plusBytes(pos.index));
}
+
Value& getField(StringData name, LookupPolicy policy) {
_modified = true;
Position pos = findField(name, policy);
@@ -383,7 +471,8 @@ public:
}
/// Adds a new field with missing Value at the end of the document
- Value& appendField(StringData name, ValueElement::Kind kind);
+ template <typename T>
+ Value& appendField(T field, ValueElement::Kind kind);
/** Preallocates space for fields. Use this to attempt to prevent buffer growth.
* This is only valid to call before anything is added to the document.
@@ -487,11 +576,9 @@ public:
_metadataFields = std::move(metadata);
}
- static unsigned hashKey(StringData name) {
- // TODO consider FNV-1a once we have a better benchmark corpus
- unsigned out;
- MurmurHash3_x86_32(name.rawData(), name.size(), 0, &out);
- return out;
+ template <typename T>
+ static unsigned hashKey(T name) {
+ return FieldNameHasher()(name);
}
const ValueElement* begin() const {
@@ -519,13 +606,15 @@ public:
private:
/// Returns the position of the named field in the cache or Position()
- Position findFieldInCache(StringData name) const;
+ template <typename T>
+ Position findFieldInCache(T name) const;
/// Allocates space in _cache. Copies existing data if there is any.
void alloc(unsigned newSize);
/// Call after adding field to _cache and increasing _numFields
- void addFieldToHashTable(Position pos);
+ template <typename T>
+ void addFieldToHashTable(T field, Position pos);
// assumes _hashTabMask is (power of two) - 1
unsigned hashTabBuckets() const {
@@ -545,15 +634,16 @@ private:
memset(static_cast<void*>(_hashTab), -1, hashTabBytes());
}
- unsigned bucketForKey(StringData name) const {
- return hashKey(name) & _hashTabMask;
+ template <typename T>
+ unsigned bucketForKey(T field) const {
+ return hashKey(field) & _hashTabMask;
}
/// Adds all fields to the hash table
void rehash() {
hashTabInit();
for (auto it = iteratorCacheOnly(); !it.atEnd(); it.advance())
- addFieldToHashTable(it.position());
+ addFieldToHashTable(getField(it.position()).nameSD(), it.position());
}
void loadLazyMetadata() const;
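
FieldNameHasher's is_transparent marker is what allows a single hash table to be probed with StringData, std::string, const char*, or an already-hashed key, and the HashedFieldName overload short-circuits straight to the stored hash instead of touching the bytes again. A standalone sketch of the same machinery built on C++20 heterogeneous lookup (names and std::hash are illustrative; DocumentStorage's own table is hand-rolled):

    #include <cstddef>
    #include <functional>
    #include <string>
    #include <string_view>
    #include <unordered_map>

    struct HashedKey {
        std::string_view key;
        std::size_t hash;  // computed once by the caller
    };

    struct TransparentHash {
        using is_transparent = void;  // enables heterogeneous find() in C++20
        std::size_t operator()(std::string_view sv) const {
            return std::hash<std::string_view>{}(sv);
        }
        std::size_t operator()(const HashedKey& k) const {
            return k.hash;  // short-circuit: reuse the pre-computed hash
        }
    };

    struct TransparentEq {
        using is_transparent = void;
        bool operator()(std::string_view a, std::string_view b) const {
            return a == b;
        }
        bool operator()(const HashedKey& a, std::string_view b) const {
            return a.key == b;
        }
        bool operator()(std::string_view a, const HashedKey& b) const {
            return a == b.key;
        }
    };

    int main() {
        std::unordered_map<std::string, int, TransparentHash, TransparentEq> m;
        m.emplace("time", 1);

        // Hash once, then probe repeatedly without re-hashing the string.
        HashedKey k{"time", TransparentHash{}("time")};
        for (int i = 0; i < 3; ++i)
            (void)m.find(k);
    }
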
diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
index 8e0999becaa..d49267b7858 100644
--- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
+++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
@@ -280,7 +280,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
uassert(5346504,
str::stream() << "timeField field must be a string, got: " << elem.type(),
elem.type() == BSONType::String);
- bucketSpec.timeField = elem.str();
+ bucketSpec.setTimeField(elem.str());
hasTimeField = true;
} else if (fieldName == timeseries::kMetaFieldName) {
uassert(5346505,
@@ -290,7 +290,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
uassert(5545700,
str::stream() << "metaField field must be a single-element field path",
metaField.find('.') == std::string::npos);
- bucketSpec.metaField = std::move(metaField);
+ bucketSpec.setMetaField(std::move(metaField));
} else if (fieldName == kBucketMaxSpanSeconds) {
uassert(5510600,
str::stream() << "bucketMaxSpanSeconds field must be an integer, got: "
@@ -357,7 +357,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
uassert(5612401,
str::stream() << "timeField field must be a string, got: " << elem.type(),
elem.type() == BSONType::String);
- bucketSpec.timeField = elem.str();
+ bucketSpec.setTimeField(elem.str());
hasTimeField = true;
} else if (fieldName == timeseries::kMetaFieldName) {
uassert(5612402,
@@ -367,7 +367,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
uassert(5612403,
str::stream() << "metaField field must be a single-element field path",
metaField.find('.') == std::string::npos);
- bucketSpec.metaField = std::move(metaField);
+ bucketSpec.setMetaField(std::move(metaField));
} else if (fieldName == kAssumeNoMixedSchemaData) {
uassert(6067203,
str::stream() << "assumeClean field must be a bool, got: " << elem.type(),
@@ -402,16 +402,16 @@ void DocumentSourceInternalUnpackBucket::serializeToArray(
if (((_bucketUnpacker.includeMetaField() &&
_bucketUnpacker.behavior() == BucketUnpacker::Behavior::kInclude) ||
(!_bucketUnpacker.includeMetaField() &&
- _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kExclude && spec.metaField)) &&
+ _bucketUnpacker.behavior() == BucketUnpacker::Behavior::kExclude && spec.metaField())) &&
std::find(spec.computedMetaProjFields.cbegin(),
spec.computedMetaProjFields.cend(),
- *spec.metaField) == spec.computedMetaProjFields.cend())
- fields.emplace_back(*spec.metaField);
+ *spec.metaField()) == spec.computedMetaProjFields.cend())
+ fields.emplace_back(*spec.metaField());
out.addField(behavior, Value{std::move(fields)});
- out.addField(timeseries::kTimeFieldName, Value{spec.timeField});
- if (spec.metaField) {
- out.addField(timeseries::kMetaFieldName, Value{*spec.metaField});
+ out.addField(timeseries::kTimeFieldName, Value{spec.timeField()});
+ if (spec.metaField()) {
+ out.addField(timeseries::kMetaFieldName, Value{*spec.metaField()});
}
out.addField(kBucketMaxSpanSeconds, Value{_bucketMaxSpanSeconds});
if (_assumeNoMixedSchemaData)
@@ -472,7 +472,7 @@ bool DocumentSourceInternalUnpackBucket::pushDownComputedMetaProjection(
if (std::next(itr) == container->end()) {
return nextStageWasRemoved;
}
- if (!_bucketUnpacker.bucketSpec().metaField) {
+ if (!_bucketUnpacker.bucketSpec().metaField()) {
return nextStageWasRemoved;
}
@@ -482,7 +482,7 @@ bool DocumentSourceInternalUnpackBucket::pushDownComputedMetaProjection(
(nextTransform->getType() == TransformerInterface::TransformerType::kInclusionProjection ||
nextTransform->getType() == TransformerInterface::TransformerType::kComputedProjection)) {
- auto& metaName = _bucketUnpacker.bucketSpec().metaField.get();
+ auto& metaName = _bucketUnpacker.bucketSpec().metaField().get();
auto [addFieldsSpec, deleteStage] =
nextTransform->extractComputedProjections(metaName,
timeseries::kBucketMetaFieldName.toString(),
@@ -655,9 +655,9 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
}
// We must avoid mapping predicates on the meta field onto the control field.
- if (bucketSpec.metaField &&
- (matchExprPath == bucketSpec.metaField.get() ||
- expression::isPathPrefixOf(bucketSpec.metaField.get(), matchExprPath)))
+ if (bucketSpec.metaField() &&
+ (matchExprPath == bucketSpec.metaField().get() ||
+ expression::isPathPrefixOf(bucketSpec.metaField().get(), matchExprPath)))
return nullptr;
// We must avoid mapping predicates on fields computed via $addFields or a computed $project.
@@ -665,7 +665,7 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
return nullptr;
}
- const auto isTimeField = (matchExprPath == bucketSpec.timeField);
+ const auto isTimeField = (matchExprPath == bucketSpec.timeField());
if (isTimeField && matchExprData.type() != BSONType::Date) {
// Users are not allowed to insert non-date measurements into time field. So this query
// would not match anything. We do not need to optimize for this case.
@@ -874,7 +874,7 @@ DocumentSourceInternalUnpackBucket::createPredicatesOnBucketLevelField(
std::pair<boost::intrusive_ptr<DocumentSourceMatch>, boost::intrusive_ptr<DocumentSourceMatch>>
DocumentSourceInternalUnpackBucket::splitMatchOnMetaAndRename(
boost::intrusive_ptr<DocumentSourceMatch> match) {
- if (auto&& metaField = _bucketUnpacker.bucketSpec().metaField) {
+ if (auto&& metaField = _bucketUnpacker.bucketSpec().metaField()) {
return std::move(*match).extractMatchOnFieldsAndRemainder(
{*metaField}, {{*metaField, timeseries::kBucketMetaFieldName.toString()}});
}
@@ -884,10 +884,10 @@ DocumentSourceInternalUnpackBucket::splitMatchOnMetaAndRename(
std::pair<BSONObj, bool> DocumentSourceInternalUnpackBucket::extractProjectForPushDown(
DocumentSource* src) const {
if (auto nextProject = dynamic_cast<DocumentSourceSingleDocumentTransformation*>(src);
- _bucketUnpacker.bucketSpec().metaField && nextProject &&
+ _bucketUnpacker.bucketSpec().metaField() && nextProject &&
nextProject->getType() == TransformerInterface::TransformerType::kExclusionProjection) {
return nextProject->extractProjectOnFieldAndRename(
- _bucketUnpacker.bucketSpec().metaField.get(), timeseries::kBucketMetaFieldName);
+ _bucketUnpacker.bucketSpec().metaField().get(), timeseries::kBucketMetaFieldName);
}
return {BSONObj{}, false};
@@ -902,7 +902,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain
}
const auto& idFields = groupPtr->getIdFields();
- if (idFields.size() != 1 || !_bucketUnpacker.bucketSpec().metaField.has_value()) {
+ if (idFields.size() != 1 || !_bucketUnpacker.bucketSpec().metaField().has_value()) {
return {};
}
@@ -914,7 +914,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain
const auto& idPath = exprIdPath->getFieldPath();
if (idPath.getPathLength() < 2 ||
- idPath.getFieldName(1) != _bucketUnpacker.bucketSpec().metaField.get()) {
+ idPath.getFieldName(1) != _bucketUnpacker.bucketSpec().metaField().get()) {
return {};
}
@@ -935,7 +935,7 @@ DocumentSourceInternalUnpackBucket::rewriteGroupByMinMax(Pipeline::SourceContain
if (const auto* exprArgPath = dynamic_cast<const ExpressionFieldPath*>(exprArg)) {
const auto& path = exprArgPath->getFieldPath();
if (path.getPathLength() <= 1 ||
- path.getFieldName(1) == _bucketUnpacker.bucketSpec().timeField) {
+ path.getFieldName(1) == _bucketUnpacker.bucketSpec().timeField()) {
// Rewrite not valid for time field. We want to eliminate the bucket
// unpack stage here.
suitable = false;
@@ -1007,12 +1007,13 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
// Some optimizations may not be safe to do if we have computed the metaField via an $addFields
// or a computed $project. We won't do those optimizations if 'haveComputedMetaField' is true.
- bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField &&
- fieldIsComputed(_bucketUnpacker.bucketSpec(), _bucketUnpacker.bucketSpec().metaField.get());
+ bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField() &&
+ fieldIsComputed(_bucketUnpacker.bucketSpec(),
+ _bucketUnpacker.bucketSpec().metaField().get());
// Before any other rewrites for the current stage, consider reordering with $sort.
if (auto sortPtr = dynamic_cast<DocumentSourceSort*>(std::next(itr)->get())) {
- if (auto metaField = _bucketUnpacker.bucketSpec().metaField;
+ if (auto metaField = _bucketUnpacker.bucketSpec().metaField();
metaField && !haveComputedMetaField) {
if (checkMetadataSortReorder(sortPtr->getSortKeyPattern(), metaField.get())) {
// We have a sort on metadata field following this stage. Reorder the two stages
@@ -1056,7 +1057,7 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
"Must not specify 'query' for $geoNear on a time-series collection; use $match instead",
nextNear->getQuery().binaryEqual(BSONObj()));
- auto metaField = _bucketUnpacker.bucketSpec().metaField;
+ auto metaField = _bucketUnpacker.bucketSpec().metaField();
if (metaField && *metaField == keyField->front()) {
// Make sure we actually re-write the key field for the buckets collection so we can
// locate the index.
@@ -1111,8 +1112,8 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
auto deps = Pipeline::getDependenciesForContainer(
pExpCtx, Pipeline::SourceContainer{std::next(itr), container->end()}, boost::none);
if (deps.hasNoRequirements()) {
- _bucketUnpacker.setBucketSpecAndBehavior({_bucketUnpacker.bucketSpec().timeField,
- _bucketUnpacker.bucketSpec().metaField,
+ _bucketUnpacker.setBucketSpecAndBehavior({_bucketUnpacker.bucketSpec().timeField(),
+ _bucketUnpacker.bucketSpec().metaField(),
{}},
BucketUnpacker::Behavior::kInclude);
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index 5e24100be98..d7afc3680bf 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -2426,10 +2426,10 @@ Value ExpressionFieldPath::evaluatePath(size_t index, const Document& input) con
/* if we've hit the end of the path, stop */
if (index == _fieldPath.getPathLength() - 1)
- return input[_fieldPath.getFieldName(index)];
+ return input[_fieldPath.getFieldNameHashed(index)];
// Try to dive deeper
- const Value val = input[_fieldPath.getFieldName(index)];
+ const Value val = input[_fieldPath.getFieldNameHashed(index)];
switch (val.getType()) {
case Object:
return evaluatePath(index + 1, val.getDocument());
diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp
index 208f60a515b..ab00617bbd3 100644
--- a/src/mongo/db/pipeline/field_path.cpp
+++ b/src/mongo/db/pipeline/field_path.cpp
@@ -70,7 +70,9 @@ string FieldPath::getFullyQualifiedPath(StringData prefix, StringData suffix) {
}
FieldPath::FieldPath(std::string inputPath)
- : _fieldPath(std::move(inputPath)), _fieldPathDotPosition{string::npos} {
+ : _fieldPath(std::move(inputPath)),
+ _fieldPathDotPosition{string::npos},
+ _fieldHash{kHashUninitialized} {
uassert(40352, "FieldPath cannot be constructed with empty string", !_fieldPath.empty());
uassert(40353, "FieldPath must not end with a '.'.", _fieldPath[_fieldPath.size() - 1] != '.');
@@ -79,6 +81,7 @@ FieldPath::FieldPath(std::string inputPath)
size_t startPos = 0;
while (string::npos != (dotPos = _fieldPath.find('.', startPos))) {
_fieldPathDotPosition.push_back(dotPos);
+ _fieldHash.push_back(kHashUninitialized);
startPos = dotPos + 1;
}
@@ -139,19 +142,26 @@ FieldPath FieldPath::concat(const FieldPath& tail) const {
head._fieldPathDotPosition.size() + tail._fieldPathDotPosition.size() - 2 + 1;
newDots.reserve(expectedDotSize);
+ std::vector<size_t> newHashes;
+ // Unlike the dot positions, the hash vector carries no leading sentinel entry,
+ // so it holds one fewer element.
+ newHashes.reserve(expectedDotSize - 1);
+
// The first one in head._fieldPathDotPosition is npos. The last one, is, conveniently, the
// size of head fieldPath, which also happens to be the index at which we added a new dot.
newDots.insert(
newDots.begin(), head._fieldPathDotPosition.begin(), head._fieldPathDotPosition.end());
+ newHashes.insert(newHashes.begin(), head._fieldHash.begin(), head._fieldHash.end());
invariant(tail._fieldPathDotPosition.size() >= 2);
for (size_t i = 1; i < tail._fieldPathDotPosition.size(); ++i) {
// Move each index back by size of the first field path, plus one, for the newly added dot.
newDots.push_back(tail._fieldPathDotPosition[i] + head._fieldPath.size() + 1);
+ newHashes.push_back(tail._fieldHash[i - 1]);
}
invariant(newDots.back() == concat.size());
invariant(newDots.size() == expectedDotSize);
+ invariant(newHashes.size() == expectedDotSize - 1);
- return FieldPath(std::move(concat), std::move(newDots));
+ return FieldPath(std::move(concat), std::move(newDots), std::move(newHashes));
}
} // namespace mongo
diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h
index 994e23554b3..d2ee93734e7 100644
--- a/src/mongo/db/pipeline/field_path.h
+++ b/src/mongo/db/pipeline/field_path.h
@@ -29,11 +29,13 @@
#pragma once
+#include <limits>
#include <string>
#include <vector>
#include "mongo/base/string_data.h"
#include "mongo/bson/bson_depth.h"
+#include "mongo/db/exec/document_value/document_internal.h"
#include "mongo/util/assert_util.h"
namespace mongo {
@@ -111,6 +113,20 @@ public:
}
/**
+ * Return the ith field name from this path using zero-based indexes, with pre-computed hash.
+ */
+ HashedFieldName getFieldNameHashed(size_t i) const {
+ dassert(i < getPathLength());
+ const auto begin = _fieldPathDotPosition[i] + 1;
+ const auto end = _fieldPathDotPosition[i + 1];
+ StringData fieldName{&_fieldPath[begin], end - begin};
+ if (_fieldHash[i] == kHashUninitialized) {
+ _fieldHash[i] = FieldNameHasher()(fieldName);
+ }
+ return HashedFieldName{fieldName, _fieldHash[i]};
+ }
+
+ /**
* Returns the full path, not including the prefix 'FieldPath::prefix'.
*/
const std::string& fullPath() const {
@@ -142,8 +158,10 @@ public:
FieldPath concat(const FieldPath& tail) const;
private:
- FieldPath(std::string string, std::vector<size_t> dots)
- : _fieldPath(std::move(string)), _fieldPathDotPosition(std::move(dots)) {
+ FieldPath(std::string string, std::vector<size_t> dots, std::vector<size_t> hashes)
+ : _fieldPath(std::move(string)),
+ _fieldPathDotPosition(std::move(dots)),
+ _fieldHash(std::move(hashes)) {
uassert(ErrorCodes::Overflow,
"FieldPath is too long",
_fieldPathDotPosition.size() <= BSONDepth::getMaxAllowableDepth());
@@ -158,6 +176,11 @@ private:
// string::npos (which evaluates to -1) and the last contains _fieldPath.size() to facilitate
// lookup.
std::vector<size_t> _fieldPathDotPosition;
+
+ // Contains the cached hash for each field in the path. Each entry starts as
+ // 'kHashUninitialized' and is only computed when first retrieved via
+ // 'getFieldNameHashed'.
+ mutable std::vector<size_t> _fieldHash;
+ static constexpr std::size_t kHashUninitialized = std::numeric_limits<std::size_t>::max();
};
inline bool operator<(const FieldPath& lhs, const FieldPath& rhs) {
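
FieldPath::getFieldNameHashed computes each component's hash on first use and memoizes it in the mutable _fieldHash vector, using numeric_limits<size_t>::max() as the "not yet computed" sentinel; a name whose real hash happens to equal the sentinel is simply recomputed on every call, which costs time but stays correct. A simplified sketch of the lazy, sentinel-based cache (names are illustrative; std::hash stands in for FieldNameHasher):

    #include <cstddef>
    #include <functional>
    #include <limits>
    #include <string>
    #include <vector>

    class PathHashCache {
    public:
        explicit PathHashCache(std::vector<std::string> parts)
            : _parts(std::move(parts)), _hashes(_parts.size(), kUninit) {}

        std::size_t hashOf(std::size_t i) const {
            if (_hashes[i] == kUninit) {
                // First access: compute once and memoize.
                _hashes[i] = std::hash<std::string>{}(_parts[i]);
            }
            return _hashes[i];  // later calls reuse the cached value
        }

    private:
        static constexpr std::size_t kUninit =
            std::numeric_limits<std::size_t>::max();
        std::vector<std::string> _parts;
        // mutable so a logically-const accessor can fill the cache.
        mutable std::vector<std::size_t> _hashes;
    };
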