author     David Percy <david.percy@mongodb.com>             2020-10-21 18:40:58 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-11-10 18:32:21 +0000
commit     8011b6129fc08a7dcbd675da737e63a22f1ef362 (patch)
tree       d427da31a6220426cfa58b7cbc8452bbb0464d34
parent     145a48bf2fdddbc44fdea4ab9c0a5475ed1bb1b7 (diff)
download   mongo-8011b6129fc08a7dcbd675da737e63a22f1ef362.tar.gz
SERVER-49024 Disallow $lookup caching of stages containing $rand, $sample
-rw-r--r--  jstests/aggregation/sources/lookup/lookup_random.js                   41
-rw-r--r--  src/mongo/db/pipeline/dependencies.h                                   11
-rw-r--r--  src/mongo/db/pipeline/document_source_sample.h                          1
-rw-r--r--  src/mongo/db/pipeline/document_source_sequential_document_cache.cpp     6
-rw-r--r--  src/mongo/db/pipeline/expression.cpp                                    2
-rw-r--r--  src/mongo/db/pipeline/pipeline.cpp                                     31
6 files changed, 69 insertions, 23 deletions
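
Context for the diff below: when a $lookup sub-pipeline has a prefix that does not reference variables bound by the outer document, that non-correlated prefix can be cached by DocumentSourceSequentialDocumentCache and reused for every outer document. Before this change, $sample and $rand could be swept into that cached prefix, so every outer document saw the same "random" result. The commit adds a needRandomGenerator dependency flag so that such stages terminate the cacheable prefix. A minimal mongo shell sketch of the resulting behavior (collection and field names are illustrative, not taken from this commit):

    // Deterministic, non-correlated prefix: may still be cached and reused
    // across outer documents by the sequential document cache.
    db.orders.aggregate([
        {$lookup: {
            from: "inventory",
            as: "inStock",
            pipeline: [
                {$match: {qty: {$gt: 0}}},
                {$sort: {qty: -1}},
            ],
        }},
    ]);

    // $sample (or any expression using $rand) in the inner pipeline must not be
    // cached: each outer document gets a fresh random draw, as the new jstest
    // below asserts.
    db.orders.aggregate([
        {$lookup: {
            from: "inventory",
            as: "randomPick",
            pipeline: [
                {$sample: {size: 1}},
            ],
        }},
    ]);
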
diff --git a/jstests/aggregation/sources/lookup/lookup_random.js b/jstests/aggregation/sources/lookup/lookup_random.js
new file mode 100644
index 00000000000..3558e1d6061
--- /dev/null
+++ b/jstests/aggregation/sources/lookup/lookup_random.js
@@ -0,0 +1,41 @@
+// Operators that use a random generator ($sample and $rand) are not allowed to be cached
+// as part of a non-correlated prefix.
+// @tags: [assumes_unsharded_collection]
+(function() {
+"use strict";
+
+const coll = db.getCollection('lookup_random');
+coll.drop();
+assert.commandWorked(coll.insert(Array.from({length: 200}, (_, i) => ({_id: i}))));
+
+// $sample in the inner pipeline should be rerun per outer document.
+let result = coll.aggregate([
+ {$lookup: {
+ from: coll.getName(),
+ as: 'docs',
+ pipeline: [
+ {$sample: {size: 1}},
+ ],
+ }},
+ {$unwind: "$docs"},
+ {$group: {_id: null, sampled: {$addToSet: "$docs._id"}}},
+]).toArray();
+assert.eq(result.length, 1, result);
+assert.gt(result[0].sampled.length, 1, result);
+
+// $rand in the inner pipeline should be rerun per outer document.
+result = coll.aggregate([
+ {$lookup: {
+ from: coll.getName(),
+ as: 'docs',
+ pipeline: [
+ {$limit: 1},
+ {$set: {r: {$rand: {}}}},
+ ],
+ }},
+ {$unwind: "$docs"},
+ {$group: {_id: null, randomValues: {$addToSet: "$docs.r"}}},
+]).toArray();
+assert.eq(result.length, 1, result);
+assert.gt(result[0].randomValues.length, 1, result);
+})();
diff --git a/src/mongo/db/pipeline/dependencies.h b/src/mongo/db/pipeline/dependencies.h
index 8b75439db53..96d846cc216 100644
--- a/src/mongo/db/pipeline/dependencies.h
+++ b/src/mongo/db/pipeline/dependencies.h
@@ -107,6 +107,12 @@ struct DepsTracker {
*/
BSONObj toProjectionWithoutMetadata() const;
+ /**
+ * Returns 'true' if there is no dependency on the input documents or metadata.
+ *
+ * Note: this method does not say anything about dependencies on variables, or on a random
+ * generator.
+ */
bool hasNoRequirements() const {
return fields.empty() && !needWholeDocument && !_metadataDeps.any();
}
@@ -177,6 +183,11 @@ struct DepsTracker {
std::set<Variables::Id> vars; // IDs of referenced variables.
bool needWholeDocument = false; // If true, ignore 'fields'; the whole document is needed.
+ // The output of some operators (such as $sample and $rand) depends on a source of fresh random
+ // numbers. During execution this dependency is implicit, but during optimize() we need to know
+ // about this dependency to decide whether it's ok to cache or reevaluate an operator.
+ bool needRandomGenerator = false;
+
private:
// Represents all metadata not available to the pipeline.
QueryMetadataBitSet _unavailableMetadata;
diff --git a/src/mongo/db/pipeline/document_source_sample.h b/src/mongo/db/pipeline/document_source_sample.h
index 084b4385fc6..7dd31db3c41 100644
--- a/src/mongo/db/pipeline/document_source_sample.h
+++ b/src/mongo/db/pipeline/document_source_sample.h
@@ -55,6 +55,7 @@ public:
}
DepsTracker::State getDependencies(DepsTracker* deps) const final {
+ deps->needRandomGenerator = true;
return DepsTracker::State::SEE_NEXT;
}
diff --git a/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp b/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp
index a23563adca5..dd115615d08 100644
--- a/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp
+++ b/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp
@@ -117,11 +117,13 @@ Pipeline::SourceContainer::iterator DocumentSourceSequentialDocumentCache::doOpt
// assertions.
DepsTracker deps(DepsTracker::kNoMetadata);
- // Iterate through the pipeline stages until we find one which references an external variable.
+ // Iterate through the pipeline stages until we find one which cannot be cached.
+ // A stage cannot be cached if it either: 1. depends on a variable defined in this scope, or
+ // 2. generates random numbers.
for (; prefixSplit != container->end(); ++prefixSplit) {
(*prefixSplit)->getDependencies(&deps);
- if (deps.hasVariableReferenceTo(varIDs)) {
+ if (deps.hasVariableReferenceTo(varIDs) || deps.needRandomGenerator) {
break;
}
}
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index 3367d9d3549..0cc4961c590 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -6463,7 +6463,7 @@ intrusive_ptr<Expression> ExpressionRandom::optimize() {
}
void ExpressionRandom::_doAddDependencies(DepsTracker* deps) const {
- // Nothing to do.
+ deps->needRandomGenerator = true;
}
Value ExpressionRandom::serialize(const bool explain) const {
diff --git a/src/mongo/db/pipeline/pipeline.cpp b/src/mongo/db/pipeline/pipeline.cpp
index c53b5358d07..15dcebae4f4 100644
--- a/src/mongo/db/pipeline/pipeline.cpp
+++ b/src/mongo/db/pipeline/pipeline.cpp
@@ -486,8 +486,7 @@ void Pipeline::addFinalSource(intrusive_ptr<DocumentSource> source) {
DepsTracker Pipeline::getDependencies(QueryMetadataBitSet unavailableMetadata) const {
DepsTracker deps(unavailableMetadata);
- const bool scopeHasVariables = pCtx->variablesParseState.hasDefinedVariables();
- bool skipFieldsAndMetadataDeps = false;
+ bool hasUnsupportedStage = false;
bool knowAllFields = false;
bool knowAllMeta = false;
for (auto&& source : _sources) {
@@ -495,36 +494,28 @@ DepsTracker Pipeline::getDependencies(QueryMetadataBitSet unavailableMetadata) c
DepsTracker::State status = source->getDependencies(&localDeps);
deps.vars.insert(localDeps.vars.begin(), localDeps.vars.end());
+ deps.needRandomGenerator |= localDeps.needRandomGenerator;
- if ((skipFieldsAndMetadataDeps |= (status == DepsTracker::State::NOT_SUPPORTED))) {
- // Assume this stage needs everything. We may still know something about our
- // dependencies if an earlier stage returned EXHAUSTIVE_FIELDS or EXHAUSTIVE_META. If
- // this scope has variables, we need to keep enumerating the remaining stages but will
- // skip adding any further field or metadata dependencies.
- if (scopeHasVariables) {
- continue;
- } else {
- break;
- }
+ if (status == DepsTracker::State::NOT_SUPPORTED) {
+ // We don't know anything about this stage, so we have to assume it depends on
+ // everything. We may still know something about our dependencies if an earlier stage
+ // returned EXHAUSTIVE_FIELDS or EXHAUSTIVE_META.
+ hasUnsupportedStage = true;
}
- if (!knowAllFields) {
+ // If we ever saw an unsupported stage, don't bother continuing to track field and metadata
+ // deps: we already have to assume the pipeline depends on everything.
+ if (!hasUnsupportedStage && !knowAllFields) {
deps.fields.insert(localDeps.fields.begin(), localDeps.fields.end());
if (localDeps.needWholeDocument)
deps.needWholeDocument = true;
knowAllFields = status & DepsTracker::State::EXHAUSTIVE_FIELDS;
}
- if (!knowAllMeta) {
+ if (!hasUnsupportedStage && !knowAllMeta) {
deps.requestMetadata(localDeps.metadataDeps());
knowAllMeta = status & DepsTracker::State::EXHAUSTIVE_META;
}
-
- // If there are variables defined at this pipeline's scope, there may be dependencies upon
- // them in subsequent stages. Keep enumerating.
- if (knowAllMeta && knowAllFields && !scopeHasVariables) {
- break;
- }
}
if (!knowAllFields)