diff options
author | David Percy <david.percy@mongodb.com> | 2020-10-21 18:40:58 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-11-10 18:32:21 +0000 |
commit | 8011b6129fc08a7dcbd675da737e63a22f1ef362 (patch) | |
tree | d427da31a6220426cfa58b7cbc8452bbb0464d34 | |
parent | 145a48bf2fdddbc44fdea4ab9c0a5475ed1bb1b7 (diff) | |
download | mongo-8011b6129fc08a7dcbd675da737e63a22f1ef362.tar.gz |
SERVER-49024 Disallow $lookup caching of stages containing $rand, $sample
-rw-r--r-- | jstests/aggregation/sources/lookup/lookup_random.js | 41 | ||||
-rw-r--r-- | src/mongo/db/pipeline/dependencies.h | 11 | ||||
-rw-r--r-- | src/mongo/db/pipeline/document_source_sample.h | 1 | ||||
-rw-r--r-- | src/mongo/db/pipeline/document_source_sequential_document_cache.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/pipeline/pipeline.cpp | 31 |
6 files changed, 69 insertions, 23 deletions
diff --git a/jstests/aggregation/sources/lookup/lookup_random.js b/jstests/aggregation/sources/lookup/lookup_random.js new file mode 100644 index 00000000000..3558e1d6061 --- /dev/null +++ b/jstests/aggregation/sources/lookup/lookup_random.js @@ -0,0 +1,41 @@ +// Operators that use a random generator ($sample and $rand) are not allowed to be cached +// as part of a non-correlated prefix. +// @tags: [assumes_unsharded_collection] +(function() { +"use strict"; + +const coll = db.getCollection('lookup_random'); +coll.drop(); +assert.commandWorked(coll.insert(Array.from({length: 200}, (_, i) => ({_id: i})))); + +// $sample in the inner pipeline should be rerun per outer document. +let result = coll.aggregate([ + {$lookup: { + from: coll.getName(), + as: 'docs', + pipeline: [ + {$sample: {size: 1}}, + ], + }}, + {$unwind: "$docs"}, + {$group: {_id: null, sampled: {$addToSet: "$docs._id"}}}, +]).toArray(); +assert.eq(result.length, 1, result); +assert.gt(result[0].sampled.length, 1, result); + +// $rand in the inner pipeline should be rerun per outer document. +result = coll.aggregate([ + {$lookup: { + from: coll.getName(), + as: 'docs', + pipeline: [ + {$limit: 1}, + {$set: {r: {$rand: {}}}}, + ], + }}, + {$unwind: "$docs"}, + {$group: {_id: null, randomValues: {$addToSet: "$docs.r"}}}, +]).toArray(); +assert.eq(result.length, 1, result); +assert.gt(result[0].randomValues.length, 1, result); +})(); diff --git a/src/mongo/db/pipeline/dependencies.h b/src/mongo/db/pipeline/dependencies.h index 8b75439db53..96d846cc216 100644 --- a/src/mongo/db/pipeline/dependencies.h +++ b/src/mongo/db/pipeline/dependencies.h @@ -107,6 +107,12 @@ struct DepsTracker { */ BSONObj toProjectionWithoutMetadata() const; + /** + * Returns 'true' if there is no dependency on the input documents or metadata. + * + * Note: this method does not say anything about dependencies on variables, or on a random + * generator. + */ bool hasNoRequirements() const { return fields.empty() && !needWholeDocument && !_metadataDeps.any(); } @@ -177,6 +183,11 @@ struct DepsTracker { std::set<Variables::Id> vars; // IDs of referenced variables. bool needWholeDocument = false; // If true, ignore 'fields'; the whole document is needed. + // The output of some operators (such as $sample and $rand) depends on a source of fresh random + // numbers. During execution this dependency is implicit, but during optimize() we need to know + // about this dependency to decide whether it's ok to cache or reevaluate an operator. + bool needRandomGenerator = false; + private: // Represents all metadata not available to the pipeline. QueryMetadataBitSet _unavailableMetadata; diff --git a/src/mongo/db/pipeline/document_source_sample.h b/src/mongo/db/pipeline/document_source_sample.h index 084b4385fc6..7dd31db3c41 100644 --- a/src/mongo/db/pipeline/document_source_sample.h +++ b/src/mongo/db/pipeline/document_source_sample.h @@ -55,6 +55,7 @@ public: } DepsTracker::State getDependencies(DepsTracker* deps) const final { + deps->needRandomGenerator = true; return DepsTracker::State::SEE_NEXT; } diff --git a/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp b/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp index a23563adca5..dd115615d08 100644 --- a/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp +++ b/src/mongo/db/pipeline/document_source_sequential_document_cache.cpp @@ -117,11 +117,13 @@ Pipeline::SourceContainer::iterator DocumentSourceSequentialDocumentCache::doOpt // assertions. DepsTracker deps(DepsTracker::kNoMetadata); - // Iterate through the pipeline stages until we find one which references an external variable. + // Iterate through the pipeline stages until we find one which cannot be cached. + // A stage cannot be cached if it either: 1. depends on a variable defined in this scope, or + // 2. generates random numbers. for (; prefixSplit != container->end(); ++prefixSplit) { (*prefixSplit)->getDependencies(&deps); - if (deps.hasVariableReferenceTo(varIDs)) { + if (deps.hasVariableReferenceTo(varIDs) || deps.needRandomGenerator) { break; } } diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index 3367d9d3549..0cc4961c590 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -6463,7 +6463,7 @@ intrusive_ptr<Expression> ExpressionRandom::optimize() { } void ExpressionRandom::_doAddDependencies(DepsTracker* deps) const { - // Nothing to do. + deps->needRandomGenerator = true; } Value ExpressionRandom::serialize(const bool explain) const { diff --git a/src/mongo/db/pipeline/pipeline.cpp b/src/mongo/db/pipeline/pipeline.cpp index c53b5358d07..15dcebae4f4 100644 --- a/src/mongo/db/pipeline/pipeline.cpp +++ b/src/mongo/db/pipeline/pipeline.cpp @@ -486,8 +486,7 @@ void Pipeline::addFinalSource(intrusive_ptr<DocumentSource> source) { DepsTracker Pipeline::getDependencies(QueryMetadataBitSet unavailableMetadata) const { DepsTracker deps(unavailableMetadata); - const bool scopeHasVariables = pCtx->variablesParseState.hasDefinedVariables(); - bool skipFieldsAndMetadataDeps = false; + bool hasUnsupportedStage = false; bool knowAllFields = false; bool knowAllMeta = false; for (auto&& source : _sources) { @@ -495,36 +494,28 @@ DepsTracker Pipeline::getDependencies(QueryMetadataBitSet unavailableMetadata) c DepsTracker::State status = source->getDependencies(&localDeps); deps.vars.insert(localDeps.vars.begin(), localDeps.vars.end()); + deps.needRandomGenerator |= localDeps.needRandomGenerator; - if ((skipFieldsAndMetadataDeps |= (status == DepsTracker::State::NOT_SUPPORTED))) { - // Assume this stage needs everything. We may still know something about our - // dependencies if an earlier stage returned EXHAUSTIVE_FIELDS or EXHAUSTIVE_META. If - // this scope has variables, we need to keep enumerating the remaining stages but will - // skip adding any further field or metadata dependencies. - if (scopeHasVariables) { - continue; - } else { - break; - } + if (status == DepsTracker::State::NOT_SUPPORTED) { + // We don't know anything about this stage, so we have to assume it depends on + // everything. We may still know something about our dependencies if an earlier stage + // returned EXHAUSTIVE_FIELDS or EXHAUSTIVE_META. + hasUnsupportedStage = true; } - if (!knowAllFields) { + // If we ever saw an unsupported stage, don't bother continuing to track field and metadata + // deps: we already have to assume the pipeline depends on everything. + if (!hasUnsupportedStage && !knowAllFields) { deps.fields.insert(localDeps.fields.begin(), localDeps.fields.end()); if (localDeps.needWholeDocument) deps.needWholeDocument = true; knowAllFields = status & DepsTracker::State::EXHAUSTIVE_FIELDS; } - if (!knowAllMeta) { + if (!hasUnsupportedStage && !knowAllMeta) { deps.requestMetadata(localDeps.metadataDeps()); knowAllMeta = status & DepsTracker::State::EXHAUSTIVE_META; } - - // If there are variables defined at this pipeline's scope, there may be dependencies upon - // them in subsequent stages. Keep enumerating. - if (knowAllMeta && knowAllFields && !scopeHasVariables) { - break; - } } if (!knowAllFields) |