diff options
27 files changed, 1520 insertions, 1333 deletions
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml index 81a68edf47b..61d8bfde017 100644 --- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml +++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml @@ -49,6 +49,7 @@ selector: - jstests/sharding/track_unsharded_collections_rename_collection.js - jstests/sharding/banned_txn_databases_sharded.js - jstests/sharding/split_large_key.js + - jstests/sharding/change_streams_establishment_finds_new_shards.js # Enable if SERVER-41813 is backported or 4.4 becomes last-stable - jstests/sharding/invalid_system_views_sharded_collection.js diff --git a/buildscripts/resmokeconfig/suites/sharding_map_reduce_agg.yaml b/buildscripts/resmokeconfig/suites/sharding_map_reduce_agg.yaml index f126871e84d..4de6fde506a 100644 --- a/buildscripts/resmokeconfig/suites/sharding_map_reduce_agg.yaml +++ b/buildscripts/resmokeconfig/suites/sharding_map_reduce_agg.yaml @@ -4,8 +4,16 @@ test_kind: js_test selector: roots: - # Placeholder test to avoid failing in resmoke. - - jstests/sharding/accurate_count_with_predicate.js + - jstests/sharding/auth.js + - jstests/sharding/authCommands.js + - jstests/sharding/authmr.js + - jstests/sharding/causal_consistency_shell_support.js + - jstests/sharding/localhostAuthBypass.js + - jstests/sharding/max_time_ms_sharded.js + - jstests/sharding/mr_and_agg_versioning.js + - jstests/sharding/mr_shard_version.js + - jstests/sharding/query_config.js + - jstests/sharding/shard_targeting.js executor: config: diff --git a/jstests/aggregation/sharded_agg_cleanup_on_error.js b/jstests/aggregation/sharded_agg_cleanup_on_error.js index 5fba3e477a9..fb646819d06 100644 --- a/jstests/aggregation/sharded_agg_cleanup_on_error.js +++ b/jstests/aggregation/sharded_agg_cleanup_on_error.js @@ -88,7 +88,7 @@ try { try { // Enable the failpoint to fail on establishing a merging shard cursor. assert.commandWorked(mongosDB.adminCommand({ - configureFailPoint: "clusterAggregateFailToEstablishMergingShardCursor", + configureFailPoint: "shardedAggregateFailToEstablishMergingShardCursor", mode: "alwaysOn" })); @@ -103,13 +103,13 @@ try { } finally { assert.commandWorked(mongosDB.adminCommand( - {configureFailPoint: "clusterAggregateFailToEstablishMergingShardCursor", mode: "off"})); + {configureFailPoint: "shardedAggregateFailToEstablishMergingShardCursor", mode: "off"})); } // Test that aggregations involving $exchange correctly clean up the producer cursors. try { assert.commandWorked(mongosDB.adminCommand({ - configureFailPoint: "clusterAggregateFailToDispatchExchangeConsumerPipeline", + configureFailPoint: "shardedAggregateFailToDispatchExchangeConsumerPipeline", mode: "alwaysOn" })); @@ -133,7 +133,7 @@ try { } finally { assert.commandWorked(mongosDB.adminCommand({ - configureFailPoint: "clusterAggregateFailToDispatchExchangeConsumerPipeline", + configureFailPoint: "shardedAggregateFailToDispatchExchangeConsumerPipeline", mode: "off" })); } diff --git a/jstests/sharding/change_streams_establishment_finds_new_shards.js b/jstests/sharding/change_streams_establishment_finds_new_shards.js index 8f2393f99ee..6a232856bb7 100644 --- a/jstests/sharding/change_streams_establishment_finds_new_shards.js +++ b/jstests/sharding/change_streams_establishment_finds_new_shards.js @@ -40,13 +40,13 @@ assert.commandWorked(mongos.adminCommand({split: mongosColl.getFullName(), middl // Enable the failpoint. assert.commandWorked(mongos.adminCommand( - {configureFailPoint: "clusterAggregateHangBeforeEstablishingShardCursors", mode: "alwaysOn"})); + {configureFailPoint: "shardedAggregateHangBeforeEstablishingShardCursors", mode: "alwaysOn"})); // While opening the cursor, wait for the failpoint and add the new shard. const awaitNewShard = startParallelShell(` load("jstests/libs/check_log.js"); checkLog.contains(db, - "clusterAggregateHangBeforeEstablishingShardCursors fail point enabled"); + "shardedAggregateHangBeforeEstablishingShardCursors fail point enabled"); assert.commandWorked( db.adminCommand({addShard: "${newShard.getURL()}", name: "${newShard.name}"})); // Migrate the [10, MaxKey] chunk to "newShard". @@ -56,7 +56,7 @@ const awaitNewShard = startParallelShell(` _waitForDelete: true})); assert.commandWorked( db.adminCommand( - {configureFailPoint: "clusterAggregateHangBeforeEstablishingShardCursors", + {configureFailPoint: "shardedAggregateHangBeforeEstablishingShardCursors", mode: "off"}));`, mongos.port); diff --git a/src/mongo/db/commands/SConscript b/src/mongo/db/commands/SConscript index fce6f57b45e..300595e2a4f 100644 --- a/src/mongo/db/commands/SConscript +++ b/src/mongo/db/commands/SConscript @@ -157,6 +157,7 @@ env.Library( '$BUILD_DIR/mongo/db/logical_session_cache', '$BUILD_DIR/mongo/db/logical_session_id_helpers', '$BUILD_DIR/mongo/db/logical_session_id', + '$BUILD_DIR/mongo/db/pipeline/pipeline', '$BUILD_DIR/mongo/db/repl/isself', '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', '$BUILD_DIR/mongo/db/session_catalog', @@ -508,10 +509,10 @@ env.Library( 'map_reduce_agg.cpp', ], LIBDEPS=[ - '$BUILD_DIR/mongo/db/db_raii', '$BUILD_DIR/mongo/idl/idl_parser', + '$BUILD_DIR/mongo/db/commands/servers', + '$BUILD_DIR/mongo/db/db_raii', '$BUILD_DIR/mongo/db/pipeline/mongo_process_interface', - '$BUILD_DIR/mongo/db/pipeline/pipeline', '$BUILD_DIR/mongo/db/query/map_reduce_output_format', 'map_reduce_parser' ] @@ -541,6 +542,7 @@ env.CppUnitTest( "map_reduce_parse_test.cpp", ], LIBDEPS=[ + '$BUILD_DIR/mongo/db/auth/authmocks', '$BUILD_DIR/mongo/db/query/query_test_service_context', 'map_reduce_agg', ] diff --git a/src/mongo/db/commands/map_reduce_agg.cpp b/src/mongo/db/commands/map_reduce_agg.cpp index aaffa84dc19..f1c94a9c339 100644 --- a/src/mongo/db/commands/map_reduce_agg.cpp +++ b/src/mongo/db/commands/map_reduce_agg.cpp @@ -41,155 +41,18 @@ #include "mongo/db/commands.h" #include "mongo/db/commands/map_reduce_agg.h" #include "mongo/db/commands/map_reduce_javascript_code.h" +#include "mongo/db/commands/mr_common.h" #include "mongo/db/db_raii.h" #include "mongo/db/exec/document_value/value.h" #include "mongo/db/namespace_string.h" -#include "mongo/db/pipeline/document_source.h" -#include "mongo/db/pipeline/document_source_group.h" -#include "mongo/db/pipeline/document_source_limit.h" -#include "mongo/db/pipeline/document_source_match.h" -#include "mongo/db/pipeline/document_source_merge.h" -#include "mongo/db/pipeline/document_source_out.h" -#include "mongo/db/pipeline/document_source_project.h" -#include "mongo/db/pipeline/document_source_single_document_transformation.h" -#include "mongo/db/pipeline/document_source_sort.h" -#include "mongo/db/pipeline/document_source_unwind.h" #include "mongo/db/pipeline/expression.h" -#include "mongo/db/pipeline/expression_javascript.h" -#include "mongo/db/pipeline/parsed_aggregation_projection_node.h" -#include "mongo/db/pipeline/parsed_inclusion_projection.h" #include "mongo/db/pipeline/pipeline_d.h" #include "mongo/db/query/map_reduce_output_format.h" -#include "mongo/db/query/util/make_data_structure.h" -#include "mongo/util/intrusive_counter.h" namespace mongo::map_reduce_agg { namespace { -using namespace std::string_literals; - -auto translateSort(boost::intrusive_ptr<ExpressionContext> expCtx, - const BSONObj& sort, - const boost::optional<std::int64_t>& limit) { - return DocumentSourceSort::create(expCtx, sort, limit.get_value_or(-1)); -} - -auto translateMap(boost::intrusive_ptr<ExpressionContext> expCtx, std::string code) { - auto emitExpression = ExpressionInternalJsEmit::create( - expCtx, ExpressionFieldPath::parse(expCtx, "$$ROOT", expCtx->variablesParseState), code); - auto node = std::make_unique<parsed_aggregation_projection::InclusionNode>( - ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}); - node->addExpressionForPath(FieldPath{"emits"s}, std::move(emitExpression)); - auto inclusion = std::unique_ptr<TransformerInterface>{ - std::make_unique<parsed_aggregation_projection::ParsedInclusionProjection>( - expCtx, - ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}, - std::move(node))}; - return make_intrusive<DocumentSourceSingleDocumentTransformation>( - expCtx, std::move(inclusion), DocumentSourceProject::kStageName, false); -} - -auto translateReduce(boost::intrusive_ptr<ExpressionContext> expCtx, std::string code) { - auto accumulatorArguments = ExpressionObject::create( - expCtx, - make_vector<std::pair<std::string, boost::intrusive_ptr<Expression>>>( - std::pair{"data"s, - ExpressionFieldPath::parse(expCtx, "$emits", expCtx->variablesParseState)}, - std::pair{"eval"s, ExpressionConstant::create(expCtx, Value{code})})); - auto jsReduce = AccumulationStatement{ - "value", - std::move(accumulatorArguments), - AccumulationStatement::getFactory(AccumulatorInternalJsReduce::kAccumulatorName)}; - auto groupExpr = ExpressionFieldPath::parse(expCtx, "$emits.k", expCtx->variablesParseState); - return DocumentSourceGroup::create(expCtx, - std::move(groupExpr), - make_vector<AccumulationStatement>(std::move(jsReduce)), - boost::none); -} - -auto translateFinalize(boost::intrusive_ptr<ExpressionContext> expCtx, std::string code) { - auto jsExpression = ExpressionInternalJs::create( - expCtx, - ExpressionArray::create( - expCtx, - make_vector<boost::intrusive_ptr<Expression>>( - ExpressionFieldPath::parse(expCtx, "$_id", expCtx->variablesParseState), - ExpressionFieldPath::parse(expCtx, "$value", expCtx->variablesParseState))), - code); - auto node = std::make_unique<parsed_aggregation_projection::InclusionNode>( - ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}); - node->addExpressionForPath(FieldPath{"value"s}, std::move(jsExpression)); - auto inclusion = std::unique_ptr<TransformerInterface>{ - std::make_unique<parsed_aggregation_projection::ParsedInclusionProjection>( - expCtx, - ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}, - std::move(node))}; - return make_intrusive<DocumentSourceSingleDocumentTransformation>( - expCtx, std::move(inclusion), DocumentSourceProject::kStageName, false); -} - -auto translateOutReplace(boost::intrusive_ptr<ExpressionContext> expCtx, - const StringData inputDatabase, - NamespaceString targetNss) { - uassert(31278, - "MapReduce must output to the database belonging to its input collection - Input: "s + - inputDatabase + "Output: " + targetNss.db(), - inputDatabase == targetNss.db()); - return DocumentSourceOut::create(std::move(targetNss), expCtx); -} - -auto translateOutMerge(boost::intrusive_ptr<ExpressionContext> expCtx, NamespaceString targetNss) { - return DocumentSourceMerge::create(targetNss, - expCtx, - MergeWhenMatchedModeEnum::kReplace, - MergeWhenNotMatchedModeEnum::kInsert, - boost::none, // Let variables - boost::none, // pipeline - std::set<FieldPath>{FieldPath("_id"s)}, - boost::none); // targetCollectionVersion -} - -auto translateOutReduce(boost::intrusive_ptr<ExpressionContext> expCtx, - NamespaceString targetNss, - std::string code) { - // Because of communication for sharding, $merge must hold on to a serializable BSON object - // at the moment so we reparse here. - auto reduceObj = BSON("args" << BSON_ARRAY("$value" - << "$$new.value") - << "eval" << code); - - auto finalProjectSpec = - BSON(DocumentSourceProject::kStageName - << BSON("value" << BSON(ExpressionInternalJs::kExpressionName << reduceObj))); - auto pipelineSpec = boost::make_optional(std::vector<BSONObj>{finalProjectSpec}); - return DocumentSourceMerge::create(targetNss, - expCtx, - MergeWhenMatchedModeEnum::kPipeline, - MergeWhenNotMatchedModeEnum::kInsert, - boost::none, // Let variables - pipelineSpec, - std::set<FieldPath>{FieldPath("_id"s)}, - boost::none); // targetCollectionVersion -} - -auto translateOut(boost::intrusive_ptr<ExpressionContext> expCtx, - const OutputType outputType, - const StringData inputDatabase, - NamespaceString targetNss, - std::string reduceCode) { - switch (outputType) { - case OutputType::Replace: - return boost::make_optional(translateOutReplace(expCtx, inputDatabase, targetNss)); - case OutputType::Merge: - return boost::make_optional(translateOutMerge(expCtx, targetNss)); - case OutputType::Reduce: - return boost::make_optional(translateOutReduce(expCtx, targetNss, reduceCode)); - case OutputType::InMemory:; - } - return boost::optional<boost::intrusive_ptr<mongo::DocumentSource>>{}; -} - auto makeExpressionContext(OperationContext* opCtx, const MapReduce& parsedMr) { // AutoGetCollectionForReadCommand will throw if the sharding version for this connection is // out of date. @@ -247,7 +110,7 @@ bool runAggregationMapReduce(OperationContext* opCtx, auto parsedMr = MapReduce::parse(IDLParserErrorContext("MapReduce"), cmd); auto expCtx = makeExpressionContext(opCtx, parsedMr); auto runnablePipeline = [&]() { - auto pipeline = translateFromMR(parsedMr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(parsedMr, expCtx); return expCtx->mongoProcessInterface->attachCursorSourceToPipelineForLocalRead( expCtx, pipeline.release()); }(); @@ -274,32 +137,4 @@ bool runAggregationMapReduce(OperationContext* opCtx, return true; } -std::unique_ptr<Pipeline, PipelineDeleter> translateFromMR( - MapReduce parsedMr, boost::intrusive_ptr<ExpressionContext> expCtx) { - - // TODO: It would be good to figure out what kind of errors this would produce in the Status. - // It would be better not to produce something incomprehensible out of an internal translation. - return uassertStatusOK(Pipeline::create( - makeFlattenedList<boost::intrusive_ptr<DocumentSource>>( - parsedMr.getQuery().map( - [&](auto&& query) { return DocumentSourceMatch::create(query, expCtx); }), - parsedMr.getSort().map( - [&](auto&& sort) { return translateSort(expCtx, sort, parsedMr.getLimit()); }), - translateMap(expCtx, parsedMr.getMap().getCode()), - DocumentSourceUnwind::create(expCtx, "emits", false, boost::none), - translateReduce(expCtx, parsedMr.getReduce().getCode()), - parsedMr.getFinalize().map([&](auto&& finalize) { - return translateFinalize(expCtx, parsedMr.getFinalize()->getCode()); - }), - translateOut(expCtx, - parsedMr.getOutOptions().getOutputType(), - parsedMr.getNamespace().db(), - NamespaceString{parsedMr.getOutOptions().getDatabaseName() - ? *parsedMr.getOutOptions().getDatabaseName() - : parsedMr.getNamespace().db(), - parsedMr.getOutOptions().getCollectionName()}, - parsedMr.getReduce().getCode())), - expCtx)); -} - } // namespace mongo::map_reduce_agg diff --git a/src/mongo/db/commands/map_reduce_agg.h b/src/mongo/db/commands/map_reduce_agg.h index 573119c0393..7c5972a7de9 100644 --- a/src/mongo/db/commands/map_reduce_agg.h +++ b/src/mongo/db/commands/map_reduce_agg.h @@ -47,7 +47,4 @@ bool runAggregationMapReduce(OperationContext* opCtx, std::string& errmsg, BSONObjBuilder& result); -std::unique_ptr<Pipeline, PipelineDeleter> translateFromMR( - MapReduce parsedMr, boost::intrusive_ptr<ExpressionContext> expCtx); - } // namespace mongo::map_reduce_agg diff --git a/src/mongo/db/commands/map_reduce_agg_test.cpp b/src/mongo/db/commands/map_reduce_agg_test.cpp index 17b5f2520ba..3e78b1cb4c4 100644 --- a/src/mongo/db/commands/map_reduce_agg_test.cpp +++ b/src/mongo/db/commands/map_reduce_agg_test.cpp @@ -35,6 +35,7 @@ #include "mongo/base/string_data.h" #include "mongo/bson/bsonobj.h" #include "mongo/db/commands/map_reduce_agg.h" +#include "mongo/db/commands/mr_common.h" #include "mongo/db/pipeline/document_source_group.h" #include "mongo/db/pipeline/document_source_match.h" #include "mongo/db/pipeline/document_source_merge.h" @@ -65,7 +66,7 @@ TEST(MapReduceAggTest, testBasicTranslate) { MapReduceJavascriptCode{reduceJavascript.toString()}, MapReduceOutOptions{boost::none, "", OutputType::InMemory, false}}; boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(3u, sources.size()); auto iter = sources.begin(); @@ -82,7 +83,7 @@ TEST(MapReduceAggTest, testSortWithoutLimit) { MapReduceOutOptions{boost::none, "", OutputType::InMemory, false}}; mr.setSort(BSON("foo" << 1)); boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(4u, sources.size()); auto iter = sources.begin(); @@ -103,7 +104,7 @@ TEST(MapReduceAggTest, testSortWithLimit) { mr.setSort(BSON("foo" << 1)); mr.setLimit(23); boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(4u, sources.size()); auto iter = sources.begin(); @@ -127,7 +128,7 @@ TEST(MapReduceAggTest, testFeatureLadenTranslate) { << "fooval")); mr.setFinalize(boost::make_optional(MapReduceJavascriptCode{finalizeJavascript.toString()})); boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(7u, sources.size()); auto iter = sources.begin(); @@ -148,7 +149,7 @@ TEST(MapReduceAggTest, testOutMergeTranslate) { MapReduceJavascriptCode{reduceJavascript.toString()}, MapReduceOutOptions{boost::make_optional("db"s), "coll2", OutputType::Merge, false}}; boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(sources.size(), 4u); auto iter = sources.begin(); @@ -168,7 +169,7 @@ TEST(MapReduceAggTest, testOutReduceTranslate) { MapReduceJavascriptCode{reduceJavascript.toString()}, MapReduceOutOptions{boost::make_optional("db"s), "coll2", OutputType::Reduce, false}}; boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(sources.size(), 4u); auto iter = sources.begin(); @@ -190,7 +191,7 @@ TEST(MapReduceAggTest, testOutDifferentDBFails) { MapReduceJavascriptCode{reduceJavascript.toString()}, MapReduceOutOptions{boost::make_optional("db2"s), "coll2", OutputType::Replace, false}}; boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - ASSERT_THROWS_CODE(translateFromMR(mr, expCtx), AssertionException, 31278); + ASSERT_THROWS_CODE(map_reduce_common::translateFromMR(mr, expCtx), AssertionException, 31278); } TEST(MapReduceAggTest, testOutSameCollection) { @@ -201,7 +202,7 @@ TEST(MapReduceAggTest, testOutSameCollection) { MapReduceJavascriptCode{reduceJavascript.toString()}, MapReduceOutOptions{boost::make_optional("db"s), "coll", OutputType::Replace, false}}; boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest(nss)); - auto pipeline = translateFromMR(mr, expCtx); + auto pipeline = map_reduce_common::translateFromMR(mr, expCtx); auto& sources = pipeline->getSources(); ASSERT_EQ(sources.size(), 4u); auto iter = sources.begin(); diff --git a/src/mongo/db/commands/map_reduce_command_base.h b/src/mongo/db/commands/map_reduce_command_base.h index 8b16b29f21f..4de4676e02d 100644 --- a/src/mongo/db/commands/map_reduce_command_base.h +++ b/src/mongo/db/commands/map_reduce_command_base.h @@ -45,7 +45,7 @@ public: } virtual bool supportsWriteConcern(const BSONObj& cmd) const override { - return mr::mrSupportsWriteConcern(cmd); + return map_reduce_common::mrSupportsWriteConcern(cmd); } bool allowsAfterClusterTime(const BSONObj& cmd) const override { @@ -62,7 +62,7 @@ public: virtual void addRequiredPrivileges(const std::string& dbname, const BSONObj& cmdObj, std::vector<Privilege>* out) const { - mr::addPrivilegesRequiredForMapReduce(this, dbname, cmdObj, out); + map_reduce_common::addPrivilegesRequiredForMapReduce(this, dbname, cmdObj, out); } bool errmsgRun(OperationContext* opCtx, diff --git a/src/mongo/db/commands/mr.cpp b/src/mongo/db/commands/mr.cpp index b9d90555d4e..55e6586784d 100644 --- a/src/mongo/db/commands/mr.cpp +++ b/src/mongo/db/commands/mr.cpp @@ -46,6 +46,7 @@ #include "mongo/db/client.h" #include "mongo/db/clientcursor.h" #include "mongo/db/commands.h" +#include "mongo/db/commands/map_reduce_gen.h" #include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/db_raii.h" #include "mongo/db/dbhelpers.h" @@ -425,17 +426,17 @@ Config::Config(const string& _dbname, const BSONObj& cmdObj) { uassert(13602, "outType is no longer a valid option", cmdObj["outType"].eoo()); - outputOptions = mr::parseOutputOptions(dbname, cmdObj); + outputOptions = map_reduce_common::parseOutputOptions(dbname, cmdObj); shardedFirstPass = false; if (cmdObj.hasField("shardedFirstPass") && cmdObj["shardedFirstPass"].trueValue()) { massert(16054, "shardedFirstPass should only use replace outType", - outputOptions.outType == mr::OutputType::kReplace); + outputOptions.outType == OutputType::Replace); shardedFirstPass = true; } - if (outputOptions.outType != mr::OutputType::kInMemory) { + if (outputOptions.outType != OutputType::InMemory) { // Create names for the temp collection and the incremental collection. The incremental // collection goes in the "local" database, so that it doesn't get replicated. const std::string& outDBName = outputOptions.outDB.empty() ? dbname : outputOptions.outDB; @@ -739,7 +740,7 @@ void State::appendResults(BSONObjBuilder& final) { * This may involve replacing, merging or reducing. */ long long State::postProcessCollection(OperationContext* opCtx, CurOp* curOp) { - if (_onDisk == false || _config.outputOptions.outType == mr::OutputType::kInMemory) + if (_onDisk == false || _config.outputOptions.outType == OutputType::InMemory) return numInMemKeys(); bool holdingGlobalLock = false; @@ -763,7 +764,7 @@ long long State::postProcessCollectionNonAtomic(OperationContext* opCtx, if (_config.outputOptions.finalNamespace == _config.tempNamespace) return collectionCount(opCtx, _config.outputOptions.finalNamespace, callerHoldsGlobalLock); - if (_config.outputOptions.outType == mr::OutputType::kReplace || + if (_config.outputOptions.outType == OutputType::Replace || collectionCount(opCtx, _config.outputOptions.finalNamespace, callerHoldsGlobalLock) == 0) { // This must be global because we may write across different databases. Lock::GlobalWrite lock(opCtx); @@ -780,7 +781,7 @@ long long State::postProcessCollectionNonAtomic(OperationContext* opCtx, } _db.dropCollection(_config.tempNamespace.ns()); - } else if (_config.outputOptions.outType == mr::OutputType::kMerge) { + } else if (_config.outputOptions.outType == OutputType::Merge) { // merge: upsert new docs into old collection const auto count = collectionCount(opCtx, _config.tempNamespace, callerHoldsGlobalLock); @@ -799,7 +800,7 @@ long long State::postProcessCollectionNonAtomic(OperationContext* opCtx, } _db.dropCollection(_config.tempNamespace.ns()); pm.finished(); - } else if (_config.outputOptions.outType == mr::OutputType::kReduce) { + } else if (_config.outputOptions.outType == OutputType::Reduce) { // reduce: apply reduce op on new result and existing one BSONList values; @@ -926,7 +927,7 @@ State::State(OperationContext* opCtx, const Config& c) _dupCount(0), _numEmits(0) { _temp.reset(new InMemory()); - _onDisk = _config.outputOptions.outType != mr::OutputType::kInMemory; + _onDisk = _config.outputOptions.outType != OutputType::InMemory; } bool State::sourceExists() { @@ -1747,7 +1748,7 @@ bool runMapReduceShardedFinish(OperationContext* opCtx, std::vector<Chunk> chunks; - if (config.outputOptions.outType != mr::OutputType::kInMemory) { + if (config.outputOptions.outType != OutputType::InMemory) { auto outRoutingInfoStatus = Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfo( opCtx, config.outputOptions.finalNamespace); uassertStatusOK(outRoutingInfoStatus.getStatus()); diff --git a/src/mongo/db/commands/mr.h b/src/mongo/db/commands/mr.h index d5e32d1f9b5..6ea15c30fee 100644 --- a/src/mongo/db/commands/mr.h +++ b/src/mongo/db/commands/mr.h @@ -224,7 +224,7 @@ public: NamespaceString incLong; NamespaceString tempNamespace; - mr::OutputOptions outputOptions; + map_reduce_common::OutputOptions outputOptions; // max number of keys allowed in JS map before switching mode long jsMaxKeys; diff --git a/src/mongo/db/commands/mr_common.cpp b/src/mongo/db/commands/mr_common.cpp index a13b323ea23..ea0dea5e218 100644 --- a/src/mongo/db/commands/mr_common.cpp +++ b/src/mongo/db/commands/mr_common.cpp @@ -40,15 +40,152 @@ #include "mongo/db/catalog/document_validation.h" #include "mongo/db/commands.h" #include "mongo/db/jsobj.h" +#include "mongo/db/pipeline/document_source.h" +#include "mongo/db/pipeline/document_source_group.h" +#include "mongo/db/pipeline/document_source_limit.h" +#include "mongo/db/pipeline/document_source_match.h" +#include "mongo/db/pipeline/document_source_merge.h" +#include "mongo/db/pipeline/document_source_out.h" +#include "mongo/db/pipeline/document_source_project.h" +#include "mongo/db/pipeline/document_source_single_document_transformation.h" +#include "mongo/db/pipeline/document_source_sort.h" +#include "mongo/db/pipeline/document_source_unwind.h" +#include "mongo/db/pipeline/expression_javascript.h" +#include "mongo/db/pipeline/parsed_aggregation_projection_node.h" +#include "mongo/db/pipeline/parsed_inclusion_projection.h" +#include "mongo/db/query/util/make_data_structure.h" +#include "mongo/util/intrusive_counter.h" #include "mongo/util/log.h" #include "mongo/util/str.h" -namespace mongo { - -namespace mr { +namespace mongo::map_reduce_common { namespace { Rarely nonAtomicDeprecationSampler; // Used to occasionally log deprecation messages. + +using namespace std::string_literals; + +auto translateSort(boost::intrusive_ptr<ExpressionContext> expCtx, + const BSONObj& sort, + const boost::optional<std::int64_t>& limit) { + return DocumentSourceSort::create(expCtx, sort, limit.get_value_or(-1)); +} + +auto translateMap(boost::intrusive_ptr<ExpressionContext> expCtx, std::string code) { + auto emitExpression = ExpressionInternalJsEmit::create( + expCtx, ExpressionFieldPath::parse(expCtx, "$$ROOT", expCtx->variablesParseState), code); + auto node = std::make_unique<parsed_aggregation_projection::InclusionNode>( + ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}); + node->addExpressionForPath(FieldPath{"emits"s}, std::move(emitExpression)); + auto inclusion = std::unique_ptr<TransformerInterface>{ + std::make_unique<parsed_aggregation_projection::ParsedInclusionProjection>( + expCtx, + ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}, + std::move(node))}; + return make_intrusive<DocumentSourceSingleDocumentTransformation>( + expCtx, std::move(inclusion), DocumentSourceProject::kStageName, false); +} + +auto translateReduce(boost::intrusive_ptr<ExpressionContext> expCtx, std::string code) { + auto accumulatorArguments = ExpressionObject::create( + expCtx, + make_vector<std::pair<std::string, boost::intrusive_ptr<Expression>>>( + std::pair{"data"s, + ExpressionFieldPath::parse(expCtx, "$emits", expCtx->variablesParseState)}, + std::pair{"eval"s, ExpressionConstant::create(expCtx, Value{code})})); + auto jsReduce = AccumulationStatement{ + "value", + std::move(accumulatorArguments), + AccumulationStatement::getFactory(AccumulatorInternalJsReduce::kAccumulatorName)}; + auto groupExpr = ExpressionFieldPath::parse(expCtx, "$emits.k", expCtx->variablesParseState); + return DocumentSourceGroup::create(expCtx, + std::move(groupExpr), + make_vector<AccumulationStatement>(std::move(jsReduce)), + boost::none); +} + +auto translateFinalize(boost::intrusive_ptr<ExpressionContext> expCtx, std::string code) { + auto jsExpression = ExpressionInternalJs::create( + expCtx, + ExpressionArray::create( + expCtx, + make_vector<boost::intrusive_ptr<Expression>>( + ExpressionFieldPath::parse(expCtx, "$_id", expCtx->variablesParseState), + ExpressionFieldPath::parse(expCtx, "$value", expCtx->variablesParseState))), + code); + auto node = std::make_unique<parsed_aggregation_projection::InclusionNode>( + ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}); + node->addExpressionForPath(FieldPath{"value"s}, std::move(jsExpression)); + auto inclusion = std::unique_ptr<TransformerInterface>{ + std::make_unique<parsed_aggregation_projection::ParsedInclusionProjection>( + expCtx, + ProjectionPolicies{ProjectionPolicies::DefaultIdPolicy::kExcludeId}, + std::move(node))}; + return make_intrusive<DocumentSourceSingleDocumentTransformation>( + expCtx, std::move(inclusion), DocumentSourceProject::kStageName, false); +} + +auto translateOutReplace(boost::intrusive_ptr<ExpressionContext> expCtx, + const StringData inputDatabase, + NamespaceString targetNss) { + uassert(31278, + "MapReduce must output to the database belonging to its input collection - Input: "s + + inputDatabase + " Output: " + targetNss.db(), + inputDatabase == targetNss.db()); + return DocumentSourceOut::create(std::move(targetNss), expCtx); +} + +auto translateOutMerge(boost::intrusive_ptr<ExpressionContext> expCtx, NamespaceString targetNss) { + return DocumentSourceMerge::create(targetNss, + expCtx, + MergeWhenMatchedModeEnum::kReplace, + MergeWhenNotMatchedModeEnum::kInsert, + boost::none, // Let variables + boost::none, // pipeline + std::set<FieldPath>{FieldPath("_id"s)}, + boost::none); // targetCollectionVersion +} + +auto translateOutReduce(boost::intrusive_ptr<ExpressionContext> expCtx, + NamespaceString targetNss, + std::string code) { + // Because of communication for sharding, $merge must hold on to a serializable BSON object + // at the moment so we reparse here. + auto reduceObj = BSON("args" << BSON_ARRAY("$value" + << "$$new.value") + << "eval" << code); + + auto finalProjectSpec = + BSON(DocumentSourceProject::kStageName + << BSON("value" << BSON(ExpressionInternalJs::kExpressionName << reduceObj))); + auto pipelineSpec = boost::make_optional(std::vector<BSONObj>{finalProjectSpec}); + return DocumentSourceMerge::create(targetNss, + expCtx, + MergeWhenMatchedModeEnum::kPipeline, + MergeWhenNotMatchedModeEnum::kInsert, + boost::none, // Let variables + pipelineSpec, + std::set<FieldPath>{FieldPath("_id"s)}, + boost::none); // targetCollectionVersion +} + +auto translateOut(boost::intrusive_ptr<ExpressionContext> expCtx, + const OutputType outputType, + const StringData inputDatabase, + NamespaceString targetNss, + std::string reduceCode) { + switch (outputType) { + case OutputType::Replace: + return boost::make_optional(translateOutReplace(expCtx, inputDatabase, targetNss)); + case OutputType::Merge: + return boost::make_optional(translateOutMerge(expCtx, targetNss)); + case OutputType::Reduce: + return boost::make_optional(translateOutReduce(expCtx, targetNss, reduceCode)); + case OutputType::InMemory:; + } + return boost::optional<boost::intrusive_ptr<mongo::DocumentSource>>{}; +} + } // namespace OutputOptions parseOutputOptions(const std::string& dbname, const BSONObj& cmdObj) { @@ -57,24 +194,24 @@ OutputOptions parseOutputOptions(const std::string& dbname, const BSONObj& cmdOb outputOptions.outNonAtomic = false; if (cmdObj["out"].type() == String) { outputOptions.collectionName = cmdObj["out"].String(); - outputOptions.outType = OutputType::kReplace; + outputOptions.outType = OutputType::Replace; } else if (cmdObj["out"].type() == Object) { BSONObj o = cmdObj["out"].embeddedObject(); if (o.hasElement("normal")) { - outputOptions.outType = OutputType::kReplace; + outputOptions.outType = OutputType::Replace; outputOptions.collectionName = o["normal"].String(); } else if (o.hasElement("replace")) { - outputOptions.outType = OutputType::kReplace; + outputOptions.outType = OutputType::Replace; outputOptions.collectionName = o["replace"].String(); } else if (o.hasElement("merge")) { - outputOptions.outType = OutputType::kMerge; + outputOptions.outType = OutputType::Merge; outputOptions.collectionName = o["merge"].String(); } else if (o.hasElement("reduce")) { - outputOptions.outType = OutputType::kReduce; + outputOptions.outType = OutputType::Reduce; outputOptions.collectionName = o["reduce"].String(); } else if (o.hasElement("inline")) { - outputOptions.outType = OutputType::kInMemory; + outputOptions.outType = OutputType::InMemory; uassert(ErrorCodes::InvalidOptions, "cannot specify 'sharded' in combination with 'inline'", !o.hasElement("sharded")); @@ -96,8 +233,8 @@ OutputOptions parseOutputOptions(const std::string& dbname, const BSONObj& cmdOb if (outputOptions.outNonAtomic) { uassert(15895, "nonAtomic option cannot be used with this output type", - (outputOptions.outType == OutputType::kReduce || - outputOptions.outType == OutputType::kMerge)); + (outputOptions.outType == OutputType::Reduce || + outputOptions.outType == OutputType::Merge)); } else if (nonAtomicDeprecationSampler.tick()) { warning() << "Setting out.nonAtomic to false in MapReduce is deprecated."; } @@ -106,7 +243,7 @@ OutputOptions parseOutputOptions(const std::string& dbname, const BSONObj& cmdOb uasserted(13606, "'out' has to be a string or an object"); } - if (outputOptions.outType != OutputType::kInMemory) { + if (outputOptions.outType != OutputType::InMemory) { const StringData outDb(outputOptions.outDB.empty() ? dbname : outputOptions.outDB); const NamespaceString nss(outDb, outputOptions.collectionName); uassert(ErrorCodes::InvalidNamespace, @@ -130,10 +267,10 @@ void addPrivilegesRequiredForMapReduce(const BasicCommand* commandTemplate, inputResource.isExactNamespacePattern()); out->push_back(Privilege(inputResource, ActionType::find)); - if (outputOptions.outType != OutputType::kInMemory) { + if (outputOptions.outType != OutputType::InMemory) { ActionSet outputActions; outputActions.addAction(ActionType::insert); - if (outputOptions.outType == OutputType::kReplace) { + if (outputOptions.outType == OutputType::Replace) { outputActions.addAction(ActionType::remove); } else { outputActions.addAction(ActionType::update); @@ -163,5 +300,33 @@ bool mrSupportsWriteConcern(const BSONObj& cmd) { return true; } } -} // namespace mr -} // namespace mongo + +std::unique_ptr<Pipeline, PipelineDeleter> translateFromMR( + MapReduce parsedMr, boost::intrusive_ptr<ExpressionContext> expCtx) { + + // TODO: It would be good to figure out what kind of errors this would produce in the Status. + // It would be better not to produce something incomprehensible out of an internal translation. + return uassertStatusOK(Pipeline::create( + makeFlattenedList<boost::intrusive_ptr<DocumentSource>>( + parsedMr.getQuery().map( + [&](auto&& query) { return DocumentSourceMatch::create(query, expCtx); }), + parsedMr.getSort().map( + [&](auto&& sort) { return translateSort(expCtx, sort, parsedMr.getLimit()); }), + translateMap(expCtx, parsedMr.getMap().getCode()), + DocumentSourceUnwind::create(expCtx, "emits", false, boost::none), + translateReduce(expCtx, parsedMr.getReduce().getCode()), + parsedMr.getFinalize().map([&](auto&& finalize) { + return translateFinalize(expCtx, parsedMr.getFinalize()->getCode()); + }), + translateOut(expCtx, + parsedMr.getOutOptions().getOutputType(), + parsedMr.getNamespace().db(), + NamespaceString{parsedMr.getOutOptions().getDatabaseName() + ? *parsedMr.getOutOptions().getDatabaseName() + : parsedMr.getNamespace().db(), + parsedMr.getOutOptions().getCollectionName()}, + parsedMr.getReduce().getCode())), + expCtx)); +} + +} // namespace mongo::map_reduce_common diff --git a/src/mongo/db/commands/mr_common.h b/src/mongo/db/commands/mr_common.h index aad419b94bd..c3e80c11874 100644 --- a/src/mongo/db/commands/mr_common.h +++ b/src/mongo/db/commands/mr_common.h @@ -33,19 +33,11 @@ #include <vector> #include "mongo/db/commands.h" +#include "mongo/db/commands/map_reduce_gen.h" #include "mongo/db/jsobj.h" #include "mongo/db/namespace_string.h" -namespace mongo { - -namespace mr { - -enum class OutputType { - kReplace, // Atomically replace the collection. - kMerge, // Merge keys, override dups. - kReduce, // Merge keys, reduce dups. - kInMemory // Only store in memory, limited in size. -}; +namespace mongo::map_reduce_common { struct OutputOptions { std::string outDB; @@ -68,6 +60,11 @@ void addPrivilegesRequiredForMapReduce(const BasicCommand* commandTemplate, */ bool mrSupportsWriteConcern(const BSONObj& cmd); +/** + * Accepts a parsed mapReduce command and returns the equivalent aggregation pipeline. Note that the + * returned pipeline does *not* contain a $cursor stage and thus is not runnable. + */ +std::unique_ptr<Pipeline, PipelineDeleter> translateFromMR( + MapReduce parsedMr, boost::intrusive_ptr<ExpressionContext> expCtx); -} // namespace mr -} // namespace mongo +} // namespace mongo::map_reduce_common diff --git a/src/mongo/db/commands/mr_test.cpp b/src/mongo/db/commands/mr_test.cpp index 53b06339448..304edd39f83 100644 --- a/src/mongo/db/commands/mr_test.cpp +++ b/src/mongo/db/commands/mr_test.cpp @@ -41,6 +41,7 @@ #include "mongo/db/catalog/collection_options.h" #include "mongo/db/client.h" #include "mongo/db/commands.h" +#include "mongo/db/commands/map_reduce_gen.h" #include "mongo/db/dbdirectclient.h" #include "mongo/db/json.h" #include "mongo/db/op_observer_noop.h" @@ -78,17 +79,17 @@ void _compareOutputOptionField(const std::string& dbname, } /** - * Returns string representation of mr::Config::OutputType + * Returns string representation of OutputType */ -std::string _getOutTypeString(mr::OutputType outType) { +std::string _getOutTypeString(OutputType outType) { switch (outType) { - case mr::OutputType::kReplace: + case OutputType::Replace: return "REPLACE"; - case mr::OutputType::kMerge: + case OutputType::Merge: return "MERGE"; - case mr::OutputType::kReduce: + case OutputType::Reduce: return "REDUCE"; - case mr::OutputType::kInMemory: + case OutputType::InMemory: return "INMEMORY"; } MONGO_UNREACHABLE; @@ -103,9 +104,10 @@ void _testConfigParseOutputOptions(const std::string& dbname, const std::string& expectedCollectionName, const std::string& expectedFinalNamespace, bool expectedOutNonAtomic, - mr::OutputType expectedOutType) { + OutputType expectedOutType) { const BSONObj cmdObj = fromjson(cmdObjStr); - mr::OutputOptions outputOptions = mr::parseOutputOptions(dbname, cmdObj); + map_reduce_common::OutputOptions outputOptions = + map_reduce_common::parseOutputOptions(dbname, cmdObj); _compareOutputOptionField(dbname, cmdObjStr, "outDb", outputOptions.outDB, expectedOutDb); _compareOutputOptionField( dbname, cmdObjStr, "collectionName", outputOptions.collectionName, expectedCollectionName); @@ -124,31 +126,34 @@ void _testConfigParseOutputOptions(const std::string& dbname, } /** - * Tests for mr::parseOutputOptions. + * Tests for map_reduce_common::parseOutputOptions. */ TEST(ConfigOutputOptionsTest, parseOutputOptions) { // Missing 'out' field. - ASSERT_THROWS(mr::parseOutputOptions("mydb", fromjson("{}")), AssertionException); + ASSERT_THROWS(map_reduce_common::parseOutputOptions("mydb", fromjson("{}")), + AssertionException); // 'out' must be either string or object. - ASSERT_THROWS(mr::parseOutputOptions("mydb", fromjson("{out: 99}")), AssertionException); + ASSERT_THROWS(map_reduce_common::parseOutputOptions("mydb", fromjson("{out: 99}")), + AssertionException); // 'out.nonAtomic' is not supported with normal, replace or inline. - ASSERT_THROWS( - mr::parseOutputOptions("mydb", fromjson("{out: {normal: 'mycoll', nonAtomic: true}}")), - AssertionException); - ASSERT_THROWS( - mr::parseOutputOptions("mydb", fromjson("{out: {replace: 'mycoll', nonAtomic: true}}")), - AssertionException); - ASSERT_THROWS( - mr::parseOutputOptions("mydb", fromjson("{out: {inline: 'mycoll', nonAtomic: true}}")), - AssertionException); + ASSERT_THROWS(map_reduce_common::parseOutputOptions( + "mydb", fromjson("{out: {normal: 'mycoll', nonAtomic: true}}")), + AssertionException); + ASSERT_THROWS(map_reduce_common::parseOutputOptions( + "mydb", fromjson("{out: {replace: 'mycoll', nonAtomic: true}}")), + AssertionException); + ASSERT_THROWS(map_reduce_common::parseOutputOptions( + "mydb", fromjson("{out: {inline: 'mycoll', nonAtomic: true}}")), + AssertionException); // Unknown output specifer. - ASSERT_THROWS(mr::parseOutputOptions("mydb", fromjson("{out: {no_such_out_type: 'mycoll'}}")), + ASSERT_THROWS(map_reduce_common::parseOutputOptions( + "mydb", fromjson("{out: {no_such_out_type: 'mycoll'}}")), AssertionException); // 'out' is string. _testConfigParseOutputOptions( - "mydb", "{out: 'mycoll'}", "", "mycoll", "mydb.mycoll", false, mr::OutputType::kReplace); + "mydb", "{out: 'mycoll'}", "", "mycoll", "mydb.mycoll", false, OutputType::Replace); // 'out' is object. _testConfigParseOutputOptions("mydb", "{out: {normal: 'mycoll'}}", @@ -156,7 +161,7 @@ TEST(ConfigOutputOptionsTest, parseOutputOptions) { "mycoll", "mydb.mycoll", false, - mr::OutputType::kReplace); + OutputType::Replace); // 'out.db' overrides dbname parameter _testConfigParseOutputOptions("mydb1", "{out: {replace: 'mycoll', db: 'mydb2'}}", @@ -164,7 +169,7 @@ TEST(ConfigOutputOptionsTest, parseOutputOptions) { "mycoll", "mydb2.mycoll", false, - mr::OutputType::kReplace); + OutputType::Replace); // 'out.nonAtomic' is supported with merge and reduce. _testConfigParseOutputOptions("mydb", "{out: {merge: 'mycoll', nonAtomic: true}}", @@ -172,14 +177,14 @@ TEST(ConfigOutputOptionsTest, parseOutputOptions) { "mycoll", "mydb.mycoll", true, - mr::OutputType::kMerge); + OutputType::Merge); _testConfigParseOutputOptions("mydb", "{out: {reduce: 'mycoll', nonAtomic: true}}", "", "mycoll", "mydb.mycoll", true, - mr::OutputType::kReduce); + OutputType::Reduce); // inline _testConfigParseOutputOptions("mydb1", "{out: {inline: 'mycoll', db: 'mydb2'}}", @@ -187,7 +192,7 @@ TEST(ConfigOutputOptionsTest, parseOutputOptions) { "", "", false, - mr::OutputType::kInMemory); + OutputType::InMemory); // Order should not matter in fields of 'out' object. _testConfigParseOutputOptions("mydb1", @@ -196,35 +201,35 @@ TEST(ConfigOutputOptionsTest, parseOutputOptions) { "mycoll", "mydb2.mycoll", false, - mr::OutputType::kReplace); + OutputType::Replace); _testConfigParseOutputOptions("mydb1", "{out: {db: 'mydb2', replace: 'mycoll'}}", "mydb2", "mycoll", "mydb2.mycoll", false, - mr::OutputType::kReplace); + OutputType::Replace); _testConfigParseOutputOptions("mydb1", "{out: {nonAtomic: true, merge: 'mycoll'}}", "", "mycoll", "mydb1.mycoll", true, - mr::OutputType::kMerge); + OutputType::Merge); _testConfigParseOutputOptions("mydb1", "{out: {nonAtomic: true, reduce: 'mycoll'}}", "", "mycoll", "mydb1.mycoll", true, - mr::OutputType::kReduce); + OutputType::Reduce); _testConfigParseOutputOptions("mydb1", "{out: {db: 'mydb2', inline: 'mycoll'}}", "mydb2", "", "", false, - mr::OutputType::kInMemory); + OutputType::InMemory); } TEST(ConfigTest, ParseCollation) { diff --git a/src/mongo/db/pipeline/aggregation_request.cpp b/src/mongo/db/pipeline/aggregation_request.cpp index ec5c6af9a27..626867a499e 100644 --- a/src/mongo/db/pipeline/aggregation_request.cpp +++ b/src/mongo/db/pipeline/aggregation_request.cpp @@ -283,7 +283,6 @@ NamespaceString AggregationRequest::parseNs(const std::string& dbname, const BSO } Document AggregationRequest::serializeToCommandObj() const { - MutableDocument serialized; return Document{ {kCommandName, (_nss.isCollectionlessAggregateNS() ? Value(1) : Value(_nss.coll()))}, {kPipelineName, _pipeline}, diff --git a/src/mongo/db/pipeline/lite_parsed_pipeline.cpp b/src/mongo/db/pipeline/lite_parsed_pipeline.cpp index 0b9909c5ba4..6af52389e6a 100644 --- a/src/mongo/db/pipeline/lite_parsed_pipeline.cpp +++ b/src/mongo/db/pipeline/lite_parsed_pipeline.cpp @@ -73,7 +73,7 @@ void LiteParsedPipeline::assertSupportsMultiDocumentTransaction( } } -bool LiteParsedPipeline::verifyIsSupported( +void LiteParsedPipeline::verifyIsSupported( OperationContext* opCtx, const std::function<bool(OperationContext*, const NamespaceString&)> isSharded, const boost::optional<ExplainOptions::Verbosity> explain, @@ -85,14 +85,11 @@ bool LiteParsedPipeline::verifyIsSupported( // Verify litePipe can be run at the given read concern. assertSupportsReadConcern(opCtx, explain, enableMajorityReadConcern); // Verify that no involved namespace is sharded unless allowed by the pipeline. - auto sharded = false; for (const auto& nss : getInvolvedNamespaces()) { - sharded = isSharded(opCtx, nss); uassert(28769, str::stream() << nss.ns() << " cannot be sharded", - allowShardedForeignCollection(nss) || !sharded); + allowShardedForeignCollection(nss) || !isSharded(opCtx, nss)); } - return sharded; } } // namespace mongo diff --git a/src/mongo/db/pipeline/lite_parsed_pipeline.h b/src/mongo/db/pipeline/lite_parsed_pipeline.h index f5578873621..21e309dbaba 100644 --- a/src/mongo/db/pipeline/lite_parsed_pipeline.h +++ b/src/mongo/db/pipeline/lite_parsed_pipeline.h @@ -140,10 +140,9 @@ public: /** * Perform checks that verify that the LitePipe is valid. Note that this function must be called * before forwarding an aggregation command on an unsharded collection, in order to verify that - * the involved namespaces are allowed to be sharded. Returns true if any involved namespace is - * sharded. + * the involved namespaces are allowed to be sharded. */ - bool verifyIsSupported( + void verifyIsSupported( OperationContext* opCtx, const std::function<bool(OperationContext*, const NamespaceString&)> isSharded, const boost::optional<ExplainOptions::Verbosity> explain, diff --git a/src/mongo/db/pipeline/mongos_process_interface.h b/src/mongo/db/pipeline/mongos_process_interface.h index 33e133adff2..fc6457fcc4b 100644 --- a/src/mongo/db/pipeline/mongos_process_interface.h +++ b/src/mongo/db/pipeline/mongos_process_interface.h @@ -45,34 +45,6 @@ namespace mongo { */ class MongoSInterface : public MongoProcessCommon { public: - static BSONObj createPassthroughCommandForShard(OperationContext* opCtx, - const AggregationRequest& request, - const boost::optional<ShardId>& shardId, - Pipeline* pipeline, - BSONObj collationObj); - - /** - * Appends information to the command sent to the shards which should be appended both if this - * is a passthrough sent to a single shard and if this is a split pipeline. - */ - static BSONObj genericTransformForShards(MutableDocument&& cmdForShards, - OperationContext* opCtx, - const boost::optional<ShardId>& shardId, - const AggregationRequest& request, - BSONObj collationObj); - - static BSONObj createCommandForTargetedShards( - OperationContext* opCtx, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe, - const cluster_aggregation_planner::SplitPipeline& splitPipeline, - const BSONObj collationObj, - const boost::optional<cluster_aggregation_planner::ShardedExchangePolicy> exchangeSpec, - bool needsMerge); - - static StatusWith<CachedCollectionRoutingInfo> getExecutionNsRoutingInfo( - OperationContext* opCtx, const NamespaceString& execNss); - MongoSInterface() = default; virtual ~MongoSInterface() = default; diff --git a/src/mongo/db/pipeline/sharded_agg_helpers.cpp b/src/mongo/db/pipeline/sharded_agg_helpers.cpp index 3ef81811d71..a469b6d9807 100644 --- a/src/mongo/db/pipeline/sharded_agg_helpers.cpp +++ b/src/mongo/db/pipeline/sharded_agg_helpers.cpp @@ -32,20 +32,56 @@ #include "sharded_agg_helpers.h" +#include "mongo/bson/util/bson_extract.h" +#include "mongo/client/connpool.h" +#include "mongo/db/auth/authorization_session.h" #include "mongo/db/curop.h" #include "mongo/db/pipeline/document_source.h" #include "mongo/db/pipeline/document_source_out.h" +#include "mongo/db/query/find_common.h" +#include "mongo/rpc/get_status_from_command_result.h" +#include "mongo/rpc/op_msg_rpc_impls.h" #include "mongo/s/catalog_cache.h" #include "mongo/s/cluster_commands_helpers.h" +#include "mongo/s/multi_statement_transaction_requests_sender.h" +#include "mongo/s/query/cluster_aggregation_planner.h" +#include "mongo/s/query/cluster_cursor_manager.h" #include "mongo/s/query/cluster_query_knobs_gen.h" #include "mongo/s/query/document_source_merge_cursors.h" +#include "mongo/s/query/store_possible_cursor.h" +#include "mongo/s/transaction_router.h" #include "mongo/util/fail_point.h" #include "mongo/util/log.h" -namespace mongo { -namespace sharded_agg_helpers { +namespace mongo::sharded_agg_helpers { -MONGO_FAIL_POINT_DEFINE(clusterAggregateHangBeforeEstablishingShardCursors); +using SplitPipeline = cluster_aggregation_planner::SplitPipeline; + +MONGO_FAIL_POINT_DEFINE(shardedAggregateHangBeforeEstablishingShardCursors); +MONGO_FAIL_POINT_DEFINE(shardedAggregateFailToEstablishMergingShardCursor); +MONGO_FAIL_POINT_DEFINE(shardedAggregateFailToDispatchExchangeConsumerPipeline); + +namespace { + +bool mustRunOnAllShards(const NamespaceString& nss, bool hasChangeStream) { + // The following aggregations must be routed to all shards: + // - Any collectionless aggregation, such as non-localOps $currentOp. + // - Any aggregation which begins with a $changeStream stage. + return nss.isCollectionlessAggregateNS() || hasChangeStream; +} + +Status appendCursorResponseToCommandResult(const ShardId& shardId, + const BSONObj cursorResponse, + BSONObjBuilder* result) { + // If a write error was encountered, append it to the output buffer first. + if (auto wcErrorElem = cursorResponse["writeConcernError"]) { + appendWriteConcernErrorToCmdResponse(shardId, wcErrorElem, *result); + } + + // Pass the results from the remote shard into our command response. + result->appendElementsUnique(CommandHelpers::filterCommandReplyForPassthrough(cursorResponse)); + return getStatusFromCommandResult(result->asTempObj()); +} /** * Given a document representing an aggregation command such as @@ -73,24 +109,18 @@ Document wrapAggAsExplain(Document aggregateCommand, ExplainOptions::Verbosity v return explainCommandBuilder.freeze(); } -BSONObj createPassthroughCommandForShard(OperationContext* opCtx, - const AggregationRequest& request, - const boost::optional<RuntimeConstants>& constants, - Pipeline* pipeline, - BSONObj collationObj) { - // Create the command for the shards. - MutableDocument targetedCmd(request.serializeToCommandObj()); - if (pipeline) { - targetedCmd[AggregationRequest::kPipelineName] = Value(pipeline->serialize()); +Shard::RetryPolicy getDesiredRetryPolicy(OperationContext* opCtx) { + // The idempotent retry policy will retry even for writeConcern failures, so only set it if the + // pipeline does not support writeConcern. + if (!opCtx->getWriteConcern().usedDefault) { + return Shard::RetryPolicy::kNotIdempotent; } - - return genericTransformForShards( - std::move(targetedCmd), opCtx, request, constants, collationObj); + return Shard::RetryPolicy::kIdempotent; } BSONObj genericTransformForShards(MutableDocument&& cmdForShards, OperationContext* opCtx, - const AggregationRequest& request, + boost::optional<ExplainOptions::Verbosity> explainVerbosity, const boost::optional<RuntimeConstants>& constants, BSONObj collationObj) { if (constants) { @@ -100,7 +130,7 @@ BSONObj genericTransformForShards(MutableDocument&& cmdForShards, cmdForShards[AggregationRequest::kFromMongosName] = Value(true); // If this is a request for an aggregation explain, then we must wrap the aggregate inside an // explain command. - if (auto explainVerbosity = request.getExplain()) { + if (explainVerbosity) { cmdForShards.reset(wrapAggAsExplain(cmdForShards.freeze(), *explainVerbosity)); } @@ -120,6 +150,93 @@ BSONObj genericTransformForShards(MutableDocument&& cmdForShards, return appendAllowImplicitCreate(cmdForShards.freeze().toBson(), false); } +std::vector<RemoteCursor> establishShardCursors( + OperationContext* opCtx, + const NamespaceString& nss, + bool hasChangeStream, + boost::optional<CachedCollectionRoutingInfo>& routingInfo, + const std::set<ShardId>& shardIds, + const BSONObj& cmdObj, + const ReadPreferenceSetting& readPref) { + LOG(1) << "Dispatching command " << redact(cmdObj) << " to establish cursors on shards"; + + const bool mustRunOnAll = mustRunOnAllShards(nss, hasChangeStream); + std::vector<std::pair<ShardId, BSONObj>> requests; + + // If we don't need to run on all shards, then we should always have a valid routing table. + invariant(routingInfo || mustRunOnAll); + + if (mustRunOnAll) { + // The pipeline contains a stage which must be run on all shards. Skip versioning and + // enqueue the raw command objects. + for (const auto& shardId : shardIds) { + requests.emplace_back(shardId, cmdObj); + } + } else if (routingInfo->cm()) { + // The collection is sharded. Use the routing table to decide which shards to target + // based on the query and collation, and build versioned requests for them. + for (const auto& shardId : shardIds) { + auto versionedCmdObj = + appendShardVersion(cmdObj, routingInfo->cm()->getVersion(shardId)); + requests.emplace_back(shardId, std::move(versionedCmdObj)); + } + } else { + // The collection is unsharded. Target only the primary shard for the database. + // Don't append shard version info when contacting the config servers. + const auto cmdObjWithShardVersion = !routingInfo->db().primary()->isConfig() + ? appendShardVersion(cmdObj, ChunkVersion::UNSHARDED()) + : cmdObj; + requests.emplace_back(routingInfo->db().primaryId(), + appendDbVersionIfPresent(cmdObjWithShardVersion, routingInfo->db())); + } + + if (MONGO_unlikely(shardedAggregateHangBeforeEstablishingShardCursors.shouldFail())) { + log() << "shardedAggregateHangBeforeEstablishingShardCursors fail point enabled. Blocking " + "until fail point is disabled."; + while (MONGO_unlikely(shardedAggregateHangBeforeEstablishingShardCursors.shouldFail())) { + sleepsecs(1); + } + } + + return establishCursors(opCtx, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + nss, + readPref, + requests, + false /* do not allow partial results */, + getDesiredRetryPolicy(opCtx)); +} + +std::set<ShardId> getTargetedShards(OperationContext* opCtx, + bool mustRunOnAllShards, + const boost::optional<CachedCollectionRoutingInfo>& routingInfo, + const BSONObj shardQuery, + const BSONObj collation) { + if (mustRunOnAllShards) { + // The pipeline begins with a stage which must be run on all shards. + std::vector<ShardId> shardIds; + Grid::get(opCtx)->shardRegistry()->getAllShardIds(opCtx, &shardIds); + return {std::make_move_iterator(shardIds.begin()), std::make_move_iterator(shardIds.end())}; + } + + // If we don't need to run on all shards, then we should always have a valid routing table. + invariant(routingInfo); + + return getTargetedShardsForQuery(opCtx, *routingInfo, shardQuery, collation); +} + +ShardId pickMergingShard(OperationContext* opCtx, + bool needsPrimaryShardMerge, + const std::vector<ShardId>& targetedShards, + ShardId primaryShard) { + auto& prng = opCtx->getClient()->getPrng(); + // If we cannot merge on mongoS, establish the merge cursor on a shard. Perform the merging + // command on random shard, unless the pipeline dictates that it needs to be run on the primary + // shard for the database. + return needsPrimaryShardMerge ? primaryShard + : targetedShards[prng.nextInt32(targetedShards.size())]; +} + StatusWith<CachedCollectionRoutingInfo> getExecutionNsRoutingInfo(OperationContext* opCtx, const NamespaceString& execNss) { // First, verify that there are shards present in the cluster. If not, then we return the @@ -139,32 +256,80 @@ StatusWith<CachedCollectionRoutingInfo> getExecutionNsRoutingInfo(OperationConte return getCollectionRoutingInfoForTxnCmd(opCtx, execNss); } -Shard::RetryPolicy getDesiredRetryPolicy(const AggregationRequest& req) { - // The idempotent retry policy will retry even for writeConcern failures, so only set it if the - // pipeline does not support writeConcern. - if (req.getWriteConcern()) { - return Shard::RetryPolicy::kNotIdempotent; +Status appendExplainResults(sharded_agg_helpers::DispatchShardPipelineResults&& dispatchResults, + const boost::intrusive_ptr<ExpressionContext>& mergeCtx, + BSONObjBuilder* result) { + if (dispatchResults.splitPipeline) { + auto* mergePipeline = dispatchResults.splitPipeline->mergePipeline.get(); + const char* mergeType = [&]() { + if (mergePipeline->canRunOnMongos()) { + return "mongos"; + } else if (dispatchResults.exchangeSpec) { + return "exchange"; + } else if (mergePipeline->needsPrimaryShardMerger()) { + return "primaryShard"; + } else { + return "anyShard"; + } + }(); + + *result << "mergeType" << mergeType; + + MutableDocument pipelinesDoc; + pipelinesDoc.addField("shardsPart", + Value(dispatchResults.splitPipeline->shardsPipeline->writeExplainOps( + *mergeCtx->explain))); + if (dispatchResults.exchangeSpec) { + BSONObjBuilder bob; + dispatchResults.exchangeSpec->exchangeSpec.serialize(&bob); + bob.append("consumerShards", dispatchResults.exchangeSpec->consumerShards); + pipelinesDoc.addField("exchange", Value(bob.obj())); + } + pipelinesDoc.addField("mergerPart", + Value(mergePipeline->writeExplainOps(*mergeCtx->explain))); + + *result << "splitPipeline" << pipelinesDoc.freeze(); + } else { + *result << "splitPipeline" << BSONNULL; } - return Shard::RetryPolicy::kIdempotent; -} -bool mustRunOnAllShards(const NamespaceString& nss, const LiteParsedPipeline& litePipe) { - // The following aggregations must be routed to all shards: - // - Any collectionless aggregation, such as non-localOps $currentOp. - // - Any aggregation which begins with a $changeStream stage. - return nss.isCollectionlessAggregateNS() || litePipe.hasChangeStream(); + BSONObjBuilder shardExplains(result->subobjStart("shards")); + for (const auto& shardResult : dispatchResults.remoteExplainOutput) { + invariant(shardResult.shardHostAndPort); + + uassertStatusOK(shardResult.swResponse.getStatus()); + uassertStatusOK(getStatusFromCommandResult(shardResult.swResponse.getValue().data)); + + auto shardId = shardResult.shardId.toString(); + const auto& data = shardResult.swResponse.getValue().data; + BSONObjBuilder explain(shardExplains.subobjStart(shardId)); + explain << "host" << shardResult.shardHostAndPort->toString(); + if (auto stagesElement = data["stages"]) { + explain << "stages" << stagesElement; + } else { + auto queryPlannerElement = data["queryPlanner"]; + uassert(51157, + str::stream() << "Malformed explain response received from shard " << shardId + << ": " << data.toString(), + queryPlannerElement); + explain << "queryPlanner" << queryPlannerElement; + if (auto executionStatsElement = data["executionStats"]) { + explain << "executionStats" << executionStatsElement; + } + } + } + + return Status::OK(); } + BSONObj createCommandForTargetedShards( - OperationContext* opCtx, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe, + const boost::intrusive_ptr<ExpressionContext>& expCtx, + Document serializedCommand, const cluster_aggregation_planner::SplitPipeline& splitPipeline, - const BSONObj collationObj, const boost::optional<cluster_aggregation_planner::ShardedExchangePolicy> exchangeSpec, - const boost::optional<RuntimeConstants>& constants, bool needsMerge) { // Create the command for the shards. - MutableDocument targetedCmd(request.serializeToCommandObj()); + MutableDocument targetedCmd(serializedCommand); // If we've parsed a pipeline on mongos, always override the pipeline, in case parsing it // has defaulted any arguments or otherwise changed the spec. For example, $listSessions may // have detected a logged in user and appended that user name to the $listSessions spec to @@ -193,8 +358,143 @@ BSONObj createCommandForTargetedShards( targetedCmd[AggregationRequest::kExchangeName] = exchangeSpec ? Value(exchangeSpec->exchangeSpec.toBSON()) : Value(); + return genericTransformForShards(std::move(targetedCmd), + expCtx->opCtx, + expCtx->explain, + expCtx->getRuntimeConstants(), + expCtx->collation); +} + +sharded_agg_helpers::DispatchShardPipelineResults dispatchExchangeConsumerPipeline( + const boost::intrusive_ptr<ExpressionContext>& expCtx, + const NamespaceString& executionNss, + Document serializedCommand, + sharded_agg_helpers::DispatchShardPipelineResults* shardDispatchResults) { + auto opCtx = expCtx->opCtx; + + if (MONGO_unlikely(shardedAggregateFailToDispatchExchangeConsumerPipeline.shouldFail())) { + log() << "shardedAggregateFailToDispatchExchangeConsumerPipeline fail point enabled."; + uasserted(ErrorCodes::FailPointEnabled, + "Asserting on exhange consumer pipeline dispatch due to failpoint."); + } + + // For all consumers construct a request with appropriate cursor ids and send to shards. + std::vector<std::pair<ShardId, BSONObj>> requests; + auto numConsumers = shardDispatchResults->exchangeSpec->consumerShards.size(); + std::vector<SplitPipeline> consumerPipelines; + for (size_t idx = 0; idx < numConsumers; ++idx) { + // Pick this consumer's cursors from producers. + std::vector<OwnedRemoteCursor> producers; + for (size_t p = 0; p < shardDispatchResults->numProducers; ++p) { + producers.emplace_back( + std::move(shardDispatchResults->remoteCursors[p * numConsumers + idx])); + } + + // Create a pipeline for a consumer and add the merging stage. + auto consumerPipeline = uassertStatusOK(Pipeline::create( + shardDispatchResults->splitPipeline->mergePipeline->getSources(), expCtx)); + + cluster_aggregation_planner::addMergeCursorsSource( + consumerPipeline.get(), + BSONObj(), + std::move(producers), + {}, + shardDispatchResults->splitPipeline->shardCursorsSortSpec, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + false); + + consumerPipelines.emplace_back(std::move(consumerPipeline), nullptr, boost::none); + + auto consumerCmdObj = createCommandForTargetedShards( + expCtx, serializedCommand, consumerPipelines.back(), boost::none, false); + + requests.emplace_back(shardDispatchResults->exchangeSpec->consumerShards[idx], + consumerCmdObj); + } + auto cursors = establishCursors(opCtx, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + executionNss, + ReadPreferenceSetting::get(opCtx), + requests, + false /* do not allow partial results */); + + // Convert remote cursors into a vector of "owned" cursors. + std::vector<OwnedRemoteCursor> ownedCursors; + for (auto&& cursor : cursors) { + ownedCursors.emplace_back(OwnedRemoteCursor(opCtx, std::move(cursor), executionNss)); + } + + // The merging pipeline is just a union of the results from each of the shards involved on the + // consumer side of the exchange. + auto mergePipeline = uassertStatusOK(Pipeline::create({}, expCtx)); + mergePipeline->setSplitState(Pipeline::SplitState::kSplitForMerge); + + SplitPipeline splitPipeline{nullptr, std::move(mergePipeline), boost::none}; + + // Relinquish ownership of the local consumer pipelines' cursors as each shard is now + // responsible for its own producer cursors. + for (const auto& pipeline : consumerPipelines) { + const auto& mergeCursors = + static_cast<DocumentSourceMergeCursors*>(pipeline.shardsPipeline->peekFront()); + mergeCursors->dismissCursorOwnership(); + } + return sharded_agg_helpers::DispatchShardPipelineResults{false, + std::move(ownedCursors), + {} /*TODO SERVER-36279*/, + std::move(splitPipeline), + nullptr, + BSONObj(), + numConsumers}; +} + +BSONObj createCommandForMergingShard(Document serializedCommand, + const boost::intrusive_ptr<ExpressionContext>& mergeCtx, + const ShardId& shardId, + bool mergingShardContributesData, + const Pipeline* pipelineForMerging) { + MutableDocument mergeCmd(serializedCommand); + + mergeCmd["pipeline"] = Value(pipelineForMerging->serialize()); + mergeCmd[AggregationRequest::kFromMongosName] = Value(true); + + mergeCmd[AggregationRequest::kRuntimeConstants] = + Value(mergeCtx->getRuntimeConstants().toBSON()); + + // If the user didn't specify a collation already, make sure there's a collation attached to + // the merge command, since the merging shard may not have the collection metadata. + if (mergeCmd.peek()["collation"].missing()) { + mergeCmd["collation"] = mergeCtx->getCollator() + ? Value(mergeCtx->getCollator()->getSpec().toBSON()) + : Value(Document{CollationSpec::kSimpleSpec}); + } + + const auto txnRouter = TransactionRouter::get(mergeCtx->opCtx); + if (txnRouter && mergingShardContributesData) { + // Don't include a readConcern since we can only include read concerns on the _first_ + // command sent to a participant per transaction. Assuming the merging shard is a + // participant, it will already have received another 'aggregate' command earlier which + // contained a readConcern. + mergeCmd.remove("readConcern"); + } + + return appendAllowImplicitCreate(mergeCmd.freeze().toBson(), false); +} + +BSONObj createPassthroughCommandForShard( + OperationContext* opCtx, + Document serializedCommand, + boost::optional<ExplainOptions::Verbosity> explainVerbosity, + const boost::optional<RuntimeConstants>& constants, + Pipeline* pipeline, + BSONObj collationObj) { + // Create the command for the shards. + MutableDocument targetedCmd(serializedCommand); + if (pipeline) { + targetedCmd[AggregationRequest::kPipelineName] = Value(pipeline->serialize()); + } + return genericTransformForShards( - std::move(targetedCmd), opCtx, request, constants, collationObj); + std::move(targetedCmd), opCtx, explainVerbosity, constants, collationObj); } /** @@ -203,12 +503,11 @@ BSONObj createCommandForTargetedShards( * shard version is encountered, refreshes the routing table and tries again. */ DispatchShardPipelineResults dispatchShardPipeline( - const boost::intrusive_ptr<ExpressionContext>& expCtx, - const NamespaceString& executionNss, - const AggregationRequest& aggRequest, - const LiteParsedPipeline& litePipe, - std::unique_ptr<Pipeline, PipelineDeleter> pipeline, - BSONObj collationObj) { + Document serializedCommand, + bool hasChangeStream, + std::unique_ptr<Pipeline, PipelineDeleter> pipeline) { + auto expCtx = pipeline->getContext(); + // The process is as follows: // - First, determine whether we need to target more than one shard. If so, we split the // pipeline; if not, we retain the existing pipeline. @@ -226,12 +525,11 @@ DispatchShardPipelineResults dispatchShardPipeline( const auto shardQuery = pipeline->getInitialQuery(); - auto executionNsRoutingInfoStatus = getExecutionNsRoutingInfo(opCtx, executionNss); + auto executionNsRoutingInfoStatus = getExecutionNsRoutingInfo(opCtx, expCtx->ns); // If this is a $changeStream, we swallow NamespaceNotFound exceptions and continue. // Otherwise, uassert on all exceptions here. - if (!(litePipe.hasChangeStream() && - executionNsRoutingInfoStatus == ErrorCodes::NamespaceNotFound)) { + if (!(hasChangeStream && executionNsRoutingInfoStatus == ErrorCodes::NamespaceNotFound)) { uassertStatusOK(executionNsRoutingInfoStatus); } @@ -240,9 +538,9 @@ DispatchShardPipelineResults dispatchShardPipeline( : boost::optional<CachedCollectionRoutingInfo>{}; // Determine whether we can run the entire aggregation on a single shard. - const bool mustRunOnAll = mustRunOnAllShards(executionNss, litePipe); + const bool mustRunOnAll = mustRunOnAllShards(expCtx->ns, hasChangeStream); std::set<ShardId> shardIds = getTargetedShards( - opCtx, mustRunOnAll, executionNsRoutingInfo, shardQuery, aggRequest.getCollation()); + opCtx, mustRunOnAll, executionNsRoutingInfo, shardQuery, expCtx->collation); // Don't need to split the pipeline if we are only targeting a single shard, unless: // - There is a stage that needs to be run on the primary shard and the single target shard @@ -268,16 +566,14 @@ DispatchShardPipelineResults dispatchShardPipeline( // Generate the command object for the targeted shards. BSONObj targetedCommand = splitPipeline - ? createCommandForTargetedShards(opCtx, - aggRequest, - litePipe, - *splitPipeline, - collationObj, - exchangeSpec, - expCtx->getRuntimeConstants(), - true) - : createPassthroughCommandForShard( - opCtx, aggRequest, expCtx->getRuntimeConstants(), pipeline.get(), collationObj); + ? createCommandForTargetedShards( + expCtx, serializedCommand, *splitPipeline, exchangeSpec, true) + : createPassthroughCommandForShard(expCtx->opCtx, + serializedCommand, + expCtx->explain, + expCtx->getRuntimeConstants(), + pipeline.get(), + expCtx->collation); // In order for a $changeStream to work reliably, we need the shard registry to be at least as // current as the logical time at which the pipeline was serialized to 'targetedCommand' above. @@ -286,14 +582,14 @@ DispatchShardPipelineResults dispatchShardPipeline( // may not have been forced to split if there was only one shard in the cluster when the command // began execution. If a shard was added since the earlier targeting logic ran, then refreshing // here may cause us to illegally target an unsplit pipeline to more than one shard. - if (litePipe.hasChangeStream()) { + if (hasChangeStream) { auto* shardRegistry = Grid::get(opCtx)->shardRegistry(); if (!shardRegistry->reload(opCtx)) { shardRegistry->reload(opCtx); } // Rebuild the set of shards as the shard registry might have changed. shardIds = getTargetedShards( - opCtx, mustRunOnAll, executionNsRoutingInfo, shardQuery, aggRequest.getCollation()); + opCtx, mustRunOnAll, executionNsRoutingInfo, shardQuery, expCtx->collation); } // If there were no shards when we began execution, we wouldn't have run this aggregation in the @@ -310,7 +606,7 @@ DispatchShardPipelineResults dispatchShardPipeline( // should not participate in the shard version protocol. shardResults = scatterGatherUnversionedTargetAllShards(opCtx, - executionNss.db(), + expCtx->ns.db(), targetedCommand, ReadPreferenceSetting::get(opCtx), Shard::RetryPolicy::kIdempotent); @@ -320,23 +616,22 @@ DispatchShardPipelineResults dispatchShardPipeline( invariant(executionNsRoutingInfo); shardResults = scatterGatherVersionedTargetByRoutingTable(opCtx, - executionNss.db(), - executionNss, + expCtx->ns.db(), + expCtx->ns, *executionNsRoutingInfo, targetedCommand, ReadPreferenceSetting::get(opCtx), Shard::RetryPolicy::kIdempotent, shardQuery, - aggRequest.getCollation()); + expCtx->collation); } } else { cursors = establishShardCursors(opCtx, - executionNss, - litePipe, + expCtx->ns, + hasChangeStream, executionNsRoutingInfo, shardIds, targetedCommand, - aggRequest, ReadPreferenceSetting::get(opCtx)); invariant(cursors.size() % shardIds.size() == 0, str::stream() << "Number of cursors (" << cursors.size() @@ -346,7 +641,7 @@ DispatchShardPipelineResults dispatchShardPipeline( // Convert remote cursors into a vector of "owned" cursors. std::vector<OwnedRemoteCursor> ownedCursors; for (auto&& cursor : cursors) { - ownedCursors.emplace_back(OwnedRemoteCursor(opCtx, std::move(cursor), executionNss)); + ownedCursors.emplace_back(OwnedRemoteCursor(opCtx, std::move(cursor), expCtx->ns)); } // Record the number of shards involved in the aggregation. If we are required to merge on @@ -366,82 +661,269 @@ DispatchShardPipelineResults dispatchShardPipeline( exchangeSpec}; } -std::set<ShardId> getTargetedShards(OperationContext* opCtx, - bool mustRunOnAllShards, - const boost::optional<CachedCollectionRoutingInfo>& routingInfo, - const BSONObj shardQuery, - const BSONObj collation) { - if (mustRunOnAllShards) { - // The pipeline begins with a stage which must be run on all shards. - std::vector<ShardId> shardIds; - Grid::get(opCtx)->shardRegistry()->getAllShardIds(opCtx, &shardIds); - return {shardIds.begin(), shardIds.end()}; +AsyncRequestsSender::Response establishMergingShardCursor(OperationContext* opCtx, + const NamespaceString& nss, + const BSONObj mergeCmdObj, + const ShardId& mergingShardId) { + if (MONGO_unlikely(shardedAggregateFailToEstablishMergingShardCursor.shouldFail())) { + log() << "shardedAggregateFailToEstablishMergingShardCursor fail point enabled."; + uasserted(ErrorCodes::FailPointEnabled, + "Asserting on establishing merging shard cursor due to failpoint."); } - // If we don't need to run on all shards, then we should always have a valid routing table. + MultiStatementTransactionRequestsSender ars( + opCtx, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + nss.db().toString(), + {{mergingShardId, mergeCmdObj}}, + ReadPreferenceSetting::get(opCtx), + getDesiredRetryPolicy(opCtx)); + const auto response = ars.next(); + invariant(ars.done()); + return response; +} + +Status dispatchMergingPipeline( + const boost::intrusive_ptr<ExpressionContext>& expCtx, + const ClusterAggregate::Namespaces& namespaces, + Document serializedCommand, + long long batchSize, + const boost::optional<CachedCollectionRoutingInfo>& routingInfo, + sharded_agg_helpers::DispatchShardPipelineResults&& shardDispatchResults, + BSONObjBuilder* result, + const PrivilegeVector& privileges, + bool hasChangeStream) { + // We should never be in a situation where we call this function on a non-merge pipeline. + invariant(shardDispatchResults.splitPipeline); + auto* mergePipeline = shardDispatchResults.splitPipeline->mergePipeline.get(); + invariant(mergePipeline); + auto* opCtx = expCtx->opCtx; + + std::vector<ShardId> targetedShards; + targetedShards.reserve(shardDispatchResults.remoteCursors.size()); + for (auto&& remoteCursor : shardDispatchResults.remoteCursors) { + targetedShards.emplace_back(remoteCursor->getShardId().toString()); + } + + cluster_aggregation_planner::addMergeCursorsSource( + mergePipeline, + shardDispatchResults.commandForTargetedShards, + std::move(shardDispatchResults.remoteCursors), + targetedShards, + shardDispatchResults.splitPipeline->shardCursorsSortSpec, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + hasChangeStream); + + // First, check whether we can merge on the mongoS. If the merge pipeline MUST run on mongoS, + // then ignore the internalQueryProhibitMergingOnMongoS parameter. + if (mergePipeline->requiredToRunOnMongos() || + (!internalQueryProhibitMergingOnMongoS.load() && mergePipeline->canRunOnMongos())) { + return runPipelineOnMongoS(namespaces, + batchSize, + std::move(shardDispatchResults.splitPipeline->mergePipeline), + result, + privileges); + } + + // If we are not merging on mongoS, then this is not a $changeStream aggregation, and we + // therefore must have a valid routing table. invariant(routingInfo); - return getTargetedShardsForQuery(opCtx, *routingInfo, shardQuery, collation); + const ShardId mergingShardId = pickMergingShard(opCtx, + shardDispatchResults.needsPrimaryShardMerge, + targetedShards, + routingInfo->db().primaryId()); + const bool mergingShardContributesData = + std::find(targetedShards.begin(), targetedShards.end(), mergingShardId) != + targetedShards.end(); + + auto mergeCmdObj = createCommandForMergingShard( + serializedCommand, expCtx, mergingShardId, mergingShardContributesData, mergePipeline); + + LOG(1) << "Dispatching merge pipeline " << redact(mergeCmdObj) << " to designated shard"; + + // Dispatch $mergeCursors to the chosen shard, store the resulting cursor, and return. + auto mergeResponse = + establishMergingShardCursor(opCtx, namespaces.executionNss, mergeCmdObj, mergingShardId); + uassertStatusOK(mergeResponse.swResponse); + + auto mergeCursorResponse = uassertStatusOK( + storePossibleCursor(opCtx, + mergingShardId, + *mergeResponse.shardHostAndPort, + mergeResponse.swResponse.getValue().data, + namespaces.requestedNss, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + Grid::get(opCtx)->getCursorManager(), + privileges, + expCtx->tailableMode)); + + // Ownership for the shard cursors has been transferred to the merging shard. Dismiss the + // ownership in the current merging pipeline such that when it goes out of scope it does not + // attempt to kill the cursors. + auto mergeCursors = static_cast<DocumentSourceMergeCursors*>(mergePipeline->peekFront()); + mergeCursors->dismissCursorOwnership(); + + return appendCursorResponseToCommandResult(mergingShardId, mergeCursorResponse, result); } -std::vector<RemoteCursor> establishShardCursors( - OperationContext* opCtx, - const NamespaceString& nss, - const LiteParsedPipeline& litePipe, - boost::optional<CachedCollectionRoutingInfo>& routingInfo, - const std::set<ShardId>& shardIds, - const BSONObj& cmdObj, - const AggregationRequest& request, - const ReadPreferenceSetting& readPref) { - LOG(1) << "Dispatching command " << redact(cmdObj) << " to establish cursors on shards"; +BSONObj establishMergingMongosCursor(OperationContext* opCtx, + long long batchSize, + const NamespaceString& requestedNss, + std::unique_ptr<Pipeline, PipelineDeleter> pipelineForMerging, + const PrivilegeVector& privileges) { - const bool mustRunOnAll = mustRunOnAllShards(nss, litePipe); - std::vector<std::pair<ShardId, BSONObj>> requests; + ClusterClientCursorParams params(requestedNss, ReadPreferenceSetting::get(opCtx)); - // If we don't need to run on all shards, then we should always have a valid routing table. - invariant(routingInfo || mustRunOnAll); + params.originatingCommandObj = CurOp::get(opCtx)->opDescription().getOwned(); + params.tailableMode = pipelineForMerging->getContext()->tailableMode; + // A batch size of 0 is legal for the initial aggregate, but not valid for getMores, the batch + // size we pass here is used for getMores, so do not specify a batch size if the initial request + // had a batch size of 0. + params.batchSize = batchSize == 0 ? boost::none : boost::make_optional(batchSize); + params.lsid = opCtx->getLogicalSessionId(); + params.txnNumber = opCtx->getTxnNumber(); + params.originatingPrivileges = privileges; - if (mustRunOnAll) { - // The pipeline contains a stage which must be run on all shards. Skip versioning and - // enqueue the raw command objects. - for (auto&& shardId : shardIds) { - requests.emplace_back(std::move(shardId), cmdObj); + if (TransactionRouter::get(opCtx)) { + params.isAutoCommit = false; + } + + auto ccc = cluster_aggregation_planner::buildClusterCursor( + opCtx, std::move(pipelineForMerging), std::move(params)); + + auto cursorState = ClusterCursorManager::CursorState::NotExhausted; + + rpc::OpMsgReplyBuilder replyBuilder; + CursorResponseBuilder::Options options; + options.isInitialResponse = true; + + CursorResponseBuilder responseBuilder(&replyBuilder, options); + bool stashedResult = false; + + for (long long objCount = 0; objCount < batchSize; ++objCount) { + ClusterQueryResult next; + try { + next = uassertStatusOK(ccc->next(RouterExecStage::ExecContext::kInitialFind)); + } catch (const ExceptionFor<ErrorCodes::CloseChangeStream>&) { + // This exception is thrown when a $changeStream stage encounters an event + // that invalidates the cursor. We should close the cursor and return without + // error. + cursorState = ClusterCursorManager::CursorState::Exhausted; + break; } - } else if (routingInfo->cm()) { - // The collection is sharded. Use the routing table to decide which shards to target - // based on the query and collation, and build versioned requests for them. - for (auto& shardId : shardIds) { - auto versionedCmdObj = - appendShardVersion(cmdObj, routingInfo->cm()->getVersion(shardId)); - requests.emplace_back(std::move(shardId), std::move(versionedCmdObj)); + + // Check whether we have exhausted the pipeline's results. + if (next.isEOF()) { + // We reached end-of-stream. If the cursor is not tailable, then we mark it as + // exhausted. If it is tailable, usually we keep it open (i.e. "NotExhausted") even when + // we reach end-of-stream. However, if all the remote cursors are exhausted, there is no + // hope of returning data and thus we need to close the mongos cursor as well. + if (!ccc->isTailable() || ccc->remotesExhausted()) { + cursorState = ClusterCursorManager::CursorState::Exhausted; + } + break; } - } else { - // The collection is unsharded. Target only the primary shard for the database. - // Don't append shard version info when contacting the config servers. - const auto cmdObjWithShardVersion = !routingInfo->db().primary()->isConfig() - ? appendShardVersion(cmdObj, ChunkVersion::UNSHARDED()) - : cmdObj; - requests.emplace_back(routingInfo->db().primaryId(), - appendDbVersionIfPresent(cmdObjWithShardVersion, routingInfo->db())); - } - if (MONGO_unlikely(clusterAggregateHangBeforeEstablishingShardCursors.shouldFail())) { - log() << "clusterAggregateHangBeforeEstablishingShardCursors fail point enabled. Blocking " - "until fail point is disabled."; - while (MONGO_unlikely(clusterAggregateHangBeforeEstablishingShardCursors.shouldFail())) { - sleepsecs(1); + // If this result will fit into the current batch, add it. Otherwise, stash it in the cursor + // to be returned on the next getMore. + auto nextObj = *next.getResult(); + + if (!FindCommon::haveSpaceForNext(nextObj, objCount, responseBuilder.bytesUsed())) { + ccc->queueResult(nextObj); + stashedResult = true; + break; } + + // Set the postBatchResumeToken. For non-$changeStream aggregations, this will be empty. + responseBuilder.setPostBatchResumeToken(ccc->getPostBatchResumeToken()); + responseBuilder.append(nextObj); } - return establishCursors(opCtx, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), - nss, - readPref, - requests, - false /* do not allow partial results */, - getDesiredRetryPolicy(request)); + // For empty batches, or in the case where the final result was added to the batch rather than + // being stashed, we update the PBRT here to ensure that it is the most recent available. + if (!stashedResult) { + responseBuilder.setPostBatchResumeToken(ccc->getPostBatchResumeToken()); + } + + ccc->detachFromOperationContext(); + + int nShards = ccc->getNumRemotes(); + CursorId clusterCursorId = 0; + + if (cursorState == ClusterCursorManager::CursorState::NotExhausted) { + auto authUsers = AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(); + clusterCursorId = uassertStatusOK(Grid::get(opCtx)->getCursorManager()->registerCursor( + opCtx, + ccc.releaseCursor(), + requestedNss, + ClusterCursorManager::CursorType::MultiTarget, + ClusterCursorManager::CursorLifetime::Mortal, + authUsers)); + } + + // Fill out the aggregation metrics in CurOp. + if (clusterCursorId > 0) { + CurOp::get(opCtx)->debug().cursorid = clusterCursorId; + } + CurOp::get(opCtx)->debug().nShards = std::max(CurOp::get(opCtx)->debug().nShards, nShards); + CurOp::get(opCtx)->debug().cursorExhausted = (clusterCursorId == 0); + CurOp::get(opCtx)->debug().nreturned = responseBuilder.numDocs(); + + responseBuilder.done(clusterCursorId, requestedNss.ns()); + + auto bodyBuilder = replyBuilder.getBodyBuilder(); + CommandHelpers::appendSimpleCommandStatus(bodyBuilder, true); + bodyBuilder.doneFast(); + + return replyBuilder.releaseBody(); } +/** + * Returns the output of the listCollections command filtered to the namespace 'nss'. + */ +BSONObj getUnshardedCollInfo(const Shard* primaryShard, const NamespaceString& nss) { + ScopedDbConnection conn(primaryShard->getConnString()); + std::list<BSONObj> all = + conn->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll())); + if (all.empty()) { + // Collection does not exist, return an empty object. + return BSONObj(); + } + return all.front(); +} + + +/** + * Returns the collection default collation or the simple collator if there is no default. If the + * collection does not exist, then returns an empty BSON Object. + */ +BSONObj getDefaultCollationForUnshardedCollection(const BSONObj collectionInfo) { + if (collectionInfo.isEmpty()) { + // Collection does not exist, return an empty object. + return BSONObj(); + } + + BSONObj defaultCollation = CollationSpec::kSimpleSpec; + if (collectionInfo["options"].type() == BSONType::Object) { + BSONObj collectionOptions = collectionInfo["options"].Obj(); + BSONElement collationElement; + auto status = bsonExtractTypedField( + collectionOptions, "collation", BSONType::Object, &collationElement); + if (status.isOK()) { + defaultCollation = collationElement.Obj().getOwned(); + uassert(ErrorCodes::BadValue, + "Default collation in collection metadata cannot be empty.", + !defaultCollation.isEmpty()); + } else if (status != ErrorCodes::NoSuchKey) { + uassertStatusOK(status); + } + } + return defaultCollation; +} + +} // namespace + std::unique_ptr<Pipeline, PipelineDeleter> targetShardsAndAddMergeCursors( const boost::intrusive_ptr<ExpressionContext>& expCtx, Pipeline* ownedPipeline) { std::unique_ptr<Pipeline, PipelineDeleter> pipeline(ownedPipeline, @@ -466,8 +948,9 @@ std::unique_ptr<Pipeline, PipelineDeleter> targetShardsAndAddMergeCursors( AggregationRequest aggRequest(expCtx->ns, rawStages); LiteParsedPipeline liteParsedPipeline(aggRequest); + auto hasChangeStream = liteParsedPipeline.hasChangeStream(); auto shardDispatchResults = dispatchShardPipeline( - expCtx, expCtx->ns, aggRequest, liteParsedPipeline, std::move(pipeline), expCtx->collation); + aggRequest.serializeToCommandObj(), hasChangeStream, std::move(pipeline)); std::vector<ShardId> targetedShards; targetedShards.reserve(shardDispatchResults.remoteCursors.size()); @@ -488,14 +971,281 @@ std::unique_ptr<Pipeline, PipelineDeleter> targetShardsAndAddMergeCursors( cluster_aggregation_planner::addMergeCursorsSource( mergePipeline.get(), - liteParsedPipeline, shardDispatchResults.commandForTargetedShards, std::move(shardDispatchResults.remoteCursors), targetedShards, shardCursorsSortSpec, - Grid::get(expCtx->opCtx)->getExecutorPool()->getArbitraryExecutor()); + Grid::get(expCtx->opCtx)->getExecutorPool()->getArbitraryExecutor(), + hasChangeStream); return mergePipeline; } -} // namespace sharded_agg_helpers -} // namespace mongo + +StatusWith<AggregationTargeter> AggregationTargeter::make( + OperationContext* opCtx, + const NamespaceString& executionNss, + const std::function<std::unique_ptr<Pipeline, PipelineDeleter>( + boost::optional<CachedCollectionRoutingInfo>)> buildPipelineFn, + stdx::unordered_set<NamespaceString> involvedNamespaces, + bool hasChangeStream, + bool allowedToPassthrough) { + + // Check if any of the involved collections are sharded. + bool involvesShardedCollections = [&]() { + for (const auto& nss : involvedNamespaces) { + const auto resolvedNsRoutingInfo = + uassertStatusOK(getCollectionRoutingInfoForTxnCmd(opCtx, nss)); + if (resolvedNsRoutingInfo.cm()) { + return true; + } + } + return false; + }(); + + // Determine whether this aggregation must be dispatched to all shards in the cluster. + const bool mustRunOnAll = mustRunOnAllShards(executionNss, hasChangeStream); + + // If the routing table is valid, we obtain a reference to it. If the table is not valid, then + // either the database does not exist, or there are no shards in the cluster. In the latter + // case, we always return an empty cursor. In the former case, if the requested aggregation is a + // $changeStream, we allow the operation to continue so that stream cursors can be established + // on the given namespace before the database or collection is actually created. If the database + // does not exist and this is not a $changeStream, then we return an empty cursor. + boost::optional<CachedCollectionRoutingInfo> routingInfo; + auto executionNsRoutingInfoStatus = getExecutionNsRoutingInfo(opCtx, executionNss); + if (executionNsRoutingInfoStatus.isOK()) { + routingInfo = std::move(executionNsRoutingInfoStatus.getValue()); + } else if (!(hasChangeStream && + executionNsRoutingInfoStatus == ErrorCodes::NamespaceNotFound)) { + return executionNsRoutingInfoStatus.getStatus(); + } + + // If we don't have a routing table, then this is a $changeStream which must run on all shards. + invariant(routingInfo || (mustRunOnAll && hasChangeStream)); + + // A pipeline is allowed to passthrough to the primary shard iff the following conditions are + // met: + // + // 1. The namespace of the aggregate and any other involved namespaces are unsharded. + // 2. Is allowed to be forwarded to shards. For example, $currentOp with localOps: true should + // run locally on mongos and cannot be forwarded to a shard. + // 3. Does not need to run on all shards. For example, a pipeline with a $changeStream or + // $currentOp. + // 4. Doesn't need transformation via DocumentSource::serialize(). For example, list sessions + // needs to include information about users that can only be deduced on mongos. + if (routingInfo && !routingInfo->cm() && !mustRunOnAll && allowedToPassthrough && + !involvesShardedCollections) { + return AggregationTargeter{TargetingPolicy::kPassthrough, nullptr, routingInfo}; + } else { + auto pipeline = buildPipelineFn(routingInfo); + auto policy = pipeline->requiredToRunOnMongos() ? TargetingPolicy::kMongosRequired + : TargetingPolicy::kAnyShard; + return AggregationTargeter{policy, std::move(pipeline), routingInfo}; + } +} + +Status runPipelineOnPrimaryShard(OperationContext* opCtx, + const ClusterAggregate::Namespaces& namespaces, + const CachedDatabaseInfo& dbInfo, + boost::optional<ExplainOptions::Verbosity> explain, + Document serializedCommand, + const PrivilegeVector& privileges, + BSONObjBuilder* out) { + // Format the command for the shard. This adds the 'fromMongos' field, wraps the command as an + // explain if necessary, and rewrites the result into a format safe to forward to shards. + BSONObj cmdObj = + CommandHelpers::filterCommandRequestForPassthrough(createPassthroughCommandForShard( + opCtx, serializedCommand, explain, boost::none, nullptr, BSONObj())); + + const auto shardId = dbInfo.primary()->getId(); + const auto cmdObjWithShardVersion = (shardId != ShardRegistry::kConfigServerShardId) + ? appendShardVersion(std::move(cmdObj), ChunkVersion::UNSHARDED()) + : std::move(cmdObj); + + MultiStatementTransactionRequestsSender ars( + opCtx, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + namespaces.executionNss.db().toString(), + {{shardId, appendDbVersionIfPresent(cmdObjWithShardVersion, dbInfo)}}, + ReadPreferenceSetting::get(opCtx), + Shard::RetryPolicy::kIdempotent); + auto response = ars.next(); + invariant(ars.done()); + + uassertStatusOK(response.swResponse); + auto commandStatus = getStatusFromCommandResult(response.swResponse.getValue().data); + + if (ErrorCodes::isStaleShardVersionError(commandStatus.code())) { + uassertStatusOK(commandStatus.withContext("command failed because of stale config")); + } else if (ErrorCodes::isSnapshotError(commandStatus.code())) { + uassertStatusOK( + commandStatus.withContext("command failed because can not establish a snapshot")); + } + + BSONObj result; + if (explain) { + // If this was an explain, then we get back an explain result object rather than a cursor. + result = response.swResponse.getValue().data; + } else { + result = uassertStatusOK( + storePossibleCursor(opCtx, + shardId, + *response.shardHostAndPort, + response.swResponse.getValue().data, + namespaces.requestedNss, + Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), + Grid::get(opCtx)->getCursorManager(), + privileges, + TailableModeEnum::kNormal)); + } + + // First append the properly constructed writeConcernError. It will then be skipped + // in appendElementsUnique. + if (auto wcErrorElem = result["writeConcernError"]) { + appendWriteConcernErrorToCmdResponse(shardId, wcErrorElem, *out); + } + + out->appendElementsUnique(CommandHelpers::filterCommandReplyForPassthrough(result)); + + return getStatusFromCommandResult(out->asTempObj()); +} + +Status runPipelineOnMongoS(const ClusterAggregate::Namespaces& namespaces, + long long batchSize, + std::unique_ptr<Pipeline, PipelineDeleter> pipeline, + BSONObjBuilder* result, + const PrivilegeVector& privileges) { + auto expCtx = pipeline->getContext(); + + // We should never receive a pipeline which cannot run on mongoS. + invariant(!expCtx->explain); + invariant(pipeline->canRunOnMongos()); + + // Verify that the first stage can produce input for the remainder of the pipeline. + uassert(ErrorCodes::IllegalOperation, + str::stream() << "Aggregation pipeline must be run on mongoS, but " + << pipeline->getSources().front()->getSourceName() + << " is not capable of producing input", + !pipeline->getSources().front()->constraints().requiresInputDocSource); + + // Register the new mongoS cursor, and retrieve the initial batch of results. + auto cursorResponse = establishMergingMongosCursor( + expCtx->opCtx, batchSize, namespaces.requestedNss, std::move(pipeline), privileges); + + // We don't need to storePossibleCursor or propagate writeConcern errors; a pipeline with + // writing stages like $out can never run on mongoS. Filter the command response and return + // immediately. + CommandHelpers::filterCommandReplyForPassthrough(cursorResponse, result); + return getStatusFromCommandResult(result->asTempObj()); +} + +Status dispatchPipelineAndMerge(OperationContext* opCtx, + AggregationTargeter targeter, + Document serializedCommand, + long long batchSize, + const ClusterAggregate::Namespaces& namespaces, + const PrivilegeVector& privileges, + BSONObjBuilder* result, + bool hasChangeStream) { + auto expCtx = targeter.pipeline->getContext(); + // If not, split the pipeline as necessary and dispatch to the relevant shards. + auto shardDispatchResults = + dispatchShardPipeline(serializedCommand, hasChangeStream, std::move(targeter.pipeline)); + + // If the operation is an explain, then we verify that it succeeded on all targeted + // shards, write the results to the output builder, and return immediately. + if (expCtx->explain) { + return appendExplainResults(std::move(shardDispatchResults), expCtx, result); + } + + // If this isn't an explain, then we must have established cursors on at least one + // shard. + invariant(shardDispatchResults.remoteCursors.size() > 0); + + // If we sent the entire pipeline to a single shard, store the remote cursor and return. + if (!shardDispatchResults.splitPipeline) { + invariant(shardDispatchResults.remoteCursors.size() == 1); + auto&& remoteCursor = std::move(shardDispatchResults.remoteCursors.front()); + const auto shardId = remoteCursor->getShardId().toString(); + const auto reply = uassertStatusOK(storePossibleCursor(opCtx, + namespaces.requestedNss, + std::move(remoteCursor), + privileges, + expCtx->tailableMode)); + return appendCursorResponseToCommandResult(shardId, reply, result); + } + + // If we have the exchange spec then dispatch all consumers. + if (shardDispatchResults.exchangeSpec) { + shardDispatchResults = dispatchExchangeConsumerPipeline( + expCtx, namespaces.executionNss, serializedCommand, &shardDispatchResults); + } + + // If we reach here, we have a merge pipeline to dispatch. + return dispatchMergingPipeline(expCtx, + namespaces, + serializedCommand, + batchSize, + targeter.routingInfo, + std::move(shardDispatchResults), + result, + privileges, + hasChangeStream); +} + +std::pair<BSONObj, boost::optional<UUID>> getCollationAndUUID( + const boost::optional<CachedCollectionRoutingInfo>& routingInfo, + const NamespaceString& nss, + const BSONObj& collation) { + const bool collectionIsSharded = (routingInfo && routingInfo->cm()); + const bool collectionIsNotSharded = (routingInfo && !routingInfo->cm()); + + // If this is a collectionless aggregation, we immediately return the user- + // defined collation if one exists, or an empty BSONObj otherwise. Collectionless aggregations + // generally run on the 'admin' database, the standard logic would attempt to resolve its + // non-existent UUID and collation by sending a specious 'listCollections' command to the config + // servers. + if (nss.isCollectionlessAggregateNS()) { + return {collation, boost::none}; + } + + // If the collection is unsharded, obtain collInfo from the primary shard. + const auto unshardedCollInfo = collectionIsNotSharded + ? getUnshardedCollInfo(routingInfo->db().primary().get(), nss) + : BSONObj(); + + // Return the collection UUID if available, or boost::none otherwise. + const auto getUUID = [&]() -> auto { + if (collectionIsSharded) { + return routingInfo->cm()->getUUID(); + } else { + return unshardedCollInfo["info"] && unshardedCollInfo["info"]["uuid"] + ? boost::optional<UUID>{uassertStatusOK( + UUID::parse(unshardedCollInfo["info"]["uuid"]))} + : boost::optional<UUID>{boost::none}; + } + }; + + // If the collection exists, return its default collation, or the simple + // collation if no explicit default is present. If the collection does not + // exist, return an empty BSONObj. + const auto getCollation = [&]() -> auto { + if (!collectionIsSharded && !collectionIsNotSharded) { + return BSONObj(); + } + if (collectionIsNotSharded) { + return getDefaultCollationForUnshardedCollection(unshardedCollInfo); + } else { + return routingInfo->cm()->getDefaultCollator() + ? routingInfo->cm()->getDefaultCollator()->getSpec().toBSON() + : CollationSpec::kSimpleSpec; + } + }; + + // If the user specified an explicit collation, we always adopt it. Otherwise, + // obtain the collection default or simple collation as appropriate, and return + // it along with the collection's UUID. + return {collation.isEmpty() ? getCollation() : collation, getUUID()}; +} + +} // namespace mongo::sharded_agg_helpers diff --git a/src/mongo/db/pipeline/sharded_agg_helpers.h b/src/mongo/db/pipeline/sharded_agg_helpers.h index 15e0dd51c2e..b8c25a42510 100644 --- a/src/mongo/db/pipeline/sharded_agg_helpers.h +++ b/src/mongo/db/pipeline/sharded_agg_helpers.h @@ -32,6 +32,7 @@ #include "mongo/db/pipeline/pipeline.h" #include "mongo/s/async_requests_sender.h" #include "mongo/s/catalog_cache.h" +#include "mongo/s/query/cluster_aggregate.h" #include "mongo/s/query/cluster_aggregation_planner.h" namespace mongo { @@ -67,63 +68,78 @@ struct DispatchShardPipelineResults { boost::optional<cluster_aggregation_planner::ShardedExchangePolicy> exchangeSpec; }; -Shard::RetryPolicy getDesiredRetryPolicy(const AggregationRequest& req); +/** + * This structure contains information for targeting an aggregation pipeline in a sharded cluster. + */ +struct AggregationTargeter { + /** + * Populates and returns targeting info for an aggregation pipeline on the given namespace + * 'executionNss'. + */ + static StatusWith<AggregationTargeter> make( + OperationContext* opCtx, + const NamespaceString& executionNss, + const std::function<std::unique_ptr<Pipeline, PipelineDeleter>( + boost::optional<CachedCollectionRoutingInfo>)> buildPipelineFn, + stdx::unordered_set<NamespaceString> involvedNamespaces, + bool hasChangeStream, + bool allowedToPassthrough); + + enum TargetingPolicy { + kPassthrough, + kMongosRequired, + kAnyShard, + } policy; + + std::unique_ptr<Pipeline, PipelineDeleter> pipeline; + boost::optional<CachedCollectionRoutingInfo> routingInfo; +}; + +Status runPipelineOnPrimaryShard(OperationContext* opCtx, + const ClusterAggregate::Namespaces& namespaces, + const CachedDatabaseInfo& dbInfo, + boost::optional<ExplainOptions::Verbosity> explain, + Document serializedCommand, + const PrivilegeVector& privileges, + BSONObjBuilder* out); -bool mustRunOnAllShards(const NamespaceString& nss, const LiteParsedPipeline& litePipe); +/** + * Runs a pipeline on mongoS, having first validated that it is eligible to do so. This can be a + * pipeline which is split for merging, or an intact pipeline which must run entirely on mongoS. + */ +Status runPipelineOnMongoS(const ClusterAggregate::Namespaces& namespaces, + long long batchSize, + std::unique_ptr<Pipeline, PipelineDeleter> pipeline, + BSONObjBuilder* result, + const PrivilegeVector& privileges); -StatusWith<CachedCollectionRoutingInfo> getExecutionNsRoutingInfo(OperationContext* opCtx, - const NamespaceString& execNss); +/** + * Dispatches the pipeline in 'targeter' to the shards that are involved, and merges the results if + * necessary on either mongos or a randomly designated shard. + */ +Status dispatchPipelineAndMerge(OperationContext* opCtx, + sharded_agg_helpers::AggregationTargeter targeter, + Document serializedCommand, + long long batchSize, + const ClusterAggregate::Namespaces& namespaces, + const PrivilegeVector& privileges, + BSONObjBuilder* result, + bool hasChangeStream); /** - * Targets shards for the pipeline and returns a struct with the remote cursors or results, and the - * pipeline that will need to be executed to merge the results from the remotes. If a stale shard - * version is encountered, refreshes the routing table and tries again. + * Returns the "collation" and "uuid" for the collection given by "nss" with the following + * semantics: + * - The "collation" parameter will be set to the default collation for the collection or the + * simple collation if there is no default. If the collection does not exist or if the aggregate + * is on the collectionless namespace, this will be set to an empty object. + * - The "uuid" is retrieved from the chunk manager for sharded collections or the listCollections + * output for unsharded collections. The UUID will remain unset if the aggregate is on the + * collectionless namespace. */ -DispatchShardPipelineResults dispatchShardPipeline( - const boost::intrusive_ptr<ExpressionContext>& expCtx, - const NamespaceString& executionNss, - const AggregationRequest& aggRequest, - const LiteParsedPipeline& liteParsedPipeline, - std::unique_ptr<Pipeline, PipelineDeleter> pipeline, - BSONObj collationObj); - -std::set<ShardId> getTargetedShards(OperationContext* opCtx, - bool mustRunOnAllShards, - const boost::optional<CachedCollectionRoutingInfo>& routingInfo, - const BSONObj shardQuery, - const BSONObj collation); - -std::vector<RemoteCursor> establishShardCursors( - OperationContext* opCtx, +std::pair<BSONObj, boost::optional<UUID>> getCollationAndUUID( + const boost::optional<CachedCollectionRoutingInfo>& routingInfo, const NamespaceString& nss, - const LiteParsedPipeline& litePipe, - boost::optional<CachedCollectionRoutingInfo>& routingInfo, - const std::set<ShardId>& shardIds, - const BSONObj& cmdObj, - const AggregationRequest& request, - const ReadPreferenceSetting& readPref); - -BSONObj createCommandForTargetedShards( - OperationContext* opCtx, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe, - const cluster_aggregation_planner::SplitPipeline& splitPipeline, - const BSONObj collationObj, - const boost::optional<cluster_aggregation_planner::ShardedExchangePolicy> exchangeSpec, - const boost::optional<RuntimeConstants>& constants, - bool needsMerge); - -BSONObj createPassthroughCommandForShard(OperationContext* opCtx, - const AggregationRequest& request, - const boost::optional<RuntimeConstants>& constants, - Pipeline* pipeline, - BSONObj collationObj); - -BSONObj genericTransformForShards(MutableDocument&& cmdForShards, - OperationContext* opCtx, - const AggregationRequest& request, - const boost::optional<RuntimeConstants>& constants, - BSONObj collationObj); + const BSONObj& collation); /** * For a sharded collection, establishes remote cursors on each shard that may have results, and diff --git a/src/mongo/s/commands/cluster_map_reduce.cpp b/src/mongo/s/commands/cluster_map_reduce.cpp index 59b6e1680ae..9ae34eca3c5 100644 --- a/src/mongo/s/commands/cluster_map_reduce.cpp +++ b/src/mongo/s/commands/cluster_map_reduce.cpp @@ -201,6 +201,43 @@ BSONObj fixForShards(const BSONObj& orig, return appendAllowImplicitCreate(b.obj(), false); } +/** + * Outline for sharded map reduce for sharded output, $out replace: + * + * ============= mongos ============= + * 1. Send map reduce command to all relevant shards with some extra info like the value for + * the chunkSize and the name of the temporary output collection. + * + * ============= shard ============= + * 2. Does normal map reduce. + * + * 3. Calls splitVector on itself against the output collection and puts the results into the + * response object. + * + * ============= mongos ============= + * 4. If the output collection is *not* sharded, uses the information from splitVector to + * create a pre-split sharded collection. + * + * 5. Grabs the distributed lock for the final output collection. + * + * 6. Sends mapReduce.shardedfinish. + * + * ============= shard ============= + * 7. Extracts the list of shards from the mapReduce.shardedfinish and performs a broadcast + * query against all of them to obtain all documents that this shard owns. + * + * 8. Performs the reduce operation against every document from step #7 and outputs them to + * another temporary collection. Also keeps track of the BSONObject size of every "reduced" + * document for each chunk range. + * + * 9. Atomically drops the old output collection and renames the temporary collection to the + * output collection. + * + * ============= mongos ============= + * 10. Releases the distributed lock acquired at step #5. + * + * 11. Inspects the BSONObject size from step #8 and determines if it needs to split. + */ bool runMapReduce(OperationContext* opCtx, const std::string& dbname, const BSONObj& cmdObj, diff --git a/src/mongo/s/commands/cluster_map_reduce_agg.cpp b/src/mongo/s/commands/cluster_map_reduce_agg.cpp index 9375d1c2176..2c2012c3af5 100644 --- a/src/mongo/s/commands/cluster_map_reduce_agg.cpp +++ b/src/mongo/s/commands/cluster_map_reduce_agg.cpp @@ -31,56 +31,185 @@ #include "mongo/bson/bsonobj.h" #include "mongo/bson/bsonobjbuilder.h" +#include "mongo/bson/util/bson_extract.h" +#include "mongo/client/connpool.h" +#include "mongo/db/auth/authorization_session.h" +#include "mongo/db/client.h" #include "mongo/db/commands.h" +#include "mongo/db/commands/map_reduce_agg.h" #include "mongo/db/commands/map_reduce_gen.h" +#include "mongo/db/commands/mr_common.h" +#include "mongo/db/pipeline/mongos_process_interface.h" +#include "mongo/db/pipeline/sharded_agg_helpers.h" +#include "mongo/db/query/collation/collator_factory_interface.h" #include "mongo/db/query/cursor_response.h" #include "mongo/db/query/getmore_request.h" +#include "mongo/db/query/map_reduce_output_format.h" +#include "mongo/s/catalog_cache.h" #include "mongo/s/commands/cluster_map_reduce_agg.h" namespace mongo { +namespace { -// Exhaust the cursor from the aggregation response and extract results and statistics. -std::vector<BSONObj> getAllAggregationResults(OperationContext* opCtx, - const std::string& dbname, - CursorResponse& response) { - CursorId cursorId = response.getCursorId(); - auto fullBatch = response.releaseBatch(); - while (cursorId != 0) { - GetMoreRequest request( - response.getNSS(), cursorId, boost::none, boost::none, boost::none, boost::none); - BSONObj getMoreResponse = CommandHelpers::runCommandDirectly( - opCtx, OpMsgRequest::fromDBAndBody(dbname, request.toBSON())); - auto getMoreCursorResponse = CursorResponse::parseFromBSONThrowing(getMoreResponse); - auto nextBatch = getMoreCursorResponse.releaseBatch(); - fullBatch.insert(fullBatch.end(), nextBatch.begin(), nextBatch.end()); - cursorId = getMoreCursorResponse.getCursorId(); +auto makeExpressionContext(OperationContext* opCtx, + const MapReduce& parsedMr, + boost::optional<CachedCollectionRoutingInfo> routingInfo) { + // Populate the collection UUID and the appropriate collation to use. + auto nss = parsedMr.getNamespace(); + auto [collationObj, uuid] = sharded_agg_helpers::getCollationAndUUID( + routingInfo, nss, parsedMr.getCollation().get_value_or(BSONObj())); + + std::unique_ptr<CollatorInterface> resolvedCollator; + if (!collationObj.isEmpty()) { + // This will be null if attempting to build an interface for the simple collator. + resolvedCollator = uassertStatusOK( + CollatorFactoryInterface::get(opCtx->getServiceContext())->makeFromBSON(collationObj)); + } + + // Resolve involved namespaces. + StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces; + resolvedNamespaces.try_emplace(nss.coll(), nss, std::vector<BSONObj>{}); + if (parsedMr.getOutOptions().getOutputType() != OutputType::InMemory) { + auto outNss = NamespaceString{parsedMr.getOutOptions().getDatabaseName() + ? *parsedMr.getOutOptions().getDatabaseName() + : parsedMr.getNamespace().db(), + parsedMr.getOutOptions().getCollectionName()}; + resolvedNamespaces.try_emplace(outNss.coll(), outNss, std::vector<BSONObj>{}); } - return fullBatch; + + auto expCtx = make_intrusive<ExpressionContext>( + opCtx, + boost::none, // explain + false, // fromMongos + false, // needsmerge + true, // allowDiskUse + parsedMr.getBypassDocumentValidation().get_value_or(false), + nss, + collationObj, + boost::none, // runtimeConstants + std::move(resolvedCollator), + std::make_shared<MongoSInterface>(), + std::move(resolvedNamespaces), + boost::none); // uuid + expCtx->inMongos = true; + return expCtx; } +Document serializeToCommand(BSONObj originalCmd, const MapReduce& parsedMr, Pipeline* pipeline) { + MutableDocument translatedCmd; + + translatedCmd["aggregate"] = Value(parsedMr.getNamespace().coll()); + translatedCmd["pipeline"] = Value(pipeline->serialize()); + translatedCmd["cursor"] = Value(Document{{"batchSize", std::numeric_limits<long long>::max()}}); + translatedCmd["allowDiskUse"] = Value(true); + translatedCmd["fromMongos"] = Value(true); + + // Append generic command options. + for (const auto& elem : CommandHelpers::appendPassthroughFields(originalCmd, BSONObj())) { + translatedCmd[elem.fieldNameStringData()] = Value(elem); + } + return translatedCmd.freeze(); +} + +} // namespace + bool runAggregationMapReduce(OperationContext* opCtx, const std::string& dbname, const BSONObj& cmd, std::string& errmsg, BSONObjBuilder& result) { - // Pretend we have built the appropriate pipeline and aggregation request. - auto mrRequest = MapReduce::parse(IDLParserErrorContext("MapReduce"), cmd); - const BSONObj aggRequest = - fromjson(str::stream() << "{aggregate: '" << mrRequest.getNamespace().coll() - << "', pipeline: [ { $group: { _id: { user: \"$user\" }," - << "count: { $sum: 1 } } } ], cursor: {}}"); - BSONObj aggResult = CommandHelpers::runCommandDirectly( - opCtx, OpMsgRequest::fromDBAndBody(dbname, std::move(aggRequest))); - - bool inMemory = mrRequest.getOutOptions().getOutputType() == OutputType::InMemory; - std::string outColl = mrRequest.getOutOptions().getCollectionName(); - // Either inline response specified or we have an output collection. - invariant(inMemory ^ !outColl.empty()); - - auto cursorResponse = CursorResponse::parseFromBSONThrowing(aggResult); - auto completeBatch = getAllAggregationResults(opCtx, dbname, cursorResponse); - [[maybe_unused]] CursorResponse completeCursor( - cursorResponse.getNSS(), cursorResponse.getCursorId(), std::move(completeBatch)); + auto parsedMr = MapReduce::parse(IDLParserErrorContext("MapReduce"), cmd); + stdx::unordered_set<NamespaceString> involvedNamespaces{parsedMr.getNamespace()}; + auto resolvedOutNss = NamespaceString{parsedMr.getOutOptions().getDatabaseName() + ? *parsedMr.getOutOptions().getDatabaseName() + : parsedMr.getNamespace().db(), + parsedMr.getOutOptions().getCollectionName()}; + + if (parsedMr.getOutOptions().getOutputType() != OutputType::InMemory) { + involvedNamespaces.insert(resolvedOutNss); + } + + const auto pipelineBuilder = [&](boost::optional<CachedCollectionRoutingInfo> routingInfo) { + return map_reduce_common::translateFromMR( + parsedMr, makeExpressionContext(opCtx, parsedMr, routingInfo)); + }; + + auto namespaces = + ClusterAggregate::Namespaces{parsedMr.getNamespace(), parsedMr.getNamespace()}; + + // Auth has already been checked for the original mapReduce command, no need to recheck here. + PrivilegeVector privileges; + + // This holds the raw results from the aggregation, which will be reformatted to match the + // expected mapReduce output. + BSONObjBuilder tempResults; + + auto targeter = uassertStatusOK( + sharded_agg_helpers::AggregationTargeter::make(opCtx, + parsedMr.getNamespace(), + pipelineBuilder, + involvedNamespaces, + false, // hasChangeStream + true)); // allowedToPassthrough + switch (targeter.policy) { + case sharded_agg_helpers::AggregationTargeter::TargetingPolicy::kPassthrough: { + // For the passthrough case, the targeter will not build a pipeline since its not needed + // in the normal aggregation path. For this translation, though, we need to build the + // pipeline to serialize and send to the primary shard. + auto serialized = + serializeToCommand(cmd, parsedMr, pipelineBuilder(targeter.routingInfo).get()); + uassertStatusOK( + sharded_agg_helpers::runPipelineOnPrimaryShard(opCtx, + namespaces, + targeter.routingInfo->db(), + boost::none, // explain + std::move(serialized), + privileges, + &tempResults)); + break; + } + + case sharded_agg_helpers::AggregationTargeter::TargetingPolicy::kMongosRequired: { + // Pipelines generated from mapReduce should never be required to run on mongos. + uasserted(31291, "Internal error during mapReduce translation"); + break; + } + + case sharded_agg_helpers::AggregationTargeter::TargetingPolicy::kAnyShard: { + auto serialized = serializeToCommand(cmd, parsedMr, targeter.pipeline.get()); + uassertStatusOK( + sharded_agg_helpers::dispatchPipelineAndMerge(opCtx, + std::move(targeter), + std::move(serialized), + std::numeric_limits<long long>::max(), + namespaces, + privileges, + &tempResults, + false)); // hasChangeStream + break; + } + } + + auto aggResults = tempResults.done(); + if (parsedMr.getOutOptions().getOutputType() == OutputType::InMemory) { + auto exhaustedResults = [&]() { + BSONArrayBuilder bab; + for (auto&& elem : aggResults["cursor"]["firstBatch"].Obj()) + bab.append(elem.embeddedObject()); + return bab.arr(); + }(); + map_reduce_output_format::appendInlineResponse(std::move(exhaustedResults), + parsedMr.getVerbose().get_value_or(false), + true, // inMongos + &result); + } else { + map_reduce_output_format::appendOutResponse( + parsedMr.getOutOptions().getDatabaseName(), + parsedMr.getOutOptions().getCollectionName(), + boost::get_optional_value_or(parsedMr.getVerbose(), false), + true, // inMongos + &result); + } return true; } diff --git a/src/mongo/s/commands/cluster_map_reduce_cmd.cpp b/src/mongo/s/commands/cluster_map_reduce_cmd.cpp index 0c40f522e1b..c0abf8efd87 100644 --- a/src/mongo/s/commands/cluster_map_reduce_cmd.cpp +++ b/src/mongo/s/commands/cluster_map_reduce_cmd.cpp @@ -41,43 +41,6 @@ namespace mongo { namespace { -/** - * Outline for sharded map reduce for sharded output, $out replace: - * - * ============= mongos ============= - * 1. Send map reduce command to all relevant shards with some extra info like the value for - * the chunkSize and the name of the temporary output collection. - * - * ============= shard ============= - * 2. Does normal map reduce. - * - * 3. Calls splitVector on itself against the output collection and puts the results into the - * response object. - * - * ============= mongos ============= - * 4. If the output collection is *not* sharded, uses the information from splitVector to - * create a pre-split sharded collection. - * - * 5. Grabs the distributed lock for the final output collection. - * - * 6. Sends mapReduce.shardedfinish. - * - * ============= shard ============= - * 7. Extracts the list of shards from the mapReduce.shardedfinish and performs a broadcast - * query against all of them to obtain all documents that this shard owns. - * - * 8. Performs the reduce operation against every document from step #7 and outputs them to - * another temporary collection. Also keeps track of the BSONObject size of every "reduced" - * document for each chunk range. - * - * 9. Atomically drops the old output collection and renames the temporary collection to the - * output collection. - * - * ============= mongos ============= - * 10. Releases the distributed lock acquired at step #5. - * - * 11. Inspects the BSONObject size from step #8 and determines if it needs to split. - */ class ClusterMapReduceCommand : public MapReduceCommandBase { public: ClusterMapReduceCommand() = default; diff --git a/src/mongo/s/query/cluster_aggregate.cpp b/src/mongo/s/query/cluster_aggregate.cpp index 953f4c48d05..9de374b9202 100644 --- a/src/mongo/s/query/cluster_aggregate.cpp +++ b/src/mongo/s/query/cluster_aggregate.cpp @@ -35,7 +35,6 @@ #include <boost/intrusive_ptr.hpp> -#include "mongo/bson/util/bson_extract.h" #include "mongo/db/auth/authorization_session.h" #include "mongo/db/client.h" #include "mongo/db/commands.h" @@ -75,503 +74,34 @@ #include "mongo/s/query/store_possible_cursor.h" #include "mongo/s/stale_exception.h" #include "mongo/s/transaction_router.h" -#include "mongo/util/fail_point.h" #include "mongo/util/log.h" #include "mongo/util/net/socket_utils.h" namespace mongo { -using SplitPipeline = cluster_aggregation_planner::SplitPipeline; - -MONGO_FAIL_POINT_DEFINE(clusterAggregateFailToEstablishMergingShardCursor); -MONGO_FAIL_POINT_DEFINE(clusterAggregateFailToDispatchExchangeConsumerPipeline); - constexpr unsigned ClusterAggregate::kMaxViewRetries; namespace { -Status appendCursorResponseToCommandResult(const ShardId& shardId, - const BSONObj cursorResponse, - BSONObjBuilder* result) { - // If a write error was encountered, append it to the output buffer first. - if (auto wcErrorElem = cursorResponse["writeConcernError"]) { - appendWriteConcernErrorToCmdResponse(shardId, wcErrorElem, *result); - } - - // Pass the results from the remote shard into our command response. - result->appendElementsUnique(CommandHelpers::filterCommandReplyForPassthrough(cursorResponse)); - return getStatusFromCommandResult(result->asTempObj()); -} - -BSONObj createCommandForMergingShard(const AggregationRequest& request, - const boost::intrusive_ptr<ExpressionContext>& mergeCtx, - const ShardId& shardId, - bool mergingShardContributesData, - const Pipeline* pipelineForMerging) { - MutableDocument mergeCmd(request.serializeToCommandObj()); - - mergeCmd["pipeline"] = Value(pipelineForMerging->serialize()); - mergeCmd[AggregationRequest::kFromMongosName] = Value(true); - - mergeCmd[AggregationRequest::kRuntimeConstants] = - Value(mergeCtx->getRuntimeConstants().toBSON()); - - // If the user didn't specify a collation already, make sure there's a collation attached to - // the merge command, since the merging shard may not have the collection metadata. - if (mergeCmd.peek()["collation"].missing()) { - mergeCmd["collation"] = mergeCtx->getCollator() - ? Value(mergeCtx->getCollator()->getSpec().toBSON()) - : Value(Document{CollationSpec::kSimpleSpec}); - } - - const auto txnRouter = TransactionRouter::get(mergeCtx->opCtx); - if (txnRouter && mergingShardContributesData) { - // Don't include a readConcern since we can only include read concerns on the _first_ - // command sent to a participant per transaction. Assuming the merging shard is a - // participant, it will already have received another 'aggregate' command earlier which - // contained a readConcern. - - mergeCmd.remove("readConcern"); - } - - return appendAllowImplicitCreate(mergeCmd.freeze().toBson(), false); -} - -sharded_agg_helpers::DispatchShardPipelineResults dispatchExchangeConsumerPipeline( - const boost::intrusive_ptr<ExpressionContext>& expCtx, - const NamespaceString& executionNss, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe, - BSONObj collationObj, - sharded_agg_helpers::DispatchShardPipelineResults* shardDispatchResults) { - invariant(!litePipe.hasChangeStream()); - auto opCtx = expCtx->opCtx; - - if (MONGO_unlikely(clusterAggregateFailToDispatchExchangeConsumerPipeline.shouldFail())) { - log() << "clusterAggregateFailToDispatchExchangeConsumerPipeline fail point enabled."; - uasserted(ErrorCodes::FailPointEnabled, - "Asserting on exhange consumer pipeline dispatch due to failpoint."); - } - - // For all consumers construct a request with appropriate cursor ids and send to shards. - std::vector<std::pair<ShardId, BSONObj>> requests; - auto numConsumers = shardDispatchResults->exchangeSpec->consumerShards.size(); - std::vector<SplitPipeline> consumerPipelines; - for (size_t idx = 0; idx < numConsumers; ++idx) { - // Pick this consumer's cursors from producers. - std::vector<OwnedRemoteCursor> producers; - for (size_t p = 0; p < shardDispatchResults->numProducers; ++p) { - producers.emplace_back( - std::move(shardDispatchResults->remoteCursors[p * numConsumers + idx])); - } - - // Create a pipeline for a consumer and add the merging stage. - auto consumerPipeline = uassertStatusOK(Pipeline::create( - shardDispatchResults->splitPipeline->mergePipeline->getSources(), expCtx)); - - cluster_aggregation_planner::addMergeCursorsSource( - consumerPipeline.get(), - litePipe, - BSONObj(), - std::move(producers), - {}, - shardDispatchResults->splitPipeline->shardCursorsSortSpec, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor()); - - consumerPipelines.emplace_back(std::move(consumerPipeline), nullptr, boost::none); - - auto consumerCmdObj = - sharded_agg_helpers::createCommandForTargetedShards(opCtx, - request, - litePipe, - consumerPipelines.back(), - collationObj, - boost::none, - expCtx->getRuntimeConstants(), - false); - - requests.emplace_back(shardDispatchResults->exchangeSpec->consumerShards[idx], - consumerCmdObj); - } - auto cursors = establishCursors(opCtx, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), - executionNss, - ReadPreferenceSetting::get(opCtx), - requests, - false /* do not allow partial results */); - - // Convert remote cursors into a vector of "owned" cursors. - std::vector<OwnedRemoteCursor> ownedCursors; - for (auto&& cursor : cursors) { - ownedCursors.emplace_back(OwnedRemoteCursor(opCtx, std::move(cursor), executionNss)); - } - - // The merging pipeline is just a union of the results from each of the shards involved on the - // consumer side of the exchange. - auto mergePipeline = uassertStatusOK(Pipeline::create({}, expCtx)); - mergePipeline->setSplitState(Pipeline::SplitState::kSplitForMerge); - - SplitPipeline splitPipeline{nullptr, std::move(mergePipeline), boost::none}; - - // Relinquish ownership of the local consumer pipelines' cursors as each shard is now - // responsible for its own producer cursors. - for (const auto& pipeline : consumerPipelines) { - const auto& mergeCursors = - static_cast<DocumentSourceMergeCursors*>(pipeline.shardsPipeline->peekFront()); - mergeCursors->dismissCursorOwnership(); - } - return sharded_agg_helpers::DispatchShardPipelineResults{false, - std::move(ownedCursors), - {} /*TODO SERVER-36279*/, - std::move(splitPipeline), - nullptr, - BSONObj(), - numConsumers}; -} - -Status appendExplainResults(sharded_agg_helpers::DispatchShardPipelineResults&& dispatchResults, - const boost::intrusive_ptr<ExpressionContext>& mergeCtx, - BSONObjBuilder* result) { - if (dispatchResults.splitPipeline) { - auto* mergePipeline = dispatchResults.splitPipeline->mergePipeline.get(); - const char* mergeType = [&]() { - if (mergePipeline->canRunOnMongos()) { - return "mongos"; - } else if (dispatchResults.exchangeSpec) { - return "exchange"; - } else if (mergePipeline->needsPrimaryShardMerger()) { - return "primaryShard"; - } else { - return "anyShard"; - } - }(); - - *result << "mergeType" << mergeType; - - MutableDocument pipelinesDoc; - pipelinesDoc.addField("shardsPart", - Value(dispatchResults.splitPipeline->shardsPipeline->writeExplainOps( - *mergeCtx->explain))); - if (dispatchResults.exchangeSpec) { - BSONObjBuilder bob; - dispatchResults.exchangeSpec->exchangeSpec.serialize(&bob); - bob.append("consumerShards", dispatchResults.exchangeSpec->consumerShards); - pipelinesDoc.addField("exchange", Value(bob.obj())); - } - pipelinesDoc.addField("mergerPart", - Value(mergePipeline->writeExplainOps(*mergeCtx->explain))); - - *result << "splitPipeline" << pipelinesDoc.freeze(); - } else { - *result << "splitPipeline" << BSONNULL; - } - - BSONObjBuilder shardExplains(result->subobjStart("shards")); - for (const auto& shardResult : dispatchResults.remoteExplainOutput) { - invariant(shardResult.shardHostAndPort); - - uassertStatusOK(shardResult.swResponse.getStatus()); - uassertStatusOK(getStatusFromCommandResult(shardResult.swResponse.getValue().data)); - - auto shardId = shardResult.shardId.toString(); - const auto& data = shardResult.swResponse.getValue().data; - BSONObjBuilder explain(shardExplains.subobjStart(shardId)); - explain << "host" << shardResult.shardHostAndPort->toString(); - if (auto stagesElement = data["stages"]) { - explain << "stages" << stagesElement; - } else { - auto queryPlannerElement = data["queryPlanner"]; - uassert(51157, - str::stream() << "Malformed explain response received from shard " << shardId - << ": " << data.toString(), - queryPlannerElement); - explain << "queryPlanner" << queryPlannerElement; - if (auto executionStatsElement = data["executionStats"]) { - explain << "executionStats" << executionStatsElement; - } - } - } - - return Status::OK(); -} - -AsyncRequestsSender::Response establishMergingShardCursor(OperationContext* opCtx, - const NamespaceString& nss, - const AggregationRequest& request, - const BSONObj mergeCmdObj, - const ShardId& mergingShardId) { - if (MONGO_unlikely(clusterAggregateFailToEstablishMergingShardCursor.shouldFail())) { - log() << "clusterAggregateFailToEstablishMergingShardCursor fail point enabled."; - uasserted(ErrorCodes::FailPointEnabled, - "Asserting on establishing merging shard cursor due to failpoint."); - } - - MultiStatementTransactionRequestsSender ars( - opCtx, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), - nss.db().toString(), - {{mergingShardId, mergeCmdObj}}, - ReadPreferenceSetting::get(opCtx), - sharded_agg_helpers::getDesiredRetryPolicy(request)); - const auto response = ars.next(); - invariant(ars.done()); - return response; -} - -BSONObj establishMergingMongosCursor(OperationContext* opCtx, - const AggregationRequest& request, - const NamespaceString& requestedNss, - const LiteParsedPipeline& liteParsedPipeline, - std::unique_ptr<Pipeline, PipelineDeleter> pipelineForMerging, - const PrivilegeVector& privileges) { - - ClusterClientCursorParams params(requestedNss, ReadPreferenceSetting::get(opCtx)); - - params.originatingCommandObj = CurOp::get(opCtx)->opDescription().getOwned(); - params.tailableMode = pipelineForMerging->getContext()->tailableMode; - // A batch size of 0 is legal for the initial aggregate, but not valid for getMores, the batch - // size we pass here is used for getMores, so do not specify a batch size if the initial request - // had a batch size of 0. - params.batchSize = request.getBatchSize() == 0 - ? boost::none - : boost::optional<long long>(request.getBatchSize()); - params.lsid = opCtx->getLogicalSessionId(); - params.txnNumber = opCtx->getTxnNumber(); - params.originatingPrivileges = privileges; - - if (TransactionRouter::get(opCtx)) { - params.isAutoCommit = false; - } - - auto ccc = cluster_aggregation_planner::buildClusterCursor( - opCtx, std::move(pipelineForMerging), std::move(params)); - - auto cursorState = ClusterCursorManager::CursorState::NotExhausted; - - rpc::OpMsgReplyBuilder replyBuilder; - CursorResponseBuilder::Options options; - options.isInitialResponse = true; - - CursorResponseBuilder responseBuilder(&replyBuilder, options); - bool stashedResult = false; - - for (long long objCount = 0; objCount < request.getBatchSize(); ++objCount) { - ClusterQueryResult next; - try { - next = uassertStatusOK(ccc->next(RouterExecStage::ExecContext::kInitialFind)); - } catch (const ExceptionFor<ErrorCodes::CloseChangeStream>&) { - // This exception is thrown when a $changeStream stage encounters an event - // that invalidates the cursor. We should close the cursor and return without - // error. - cursorState = ClusterCursorManager::CursorState::Exhausted; - break; - } - - // Check whether we have exhausted the pipeline's results. - if (next.isEOF()) { - // We reached end-of-stream. If the cursor is not tailable, then we mark it as - // exhausted. If it is tailable, usually we keep it open (i.e. "NotExhausted") even when - // we reach end-of-stream. However, if all the remote cursors are exhausted, there is no - // hope of returning data and thus we need to close the mongos cursor as well. - if (!ccc->isTailable() || ccc->remotesExhausted()) { - cursorState = ClusterCursorManager::CursorState::Exhausted; - } - break; - } - - // If this result will fit into the current batch, add it. Otherwise, stash it in the cursor - // to be returned on the next getMore. - auto nextObj = *next.getResult(); - - if (!FindCommon::haveSpaceForNext(nextObj, objCount, responseBuilder.bytesUsed())) { - ccc->queueResult(nextObj); - stashedResult = true; - break; - } - - // Set the postBatchResumeToken. For non-$changeStream aggregations, this will be empty. - responseBuilder.setPostBatchResumeToken(ccc->getPostBatchResumeToken()); - responseBuilder.append(nextObj); - } - - // For empty batches, or in the case where the final result was added to the batch rather than - // being stashed, we update the PBRT here to ensure that it is the most recent available. - if (!stashedResult) { - responseBuilder.setPostBatchResumeToken(ccc->getPostBatchResumeToken()); - } - - ccc->detachFromOperationContext(); - - int nShards = ccc->getNumRemotes(); - CursorId clusterCursorId = 0; - - if (cursorState == ClusterCursorManager::CursorState::NotExhausted) { - auto authUsers = AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(); - clusterCursorId = uassertStatusOK(Grid::get(opCtx)->getCursorManager()->registerCursor( - opCtx, - ccc.releaseCursor(), - requestedNss, - ClusterCursorManager::CursorType::MultiTarget, - ClusterCursorManager::CursorLifetime::Mortal, - authUsers)); - } - - // Fill out the aggregation metrics in CurOp. - if (clusterCursorId > 0) { - CurOp::get(opCtx)->debug().cursorid = clusterCursorId; - } - CurOp::get(opCtx)->debug().nShards = std::max(CurOp::get(opCtx)->debug().nShards, nShards); - CurOp::get(opCtx)->debug().cursorExhausted = (clusterCursorId == 0); - CurOp::get(opCtx)->debug().nreturned = responseBuilder.numDocs(); - - responseBuilder.done(clusterCursorId, requestedNss.ns()); - - auto bodyBuilder = replyBuilder.getBodyBuilder(); - CommandHelpers::appendSimpleCommandStatus(bodyBuilder, true); - bodyBuilder.doneFast(); - - return replyBuilder.releaseBody(); -} - -/** - * Returns the output of the listCollections command filtered to the namespace 'nss'. - */ -BSONObj getUnshardedCollInfo(const Shard* primaryShard, const NamespaceString& nss) { - ScopedDbConnection conn(primaryShard->getConnString()); - std::list<BSONObj> all = - conn->getCollectionInfos(nss.db().toString(), BSON("name" << nss.coll())); - if (all.empty()) { - // Collection does not exist, return an empty object. - return BSONObj(); - } - return all.front(); -} - -/** - * Returns the collection default collation or the simple collator if there is no default. If the - * collection does not exist, then returns an empty BSON Object. - */ -BSONObj getDefaultCollationForUnshardedCollection(const BSONObj collectionInfo) { - if (collectionInfo.isEmpty()) { - // Collection does not exist, return an empty object. - return BSONObj(); - } - - BSONObj defaultCollation = CollationSpec::kSimpleSpec; - if (collectionInfo["options"].type() == BSONType::Object) { - BSONObj collectionOptions = collectionInfo["options"].Obj(); - BSONElement collationElement; - auto status = bsonExtractTypedField( - collectionOptions, "collation", BSONType::Object, &collationElement); - if (status.isOK()) { - defaultCollation = collationElement.Obj().getOwned(); - uassert(ErrorCodes::BadValue, - "Default collation in collection metadata cannot be empty.", - !defaultCollation.isEmpty()); - } else if (status != ErrorCodes::NoSuchKey) { - uassertStatusOK(status); - } - } - return defaultCollation; -} - -/** - * Populates the "collation" and "uuid" parameters with the following semantics: - * - The "collation" parameter will be set to the default collation for the collection or the - * simple collation if there is no default. If the collection does not exist or if the aggregate - * is on the collectionless namespace, this will be set to an empty object. - * - The "uuid" is retrieved from the chunk manager for sharded collections or the listCollections - * output for unsharded collections. The UUID will remain unset if the aggregate is on the - * collectionless namespace. - */ -std::pair<BSONObj, boost::optional<UUID>> getCollationAndUUID( - const boost::optional<CachedCollectionRoutingInfo>& routingInfo, - const NamespaceString& nss, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe) { - const bool collectionIsSharded = (routingInfo && routingInfo->cm()); - const bool collectionIsNotSharded = (routingInfo && !routingInfo->cm()); - - // If this is a change stream or a collectionless aggregation, we immediately return the user- - // defined collation if one exists, or an empty BSONObj otherwise. Change streams never inherit - // the collection's default collation, and since collectionless aggregations generally run on - // the 'admin' database, the standard logic would attempt to resolve its non-existent UUID and - // collation by sending a specious 'listCollections' command to the config servers. - if (litePipe.hasChangeStream() || nss.isCollectionlessAggregateNS()) { - return {request.getCollation(), boost::none}; - } - - // If the collection is unsharded, obtain collInfo from the primary shard. - const auto unshardedCollInfo = collectionIsNotSharded - ? getUnshardedCollInfo(routingInfo->db().primary().get(), nss) - : BSONObj(); - - // Return the collection UUID if available, or boost::none otherwise. - const auto getUUID = [&]() -> auto { - if (collectionIsSharded) { - return routingInfo->cm()->getUUID(); - } else { - return unshardedCollInfo["info"] && unshardedCollInfo["info"]["uuid"] - ? boost::optional<UUID>{uassertStatusOK( - UUID::parse(unshardedCollInfo["info"]["uuid"]))} - : boost::optional<UUID>{boost::none}; - } - }; - - // If the collection exists, return its default collation, or the simple - // collation if no explicit default is present. If the collection does not - // exist, return an empty BSONObj. - const auto getCollation = [&]() -> auto { - if (!collectionIsSharded && !collectionIsNotSharded) { - return BSONObj(); - } - if (collectionIsNotSharded) { - return getDefaultCollationForUnshardedCollection(unshardedCollInfo); - } else { - return routingInfo->cm()->getDefaultCollator() - ? routingInfo->cm()->getDefaultCollator()->getSpec().toBSON() - : CollationSpec::kSimpleSpec; - } - }; - - // If the user specified an explicit collation, we always adopt it. Otherwise, - // obtain the collection default or simple collation as appropriate, and return - // it along with the collection's UUID. - return {request.getCollation().isEmpty() ? getCollation() : request.getCollation(), getUUID()}; -} - -ShardId pickMergingShard(OperationContext* opCtx, - bool needsPrimaryShardMerge, - const std::vector<ShardId>& targetedShards, - ShardId primaryShard) { - auto& prng = opCtx->getClient()->getPrng(); - // If we cannot merge on mongoS, establish the merge cursor on a shard. Perform the merging - // command on random shard, unless the pipeline dictates that it needs to be run on the primary - // shard for the database. - return needsPrimaryShardMerge ? primaryShard - : targetedShards[prng.nextInt32(targetedShards.size())]; -} - // "Resolve" involved namespaces into a map. We won't try to execute anything on a mongos, but we // still have to populate this map so that any $lookups, etc. will be able to have a resolved view // definition. It's okay that this is incorrect, we will repopulate the real namespace map on the // mongod. Note that this function must be called before forwarding an aggregation command on an // unsharded collection, in order to verify that the involved namespaces are allowed to be sharded. -auto resolveInvolvedNamespaces(OperationContext* opCtx, const LiteParsedPipeline& litePipe) { +auto resolveInvolvedNamespaces(stdx::unordered_set<NamespaceString> involvedNamespaces) { StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces; - for (auto&& nss : litePipe.getInvolvedNamespaces()) { + for (auto&& nss : involvedNamespaces) { resolvedNamespaces.try_emplace(nss.coll(), nss, std::vector<BSONObj>{}); } return resolvedNamespaces; } // Build an appropriate ExpressionContext for the pipeline. This helper instantiates an appropriate -// collator, creates a MongoProcessInterface for use by the pipeline's stages, and optionally -// extracts the UUID from the collection info if present. +// collator, creates a MongoProcessInterface for use by the pipeline's stages, and sets the +// collection UUID if provided. boost::intrusive_ptr<ExpressionContext> makeExpressionContext( OperationContext* opCtx, const AggregationRequest& request, - const LiteParsedPipeline& litePipe, BSONObj collationObj, boost::optional<UUID> uuid, StringMap<ExpressionContext::ResolvedNamespace> resolvedNamespaces) { @@ -592,127 +122,13 @@ boost::intrusive_ptr<ExpressionContext> makeExpressionContext( std::move(resolvedNamespaces), uuid); + // Keep the backing collation object on the context up to date with the resolved collator. + mergeCtx->collation = collationObj; + mergeCtx->inMongos = true; return mergeCtx; } -// Runs a pipeline on mongoS, having first validated that it is eligible to do so. This can be a -// pipeline which is split for merging, or an intact pipeline which must run entirely on mongoS. -Status runPipelineOnMongoS(const boost::intrusive_ptr<ExpressionContext>& expCtx, - const ClusterAggregate::Namespaces& namespaces, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe, - std::unique_ptr<Pipeline, PipelineDeleter> pipeline, - BSONObjBuilder* result, - const PrivilegeVector& privileges) { - // We should never receive a pipeline which cannot run on mongoS. - invariant(!expCtx->explain); - invariant(pipeline->canRunOnMongos()); - - const auto& requestedNss = namespaces.requestedNss; - const auto opCtx = expCtx->opCtx; - - // Verify that the first stage can produce input for the remainder of the pipeline. - uassert(ErrorCodes::IllegalOperation, - str::stream() << "Aggregation pipeline must be run on mongoS, but " - << pipeline->getSources().front()->getSourceName() - << " is not capable of producing input", - !pipeline->getSources().front()->constraints().requiresInputDocSource); - - // Register the new mongoS cursor, and retrieve the initial batch of results. - auto cursorResponse = establishMergingMongosCursor( - opCtx, request, requestedNss, litePipe, std::move(pipeline), privileges); - - // We don't need to storePossibleCursor or propagate writeConcern errors; a pipeline with - // writing stages like $out can never run on mongoS. Filter the command response and return - // immediately. - CommandHelpers::filterCommandReplyForPassthrough(cursorResponse, result); - return getStatusFromCommandResult(result->asTempObj()); -} - -Status dispatchMergingPipeline( - const boost::intrusive_ptr<ExpressionContext>& expCtx, - const ClusterAggregate::Namespaces& namespaces, - const AggregationRequest& request, - const LiteParsedPipeline& litePipe, - const boost::optional<CachedCollectionRoutingInfo>& routingInfo, - sharded_agg_helpers::DispatchShardPipelineResults&& shardDispatchResults, - BSONObjBuilder* result, - const PrivilegeVector& privileges) { - // We should never be in a situation where we call this function on a non-merge pipeline. - invariant(shardDispatchResults.splitPipeline); - auto* mergePipeline = shardDispatchResults.splitPipeline->mergePipeline.get(); - invariant(mergePipeline); - auto* opCtx = expCtx->opCtx; - - std::vector<ShardId> targetedShards; - targetedShards.reserve(shardDispatchResults.remoteCursors.size()); - for (auto&& remoteCursor : shardDispatchResults.remoteCursors) { - targetedShards.emplace_back(remoteCursor->getShardId().toString()); - } - - cluster_aggregation_planner::addMergeCursorsSource( - mergePipeline, - litePipe, - shardDispatchResults.commandForTargetedShards, - std::move(shardDispatchResults.remoteCursors), - targetedShards, - shardDispatchResults.splitPipeline->shardCursorsSortSpec, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor()); - - // First, check whether we can merge on the mongoS. If the merge pipeline MUST run on mongoS, - // then ignore the internalQueryProhibitMergingOnMongoS parameter. - if (mergePipeline->requiredToRunOnMongos() || - (!internalQueryProhibitMergingOnMongoS.load() && mergePipeline->canRunOnMongos())) { - return runPipelineOnMongoS(expCtx, - namespaces, - request, - litePipe, - std::move(shardDispatchResults.splitPipeline->mergePipeline), - result, - privileges); - } - - // If we are not merging on mongoS, then this is not a $changeStream aggregation, and we - // therefore must have a valid routing table. - invariant(routingInfo); - - const ShardId mergingShardId = pickMergingShard(opCtx, - shardDispatchResults.needsPrimaryShardMerge, - targetedShards, - routingInfo->db().primaryId()); - const bool mergingShardContributesData = - std::find(targetedShards.begin(), targetedShards.end(), mergingShardId) != - targetedShards.end(); - - auto mergeCmdObj = createCommandForMergingShard( - request, expCtx, mergingShardId, mergingShardContributesData, mergePipeline); - - // Dispatch $mergeCursors to the chosen shard, store the resulting cursor, and return. - auto mergeResponse = establishMergingShardCursor( - opCtx, namespaces.executionNss, request, mergeCmdObj, mergingShardId); - uassertStatusOK(mergeResponse.swResponse); - - auto mergeCursorResponse = uassertStatusOK( - storePossibleCursor(opCtx, - mergingShardId, - *mergeResponse.shardHostAndPort, - mergeResponse.swResponse.getValue().data, - namespaces.requestedNss, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), - Grid::get(opCtx)->getCursorManager(), - privileges, - expCtx->tailableMode)); - - // Ownership for the shard cursors has been transferred to the merging shard. Dismiss the - // ownership in the current merging pipeline such that when it goes out of scope it does not - // attempt to kill the cursors. - auto mergeCursors = static_cast<DocumentSourceMergeCursors*>(mergePipeline->peekFront()); - mergeCursors->dismissCursorOwnership(); - - return appendCursorResponseToCommandResult(mergingShardId, mergeCursorResponse, result); -} - void appendEmptyResultSetWithStatus(OperationContext* opCtx, const NamespaceString& nss, Status status, @@ -740,199 +156,105 @@ Status ClusterAggregate::runAggregate(OperationContext* opCtx, << ", " << AggregationRequest::kFromMongosName << "] cannot be set to 'true' when sent to mongos", !request.needsMerge() && !request.isFromMongos()); - auto executionNsRoutingInfoStatus = - sharded_agg_helpers::getExecutionNsRoutingInfo(opCtx, namespaces.executionNss); - boost::optional<CachedCollectionRoutingInfo> routingInfo; - LiteParsedPipeline litePipe(request); + const auto isSharded = [](OperationContext* opCtx, const NamespaceString& nss) -> bool { const auto resolvedNsRoutingInfo = uassertStatusOK(getCollectionRoutingInfoForTxnCmd(opCtx, nss)); return resolvedNsRoutingInfo.cm().get(); }; - const bool involvesShardedCollections = litePipe.verifyIsSupported( - opCtx, isSharded, request.getExplain(), serverGlobalParams.enableMajorityReadConcern); - // If the routing table is valid, we obtain a reference to it. If the table is not valid, then - // either the database does not exist, or there are no shards in the cluster. In the latter - // case, we always return an empty cursor. In the former case, if the requested aggregation is a - // $changeStream, we allow the operation to continue so that stream cursors can be established - // on the given namespace before the database or collection is actually created. If the database - // does not exist and this is not a $changeStream, then we return an empty cursor. - if (executionNsRoutingInfoStatus.isOK()) { - routingInfo = std::move(executionNsRoutingInfoStatus.getValue()); - } else if (!(litePipe.hasChangeStream() && - executionNsRoutingInfoStatus == ErrorCodes::NamespaceNotFound)) { - appendEmptyResultSetWithStatus( - opCtx, namespaces.requestedNss, executionNsRoutingInfoStatus.getStatus(), result); - return Status::OK(); - } - - // Determine whether this aggregation must be dispatched to all shards in the cluster. - const bool mustRunOnAll = - sharded_agg_helpers::mustRunOnAllShards(namespaces.executionNss, litePipe); - - // If we don't have a routing table, then this is a $changeStream which must run on all shards. - invariant(routingInfo || (mustRunOnAll && litePipe.hasChangeStream())); - - auto resolvedNamespaces = resolveInvolvedNamespaces(opCtx, litePipe); - - // A pipeline is allowed to passthrough to the primary shard iff the following conditions are - // met: - // - // 1. The namespace of the aggregate and any other involved namespaces are unsharded. - // 2. Is allowed to be forwarded to shards. - // 3. Does not need to run on all shards. - // 4. Doesn't need transformation via DocumentSource::serialize(). - if (routingInfo && !routingInfo->cm() && !mustRunOnAll && - litePipe.allowedToPassthroughFromMongos() && !involvesShardedCollections) { - const auto primaryShardId = routingInfo->db().primary()->getId(); - return aggPassthrough( - opCtx, namespaces, routingInfo->db(), request, litePipe, privileges, result); - } - - // Populate the collection UUID and the appropriate collation to use. - auto collInfo = getCollationAndUUID(routingInfo, namespaces.executionNss, request, litePipe); - BSONObj collationObj = collInfo.first; - boost::optional<UUID> uuid = collInfo.second; - - // Build an ExpressionContext for the pipeline. This instantiates an appropriate collator, - // resolves all involved namespaces, and creates a shared MongoProcessInterface for use by the - // pipeline's stages. - auto expCtx = makeExpressionContext( - opCtx, request, litePipe, collationObj, uuid, std::move(resolvedNamespaces)); - - // Parse and optimize the full pipeline. - auto pipeline = uassertStatusOK(Pipeline::parse(request.getPipeline(), expCtx)); - pipeline->optimizePipeline(); - - // Check whether the entire pipeline must be run on mongoS. - if (pipeline->requiredToRunOnMongos()) { - // If this is an explain write the explain output and return. - if (expCtx->explain) { - *result << "splitPipeline" << BSONNULL << "mongos" - << Document{{"host", getHostNameCachedAndPort()}, - {"stages", pipeline->writeExplainOps(*expCtx->explain)}}; - return Status::OK(); - } - - return runPipelineOnMongoS( - expCtx, namespaces, request, litePipe, std::move(pipeline), result, privileges); - } + LiteParsedPipeline litePipe(request); + litePipe.verifyIsSupported( + opCtx, isSharded, request.getExplain(), serverGlobalParams.enableMajorityReadConcern); + auto hasChangeStream = litePipe.hasChangeStream(); + auto involvedNamespaces = litePipe.getInvolvedNamespaces(); + + const auto pipelineBuilder = [&](boost::optional<CachedCollectionRoutingInfo> routingInfo) { + // Populate the collection UUID and the appropriate collation to use. + auto [collationObj, uuid] = [&]() -> std::pair<BSONObj, boost::optional<UUID>> { + // If this is a change stream, take the user-defined collation if one exists, or an + // empty BSONObj otherwise. Change streams never inherit the collection's default + // collation, and since collectionless aggregations generally run on the 'admin' + // database, the standard logic would attempt to resolve its non-existent UUID and + // collation by sending a specious 'listCollections' command to the config servers. + if (hasChangeStream) { + return {request.getCollation(), boost::none}; + } - // If not, split the pipeline as necessary and dispatch to the relevant shards. - auto shardDispatchResults = sharded_agg_helpers::dispatchShardPipeline( - expCtx, namespaces.executionNss, request, litePipe, std::move(pipeline), collationObj); + return sharded_agg_helpers::getCollationAndUUID( + routingInfo, namespaces.executionNss, request.getCollation()); + }(); - // If the operation is an explain, then we verify that it succeeded on all targeted shards, - // write the results to the output builder, and return immediately. - if (expCtx->explain) { - return appendExplainResults(std::move(shardDispatchResults), expCtx, result); - } + // Build an ExpressionContext for the pipeline. This instantiates an appropriate collator, + // resolves all involved namespaces, and creates a shared MongoProcessInterface for use by + // the pipeline's stages. + auto expCtx = makeExpressionContext( + opCtx, request, collationObj, uuid, resolveInvolvedNamespaces(involvedNamespaces)); - // If this isn't an explain, then we must have established cursors on at least one shard. - invariant(shardDispatchResults.remoteCursors.size() > 0); - - // If we sent the entire pipeline to a single shard, store the remote cursor and return. - if (!shardDispatchResults.splitPipeline) { - invariant(shardDispatchResults.remoteCursors.size() == 1); - auto&& remoteCursor = std::move(shardDispatchResults.remoteCursors.front()); - const auto shardId = remoteCursor->getShardId().toString(); - const auto reply = uassertStatusOK(storePossibleCursor(opCtx, - namespaces.requestedNss, - std::move(remoteCursor), - privileges, - expCtx->tailableMode)); - return appendCursorResponseToCommandResult(shardId, reply, result); - } + // Parse and optimize the full pipeline. + auto pipeline = uassertStatusOK(Pipeline::parse(request.getPipeline(), expCtx)); + pipeline->optimizePipeline(); + return pipeline; + }; - // If we have the exchange spec then dispatch all consumers. - if (shardDispatchResults.exchangeSpec) { - shardDispatchResults = dispatchExchangeConsumerPipeline(expCtx, - namespaces.executionNss, - request, - litePipe, - collationObj, - &shardDispatchResults); + auto targetingStatus = + sharded_agg_helpers::AggregationTargeter::make(opCtx, + namespaces.executionNss, + pipelineBuilder, + involvedNamespaces, + hasChangeStream, + litePipe.allowedToPassthroughFromMongos()); + if (!targetingStatus.isOK()) { + appendEmptyResultSetWithStatus( + opCtx, namespaces.requestedNss, targetingStatus.getStatus(), result); + return Status::OK(); } - // If we reach here, we have a merge pipeline to dispatch. - return dispatchMergingPipeline(expCtx, - namespaces, - request, - litePipe, - routingInfo, - std::move(shardDispatchResults), - result, - privileges); -} + auto targeter = std::move(targetingStatus.getValue()); + switch (targeter.policy) { + case sharded_agg_helpers::AggregationTargeter::TargetingPolicy::kPassthrough: { + // A pipeline with $changeStream should never be allowed to passthrough. + invariant(!hasChangeStream); + return sharded_agg_helpers::runPipelineOnPrimaryShard(opCtx, + namespaces, + targeter.routingInfo->db(), + request.getExplain(), + request.serializeToCommandObj(), + privileges, + result); + } -Status ClusterAggregate::aggPassthrough(OperationContext* opCtx, - const Namespaces& namespaces, - const CachedDatabaseInfo& dbInfo, - const AggregationRequest& aggRequest, - const LiteParsedPipeline& liteParsedPipeline, - const PrivilegeVector& privileges, - BSONObjBuilder* out) { - // Format the command for the shard. This adds the 'fromMongos' field, wraps the command as an - // explain if necessary, and rewrites the result into a format safe to forward to shards. - BSONObj cmdObj = CommandHelpers::filterCommandRequestForPassthrough( - sharded_agg_helpers::createPassthroughCommandForShard( - opCtx, aggRequest, boost::none, nullptr, BSONObj())); - - const auto shardId = dbInfo.primary()->getId(); - const auto cmdObjWithShardVersion = (shardId != ShardRegistry::kConfigServerShardId) - ? appendShardVersion(std::move(cmdObj), ChunkVersion::UNSHARDED()) - : std::move(cmdObj); - - MultiStatementTransactionRequestsSender ars( - opCtx, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), - namespaces.executionNss.db().toString(), - {{shardId, appendDbVersionIfPresent(cmdObjWithShardVersion, dbInfo)}}, - ReadPreferenceSetting::get(opCtx), - Shard::RetryPolicy::kIdempotent); - auto response = ars.next(); - invariant(ars.done()); - - uassertStatusOK(response.swResponse); - auto commandStatus = getStatusFromCommandResult(response.swResponse.getValue().data); - - if (ErrorCodes::isStaleShardVersionError(commandStatus.code())) { - uassertStatusOK(commandStatus.withContext("command failed because of stale config")); - } else if (ErrorCodes::isSnapshotError(commandStatus.code())) { - uassertStatusOK( - commandStatus.withContext("command failed because can not establish a snapshot")); - } + case sharded_agg_helpers::AggregationTargeter::TargetingPolicy::kMongosRequired: { + auto expCtx = targeter.pipeline->getContext(); + // If this is an explain write the explain output and return. + if (expCtx->explain) { + *result << "splitPipeline" << BSONNULL << "mongos" + << Document{ + {"host", getHostNameCachedAndPort()}, + {"stages", targeter.pipeline->writeExplainOps(*expCtx->explain)}}; + return Status::OK(); + } - BSONObj result; - if (aggRequest.getExplain()) { - // If this was an explain, then we get back an explain result object rather than a cursor. - result = response.swResponse.getValue().data; - } else { - auto tailMode = liteParsedPipeline.hasChangeStream() - ? TailableModeEnum::kTailableAndAwaitData - : TailableModeEnum::kNormal; - result = uassertStatusOK( - storePossibleCursor(opCtx, - shardId, - *response.shardHostAndPort, - response.swResponse.getValue().data, - namespaces.requestedNss, - Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), - Grid::get(opCtx)->getCursorManager(), - privileges, - tailMode)); - } + return sharded_agg_helpers::runPipelineOnMongoS(namespaces, + request.getBatchSize(), + std::move(targeter.pipeline), + result, + privileges); + } - // First append the properly constructed writeConcernError. It will then be skipped - // in appendElementsUnique. - if (auto wcErrorElem = result["writeConcernError"]) { - appendWriteConcernErrorToCmdResponse(shardId, wcErrorElem, *out); + case sharded_agg_helpers::AggregationTargeter::TargetingPolicy::kAnyShard: { + return sharded_agg_helpers::dispatchPipelineAndMerge(opCtx, + std::move(targeter), + request.serializeToCommandObj(), + request.getBatchSize(), + namespaces, + privileges, + result, + hasChangeStream); + } } - out->appendElementsUnique(CommandHelpers::filterCommandReplyForPassthrough(result)); - - return getStatusFromCommandResult(out->asTempObj()); + MONGO_UNREACHABLE; } Status ClusterAggregate::retryOnViewError(OperationContext* opCtx, diff --git a/src/mongo/s/query/cluster_aggregate.h b/src/mongo/s/query/cluster_aggregate.h index 630f4d987b1..9d3c0a90eba 100644 --- a/src/mongo/s/query/cluster_aggregate.h +++ b/src/mongo/s/query/cluster_aggregate.h @@ -104,15 +104,6 @@ public: const PrivilegeVector& privileges, BSONObjBuilder* result, unsigned numberRetries = 0); - -private: - static Status aggPassthrough(OperationContext*, - const Namespaces&, - const CachedDatabaseInfo&, - const AggregationRequest&, - const LiteParsedPipeline&, - const PrivilegeVector& privileges, - BSONObjBuilder* result); }; } // namespace mongo diff --git a/src/mongo/s/query/cluster_aggregation_planner.cpp b/src/mongo/s/query/cluster_aggregation_planner.cpp index bf8665894b8..d3c899755cb 100644 --- a/src/mongo/s/query/cluster_aggregation_planner.cpp +++ b/src/mongo/s/query/cluster_aggregation_planner.cpp @@ -479,12 +479,12 @@ SplitPipeline splitPipeline(std::unique_ptr<Pipeline, PipelineDeleter> pipeline) } void addMergeCursorsSource(Pipeline* mergePipeline, - const LiteParsedPipeline& liteParsedPipeline, BSONObj cmdSentToShards, std::vector<OwnedRemoteCursor> ownedCursors, const std::vector<ShardId>& targetedShards, boost::optional<BSONObj> shardCursorsSortSpec, - std::shared_ptr<executor::TaskExecutor> executor) { + std::shared_ptr<executor::TaskExecutor> executor, + bool hasChangeStream) { auto* opCtx = mergePipeline->getContext()->opCtx; AsyncResultsMergerParams armParams; armParams.setSort(shardCursorsSortSpec); @@ -524,7 +524,7 @@ void addMergeCursorsSource(Pipeline* mergePipeline, auto mergeCursorsStage = DocumentSourceMergeCursors::create( std::move(executor), std::move(armParams), mergePipeline->getContext()); - if (liteParsedPipeline.hasChangeStream()) { + if (hasChangeStream) { mergePipeline->addInitialSource(DocumentSourceUpdateOnAddShard::create( mergePipeline->getContext(), Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(), diff --git a/src/mongo/s/query/cluster_aggregation_planner.h b/src/mongo/s/query/cluster_aggregation_planner.h index 75baded9e31..3b8b70fcd1d 100644 --- a/src/mongo/s/query/cluster_aggregation_planner.h +++ b/src/mongo/s/query/cluster_aggregation_planner.h @@ -81,12 +81,12 @@ SplitPipeline splitPipeline(std::unique_ptr<Pipeline, PipelineDeleter> pipeline) * front of 'mergePipeline'. */ void addMergeCursorsSource(Pipeline* mergePipeline, - const LiteParsedPipeline&, BSONObj cmdSentToShards, - std::vector<OwnedRemoteCursor> remoteCursors, + std::vector<OwnedRemoteCursor> ownedCursors, const std::vector<ShardId>& targetedShards, boost::optional<BSONObj> shardCursorsSortSpec, - std::shared_ptr<executor::TaskExecutor> executor); + std::shared_ptr<executor::TaskExecutor> executor, + bool hasChangeStream); /** * Builds a ClusterClientCursor which will execute 'pipeline'. If 'pipeline' consists entirely of |