| author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2021-01-08 14:49:27 +0000 |
| --- | --- | --- |
| committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-01-08 15:32:55 +0000 |
| commit | 0416c69269754a28d98c6288b09738d98c8bdfa2 (patch) | |
| tree | d1be9212a0eb6fca35231407d6299a2538096ef2 | |
| parent | 6221e2ee0ff5527f13d25a28a5de6a245bd0e641 (diff) | |
| download | mongo-0416c69269754a28d98c6288b09738d98c8bdfa2.tar.gz | |
SERVER-49293 Make resharding collection cloning resume from highest _id.
6 files changed, 254 insertions, 23 deletions
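At a high level, the change makes collection cloning restartable: before (re)issuing its cloning aggregation, the cloner looks up the highest `_id` already written to the temporary resharding collection, prepends a `$match` with `$gte` on `_id`, and skips the one document it re-reads at the resume point. The mongo-shell sketch below is only an illustration of that flow under assumed names (`sourceColl`, `tempColl`, `cloneWithResume`); the real logic lives in the C++ changes to `ReshardingCollectionCloner` further down, which rely on an `{_id: 1}` hint and merge-by-`$sortKey` rather than the explicit `$sort` shown here.

```js
// Hypothetical sketch of the resume flow this patch adds; not server code.
function cloneWithResume(sourceColl, tempColl) {
    // Resume point: the highest _id already copied into the temporary resharding collection.
    const lastCopied = tempColl.find().sort({_id: -1}).limit(1).toArray()[0];

    const stages = [];
    if (lastCopied !== undefined) {
        // Same shape as the $match stage makePipeline() now prepends when a resume _id exists.
        stages.push({$match: {$expr: {$gte: ["$_id", {$literal: lastCopied._id}]}}});
    }
    // Stand-in for the {_id: 1} hint and $sortKey merge the server uses; keeps the sketch ordered.
    stages.push({$sort: {_id: 1}});

    let skippedResumeDoc = (lastCopied === undefined);
    sourceColl.aggregate(stages).forEach(doc => {
        if (!skippedResumeDoc) {
            // $gte re-returns the resume document itself; verify it and skip it, mirroring the
            // uassert(4929301)/uassert(4929302) checks added to ReshardingCollectionCloner::run().
            assert.eq(lastCopied._id, doc._id);
            skippedResumeDoc = true;
            return;
        }
        // A document that was already copied surfaces as a DuplicateKey error here, as the new
        // jstest demonstrates with its second "d" insert.
        assert.commandWorked(tempColl.insert(doc));
    });
}
```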
diff --git a/jstests/sharding/resharding_collection_cloner_resuming.js b/jstests/sharding/resharding_collection_cloner_resuming.js
new file mode 100644
index 00000000000..d8a1b438eda
--- /dev/null
+++ b/jstests/sharding/resharding_collection_cloner_resuming.js
@@ -0,0 +1,95 @@
+/**
+ * Tests the resuming behavior of resharding's collection cloning.
+ *
+ * @tags: [
+ *   requires_fcv_49,
+ *   uses_atclustertime,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/uuid_util.js");
+load("jstests/sharding/libs/create_sharded_collection_util.js");
+
+const st = new ShardingTest({mongos: 1, config: 1, shards: 2, rs: {nodes: 1}});
+
+const inputCollection = st.s.getCollection("reshardingDb.coll");
+
+CreateShardedCollectionUtil.shardCollectionWithChunks(inputCollection, {oldKey: 1}, [
+    {min: {oldKey: MinKey}, max: {oldKey: 0}, shard: st.shard0.shardName},
+    {min: {oldKey: 0}, max: {oldKey: MaxKey}, shard: st.shard1.shardName},
+]);
+
+const inputCollectionUUID =
+    getUUIDFromListCollections(inputCollection.getDB(), inputCollection.getName());
+const inputCollectionUUIDString = extractUUIDFromObject(inputCollectionUUID);
+
+const temporaryReshardingCollection =
+    st.s.getCollection(`reshardingDb.system.resharding.${inputCollectionUUIDString}`);
+
+CreateShardedCollectionUtil.shardCollectionWithChunks(
+    temporaryReshardingCollection,
+    {newKey: 1},
+    [{min: {newKey: MinKey}, max: {newKey: MaxKey}, shard: st.shard0.shardName}]);
+
+const documents = [
+    {_id: "a", info: "stays on shard0", oldKey: -10, newKey: 0},
+    {_id: "b", info: "moves to shard0", oldKey: 10, newKey: 0},
+    {_id: "c", info: "stays on shard0", oldKey: -10, newKey: 0},
+    {_id: "d", info: "moves to shard0", oldKey: 10, newKey: 0},
+];
+assert.commandWorked(inputCollection.insert(documents));
+const originalInsertsTs = inputCollection.getDB().getSession().getOperationTime();
+
+function testReshardCloneCollection(shard, expectedDocs, atClusterTime) {
+    const shardPrimary = shard.rs.getPrimary();
+    shardPrimary.getDB(inputCollection.getDB().getName())
+        .getSession()
+        .advanceClusterTime(inputCollection.getDB().getSession().getClusterTime());
+
+    assert.commandWorked(shardPrimary.adminCommand({
+        testReshardCloneCollection: inputCollection.getFullName(),
+        shardKey: {newKey: 1},
+        uuid: inputCollectionUUID,
+        shardId: shard.shardName,
+        atClusterTime: atClusterTime,
+        outputNs: temporaryReshardingCollection.getFullName(),
+    }));
+
+    // We sort by _id so the order of `expectedDocs` can be deterministic.
+    assert.eq(expectedDocs,
+              shardPrimary.getCollection(temporaryReshardingCollection.getFullName())
+                  .find()
+                  .sort({_id: 1})
+                  .toArray());
+}
+
+testReshardCloneCollection(st.shard0, documents, originalInsertsTs);
+
+// Cloning the sharded collection a second time should be a successful no-op.
+testReshardCloneCollection(st.shard0, documents, originalInsertsTs);
+
+// Removing the "c" and "d" documents from the temporary resharding collection to simulate the
+// cloner as having made partial progress. It should successfully resume from the "b" document.
+assert.commandWorked(temporaryReshardingCollection.remove({_id: {$gt: "b"}}, {justOne: false}));
+testReshardCloneCollection(st.shard0, documents, originalInsertsTs);
+
+// Insert another "d" document and verify that resuming now fails due to a duplicate key error.
+assert.commandWorked(
+    inputCollection.insert({_id: "d", info: "stays on shard0", oldKey: -10, newKey: 0}));
+
+const duplicateInsertTs = inputCollection.getDB().getSession().getOperationTime();
+
+assert.commandFailedWithCode(st.shard0.adminCommand({
+    testReshardCloneCollection: inputCollection.getFullName(),
+    shardKey: {newKey: 1},
+    uuid: inputCollectionUUID,
+    shardId: st.shard0.shardName,
+    atClusterTime: duplicateInsertTs,
+    outputNs: temporaryReshardingCollection.getFullName(),
+}),
+                             ErrorCodes.DuplicateKey);
+
+st.stop();
+})();
diff --git a/src/mongo/db/pipeline/sharded_agg_helpers.cpp b/src/mongo/db/pipeline/sharded_agg_helpers.cpp
index 039288dc823..8a0fbc10998 100644
--- a/src/mongo/db/pipeline/sharded_agg_helpers.cpp
+++ b/src/mongo/db/pipeline/sharded_agg_helpers.cpp
@@ -600,7 +600,8 @@ void abandonCacheIfSentToShards(Pipeline* shardsPipeline) {
 std::unique_ptr<Pipeline, PipelineDeleter> targetShardsAndAddMergeCursors(
     const boost::intrusive_ptr<ExpressionContext>& expCtx,
-    stdx::variant<std::unique_ptr<Pipeline, PipelineDeleter>, AggregateCommand> targetRequest) {
+    stdx::variant<std::unique_ptr<Pipeline, PipelineDeleter>, AggregateCommand> targetRequest,
+    boost::optional<BSONObj> shardCursorsSortSpec) {
     auto&& [aggRequest, pipeline] = [&] {
         return stdx::visit(
             visit_helper::Overloaded{
@@ -642,10 +643,12 @@ std::unique_ptr<Pipeline, PipelineDeleter> targetShardsAndAddMergeCursors(
     }
 
     std::unique_ptr<Pipeline, PipelineDeleter> mergePipeline;
-    boost::optional<BSONObj> shardCursorsSortSpec = boost::none;
     if (shardDispatchResults.splitPipeline) {
         mergePipeline = std::move(shardDispatchResults.splitPipeline->mergePipeline);
-        shardCursorsSortSpec = shardDispatchResults.splitPipeline->shardCursorsSortSpec;
+        if (shardDispatchResults.splitPipeline->shardCursorsSortSpec) {
+            uassert(4929304, "Split pipeline provides its own sort already", !shardCursorsSortSpec);
+            shardCursorsSortSpec = shardDispatchResults.splitPipeline->shardCursorsSortSpec;
+        }
     } else {
         // We have not split the pipeline, and will execute entirely on the remote shards. Set up an
         // empty local pipeline which we will attach the merge cursors stage to.
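The `sharded_agg_helpers.cpp` change above lets a caller pass a `shardCursorsSortSpec` even when the pipeline runs entirely on the shards, so the `$mergeCursors` stage on the merging node can interleave the per-shard streams in `_id` order. The snippet below is an illustrative, stand-alone sketch of that merge behavior; the helper name and sample batches are made up, and this is not how the server's AsyncResultsMerger is implemented.

```js
// Illustrative only: the effect of passing {_id: 1} as shardCursorsSortSpec. Each donor
// shard's batch is already sorted by _id, and the merging node interleaves them into one
// globally ordered stream.
function mergeBatchesById(shardBatches) {
    const merged = [];
    const cursors = shardBatches.map(batch => ({batch, pos: 0}));
    while (true) {
        let best = null;
        for (const cursor of cursors) {
            if (cursor.pos >= cursor.batch.length) {
                continue;  // this shard's batch is exhausted
            }
            if (best === null ||
                bsonWoCompare({_id: cursor.batch[cursor.pos]._id},
                              {_id: best.batch[best.pos]._id}) < 0) {
                best = cursor;
            }
        }
        if (best === null) {
            return merged;
        }
        merged.push(best.batch[best.pos++]);
    }
}

// Example: two per-shard batches, each sorted by _id, merge into one ordered result.
const merged = mergeBatchesById([
    [{_id: "a"}, {_id: "c"}],  // shard0
    [{_id: "b"}, {_id: "d"}],  // shard1
]);
assert.eq(["a", "b", "c", "d"], merged.map(doc => doc._id));
```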
diff --git a/src/mongo/db/pipeline/sharded_agg_helpers.h b/src/mongo/db/pipeline/sharded_agg_helpers.h
index b2be0661aab..2f8831f8bfa 100644
--- a/src/mongo/db/pipeline/sharded_agg_helpers.h
+++ b/src/mongo/db/pipeline/sharded_agg_helpers.h
@@ -204,7 +204,8 @@ std::unique_ptr<Pipeline, PipelineDeleter> attachCursorToPipeline(Pipeline* owne
  */
 std::unique_ptr<Pipeline, PipelineDeleter> targetShardsAndAddMergeCursors(
     const boost::intrusive_ptr<ExpressionContext>& expCtx,
-    stdx::variant<std::unique_ptr<Pipeline, PipelineDeleter>, AggregateCommand> targetRequest);
+    stdx::variant<std::unique_ptr<Pipeline, PipelineDeleter>, AggregateCommand> targetRequest,
+    boost::optional<BSONObj> shardCursorsSortSpec = boost::none);
 
 /**
  * For a sharded or unsharded collection, establishes a remote cursor on only the specified shard,
diff --git a/src/mongo/db/s/resharding/resharding_collection_cloner.cpp b/src/mongo/db/s/resharding/resharding_collection_cloner.cpp
index 5819b96c22f..67e4f844de7 100644
--- a/src/mongo/db/s/resharding/resharding_collection_cloner.cpp
+++ b/src/mongo/db/s/resharding/resharding_collection_cloner.cpp
@@ -41,6 +41,7 @@
 #include "mongo/db/client.h"
 #include "mongo/db/concurrency/write_conflict_exception.h"
 #include "mongo/db/curop.h"
+#include "mongo/db/dbhelpers.h"
 #include "mongo/db/exec/document_value/document.h"
 #include "mongo/db/exec/document_value/value.h"
 #include "mongo/db/pipeline/aggregation_request_helper.h"
@@ -48,6 +49,7 @@
 #include "mongo/db/pipeline/document_source_match.h"
 #include "mongo/db/pipeline/document_source_replace_root.h"
 #include "mongo/db/pipeline/sharded_agg_helpers.h"
+#include "mongo/db/query/query_request.h"
 #include "mongo/db/s/resharding/resharding_server_parameters_gen.h"
 #include "mongo/db/s/resharding_util.h"
 #include "mongo/db/service_context.h"
@@ -60,6 +62,20 @@
 #include "mongo/util/str.h"
 
 namespace mongo {
+namespace {
+
+bool collectionHasSimpleCollation(OperationContext* opCtx, const NamespaceString& nss) {
+    auto catalogCache = Grid::get(opCtx)->catalogCache();
+    auto sourceChunkMgr = uassertStatusOK(catalogCache->getCollectionRoutingInfo(opCtx, nss));
+
+    uassert(ErrorCodes::NamespaceNotSharded,
+            str::stream() << "Expected collection " << nss << " to be sharded",
+            sourceChunkMgr.isSharded());
+
+    return !sourceChunkMgr.getDefaultCollator();
+}
+
+}  // namespace
 
 ReshardingCollectionCloner::ReshardingCollectionCloner(ShardKeyPattern newShardKeyPattern,
                                                        NamespaceString sourceNss,
@@ -75,7 +91,9 @@ ReshardingCollectionCloner::ReshardingCollectionCloner(ShardKeyPattern newShardK
       _outputNss(std::move(outputNss)) {}
 
 std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::makePipeline(
-    OperationContext* opCtx, std::shared_ptr<MongoProcessInterface> mongoProcessInterface) {
+    OperationContext* opCtx,
+    std::shared_ptr<MongoProcessInterface> mongoProcessInterface,
+    Value resumeId) {
     using Doc = Document;
     using Arr = std::vector<Value>;
     using V = Value;
@@ -91,6 +109,19 @@ std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::makePipel
         NamespaceString(NamespaceString::kConfigDb, "cache.chunks." + tempNss.ns());
     resolvedNamespaces[tempCacheChunksNss.coll()] = {tempCacheChunksNss, std::vector<BSONObj>{}};
 
+    // sharded_agg_helpers::targetShardsAndAddMergeCursors() ignores the collation set on the
+    // AggregationRequest (or lack thereof) and instead only considers the collator set on the
+    // ExpressionContext. Setting nullptr as the collator on the ExpressionContext means that the
+    // aggregation pipeline is always using the "simple" collation, even when the collection default
+    // collation for _sourceNss is non-simple. The chunk ranges in the $lookup stage must be
+    // compared using the simple collation because collections are always sharded using the simple
+    // collation. However, resuming by _id is only efficient (i.e. non-blocking seek/sort) when the
+    // aggregation pipeline would be using the collection's default collation. We cannot do both so
+    // we choose to disallow automatic resuming for collections with non-simple default collations.
+    uassert(4929303,
+            "Cannot resume cloning when sharded collection has non-simple default collation",
+            resumeId.missing() || collectionHasSimpleCollation(opCtx, _sourceNss));
+
     auto expCtx = make_intrusive<ExpressionContext>(opCtx,
                                                     boost::none, /* explain */
                                                     false,       /* fromMongos */
@@ -107,6 +138,14 @@ std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::makePipel
 
     Pipeline::SourceContainer stages;
 
+    if (!resumeId.missing()) {
+        stages.emplace_back(DocumentSourceMatch::create(
+            Doc{{"$expr",
+                 Doc{{"$gte", Arr{V{"$_id"_sd}, V{Doc{{"$literal", std::move(resumeId)}}}}}}}}
+                .toBson(),
+            expCtx));
+    }
+
     stages.emplace_back(DocumentSourceReplaceRoot::createFromBson(
         fromjson("{$replaceWith: {original: '$$ROOT'}}").firstElement(), expCtx));
 
@@ -154,16 +193,62 @@ std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::makePipel
     stages.emplace_back(
         DocumentSourceMatch::create(fromjson("{intersectingChunk: {$ne: []}}"), expCtx));
 
-    stages.emplace_back(DocumentSourceReplaceRoot::createFromBson(
-        fromjson("{$replaceWith: '$original'}").firstElement(), expCtx));
-    return Pipeline::create(std::move(stages), expCtx);
+
+    // We use $arrayToObject to synthesize the $sortKeys needed by the AsyncResultsMerger to merge
+    // the results from all of the donor shards by {_id: 1}. This expression wouldn't be correct if
+    // the aggregation pipeline was using a non-"simple" collation.
+    stages.emplace_back(
+        DocumentSourceReplaceRoot::createFromBson(fromjson("{$replaceWith: {$mergeObjects: [\
+            '$original',\
+            {$arrayToObject: {$concatArrays: [[{\
+                k: {$literal: '$sortKey'},\
+                v: ['$original._id']\
+            }]]}}\
+        ]}}")
+                                                      .firstElement(),
+                                                  expCtx));
+
+    return Pipeline::create(std::move(stages), std::move(expCtx));
+}
+
+Value ReshardingCollectionCloner::_findHighestInsertedId(OperationContext* opCtx) {
+    AutoGetCollection outputColl(opCtx, _outputNss, MODE_IS);
+    uassert(ErrorCodes::NamespaceNotFound,
+            str::stream() << "Resharding collection cloner's output collection '" << _outputNss
+                          << "' did not already exist",
+            outputColl);
+
+    auto qr = std::make_unique<QueryRequest>(_outputNss);
+    qr->setLimit(1);
+    qr->setSort(BSON("_id" << -1));
+
+    auto recordId = Helpers::findOne(opCtx, *outputColl, std::move(qr), true /* requireIndex */);
+    if (!recordId.isNormal()) {
+        return Value{};
+    }
+
+    auto doc = outputColl->docFor(opCtx, recordId).value();
+    auto value = Value{doc["_id"]};
+    uassert(4929300,
+            "Missing _id field for document in temporary resharding collection",
+            !value.missing());
+
+    return value;
 }
 
 std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::_targetAggregationRequest(
     OperationContext* opCtx, const Pipeline& pipeline) {
     AggregateCommand request(_sourceNss, pipeline.serializeToBson());
     request.setCollectionUUID(_sourceUUID);
-    request.setHint(BSON("_id" << 1));
+
+    auto hint = collectionHasSimpleCollation(opCtx, _sourceNss)
+        ? boost::optional<BSONObj>{BSON("_id" << 1)}
+        : boost::none;
+
+    if (hint) {
+        request.setHint(*hint);
+    }
+
     request.setReadConcern(BSON(repl::ReadConcernArgs::kLevelFieldName
                                 << repl::readConcernLevels::kSnapshotName
                                 << repl::ReadConcernArgs::kAtClusterTimeFieldName
@@ -175,8 +260,11 @@ std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::_targetAg
         _sourceNss,
         "targeting donor shards for resharding collection cloning"_sd,
         [&] {
+            // We use the hint as an implied sort for $mergeCursors because
+            // the aggregation pipeline synthesizes the necessary $sortKeys
+            // fields in the result set.
             return sharded_agg_helpers::targetShardsAndAddMergeCursors(
-                pipeline.getContext(), request);
+                pipeline.getContext(), request, hint);
         });
 }
 
@@ -276,8 +364,36 @@ ExecutorFuture<void> ReshardingCollectionCloner::run(
         return AsyncTry([this, chainCtx] {
                    if (!chainCtx->pipeline) {
                        chainCtx->pipeline = _withTemporaryOperationContext([&](auto* opCtx) {
+                           auto idToResumeFrom = _findHighestInsertedId(opCtx);
                            auto pipeline = _targetAggregationRequest(
-                               opCtx, *makePipeline(opCtx, MongoProcessInterface::create(opCtx)));
+                               opCtx,
+                               *makePipeline(
+                                   opCtx, MongoProcessInterface::create(opCtx), idToResumeFrom));
+
+                           if (!idToResumeFrom.missing()) {
+                               // Skip inserting the first document retrieved after resuming because
+                               // $gte was used in the aggregation pipeline.
+                               auto firstDoc = pipeline->getNext();
+                               uassert(4929301,
+                                       str::stream()
+                                           << "Expected pipeline to retrieve document with _id: "
+                                           << redact(idToResumeFrom.toString()),
+                                       firstDoc);
+
+                               // Note that the following uassert() could throw because we're using the
+                               // simple string comparator and the collection could have a non-simple
+                               // collation. However, it would still be correct to throw an exception
+                               // because it would mean the collection being resharded contains multiple
+                               // documents with the same _id value as far as global uniqueness is
+                               // concerned.
+                               const auto& firstId = (*firstDoc)["_id"];
+                               uassert(4929302,
+                                       str::stream()
+                                           << "Expected pipeline to retrieve document with _id: "
+                                           << redact(idToResumeFrom.toString())
+                                           << ", but got _id: " << redact(firstId.toString()),
+                                       ValueComparator::kInstance.evaluate(firstId == idToResumeFrom));
+                           }
+
                            pipeline->detachFromOperationContext();
                            pipeline.get_deleter().dismissDisposal();
diff --git a/src/mongo/db/s/resharding/resharding_collection_cloner.h b/src/mongo/db/s/resharding/resharding_collection_cloner.h
index 74710fe63bb..d40209ccf03 100644
--- a/src/mongo/db/s/resharding/resharding_collection_cloner.h
+++ b/src/mongo/db/s/resharding/resharding_collection_cloner.h
@@ -34,6 +34,7 @@
 
 #include "mongo/bson/timestamp.h"
 #include "mongo/db/catalog/collection_catalog.h"
+#include "mongo/db/exec/document_value/value.h"
 #include "mongo/db/namespace_string.h"
 #include "mongo/db/pipeline/expression_context.h"
 #include "mongo/db/pipeline/pipeline.h"
@@ -62,12 +63,16 @@ public:
                               NamespaceString outputNss);
 
     std::unique_ptr<Pipeline, PipelineDeleter> makePipeline(
-        OperationContext* opCtx, std::shared_ptr<MongoProcessInterface> mongoProcessInterface);
+        OperationContext* opCtx,
+        std::shared_ptr<MongoProcessInterface> mongoProcessInterface,
+        Value resumeId = Value());
 
     ExecutorFuture<void> run(std::shared_ptr<executor::TaskExecutor> executor,
                              CancelationToken cancelToken);
 
 private:
+    Value _findHighestInsertedId(OperationContext* opCtx);
+
     std::unique_ptr<Pipeline, PipelineDeleter> _targetAggregationRequest(OperationContext* opCtx,
                                                                          const Pipeline& pipeline);
 
diff --git a/src/mongo/db/s/resharding/resharding_collection_cloner_test.cpp b/src/mongo/db/s/resharding/resharding_collection_cloner_test.cpp
index eefefd0fa11..5116cb46583 100644
--- a/src/mongo/db/s/resharding/resharding_collection_cloner_test.cpp
+++ b/src/mongo/db/s/resharding/resharding_collection_cloner_test.cpp
@@ -124,11 +124,13 @@ TEST_F(ReshardingCollectionClonerTest, MinKeyChunk) {
 
     auto next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 1 << "x" << MINKEY), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 1 << "x" << MINKEY << "$sortKey" << BSON_ARRAY(1)),
+                             next->toBson());
 
     next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 2 << "x" << -0.001), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 2 << "x" << -0.001 << "$sortKey" << BSON_ARRAY(2)),
+                             next->toBson());
 
     ASSERT_FALSE(pipeline->getNext());
 }
@@ -148,19 +150,23 @@ TEST_F(ReshardingCollectionClonerTest, MaxKeyChunk) {
 
     auto next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 3 << "x" << 0LL), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 3 << "x" << 0LL << "$sortKey" << BSON_ARRAY(3)),
+                             next->toBson());
 
     next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 4 << "x" << 0.0), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 4 << "x" << 0.0 << "$sortKey" << BSON_ARRAY(4)),
+                             next->toBson());
 
     next = pipeline->getNext();
    ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 5 << "x" << 0.001), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 5 << "x" << 0.001 << "$sortKey" << BSON_ARRAY(5)),
+                             next->toBson());
 
     next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 6 << "x" << MAXKEY), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 6 << "x" << MAXKEY << "$sortKey" << BSON_ARRAY(6)),
+                             next->toBson());
 
     ASSERT_FALSE(pipeline->getNext());
 }
@@ -193,19 +199,23 @@ TEST_F(ReshardingCollectionClonerTest, HashedShardKey) {
 
     auto next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 3 << "x" << -0.123), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 3 << "x" << -0.123 << "$sortKey" << BSON_ARRAY(3)),
+                             next->toBson());
 
     next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 4 << "x" << 0), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 4 << "x" << 0 << "$sortKey" << BSON_ARRAY(4)),
+                             next->toBson());
 
     next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 5 << "x" << 0LL), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 5 << "x" << 0LL << "$sortKey" << BSON_ARRAY(5)),
+                             next->toBson());
 
     next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 6 << "x" << 0.123), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 6 << "x" << 0.123 << "$sortKey" << BSON_ARRAY(6)),
+                             next->toBson());
 
     ASSERT_FALSE(pipeline->getNext());
 }
@@ -238,7 +248,8 @@ TEST_F(ReshardingCollectionClonerTest, CompoundHashedShardKey) {
 
     auto next = pipeline->getNext();
     ASSERT(next);
-    ASSERT_BSONOBJ_BINARY_EQ(BSON("_id" << 4 << "x" << 0 << "y" << 0), next->toBson());
+    ASSERT_BSONOBJ_BINARY_EQ(
+        BSON("_id" << 4 << "x" << 0 << "y" << 0 << "$sortKey" << BSON_ARRAY(4)), next->toBson());
 
     ASSERT_FALSE(pipeline->getNext());
 }
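As the updated unit-test expectations show, each cloned document now carries a synthesized `$sortKey: [<_id>]` array, which is what `$mergeCursors` sorts on when combining the donor shards' streams. The shell sketch below is an equivalent, simplified rendering of the final `$replaceWith` stage; `db.coll` and the sample input are assumptions, and the real stage operates on the `$original` sub-document produced by the earlier stages rather than on `$$ROOT`. `$arrayToObject` is used because a field name beginning with `$` cannot be written literally in the stage's object syntax.

```js
// Simplified shell form of the $sortKey-synthesizing stage added to makePipeline().
const sortKeyStage = {
    $replaceWith: {
        $mergeObjects: [
            "$$ROOT",
            // $literal keeps "$sortKey" from being parsed as a field path; the value is the
            // document's _id wrapped in an array, matching the {_id: 1} merge sort.
            {$arrayToObject: {$concatArrays: [[{k: {$literal: "$sortKey"}, v: ["$_id"]}]]}}
        ]
    }
};

// With {_id: 2, x: -0.001} as input, the stage yields the shape the updated unit tests
// assert on: {_id: 2, x: -0.001, $sortKey: [2]}.
db.coll.aggregate([sortKeyStage]);
```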