-rw-r--r--   jstests/sharding/dump_coll_metadata.js                             |  10
-rw-r--r--   jstests/sharding/migration_failure.js                              |   5
-rw-r--r--   src/mongo/db/s/split_chunk.cpp                                     | 148
-rw-r--r--   src/mongo/s/catalog/sharding_catalog_client.h                      |   2
-rw-r--r--   src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp |  50
-rw-r--r--   src/mongo/s/commands/cluster_split_cmd.cpp                         |   2
6 files changed, 81 insertions, 136 deletions
diff --git a/jstests/sharding/dump_coll_metadata.js b/jstests/sharding/dump_coll_metadata.js
index dbce60e1290..9b38a3c416c 100644
--- a/jstests/sharding/dump_coll_metadata.js
+++ b/jstests/sharding/dump_coll_metadata.js
@@ -33,14 +33,18 @@
     assert(shardAdmin.runCommand({getShardVersion: coll + "xyz", fullMetadata: true}).metadata !=
            undefined);
 
-    // Make sure we get multiple chunks after a split
-    assert(admin.runCommand({split: coll + "", middle: {_id: 0}}).ok);
+    // Make sure we get multiple chunks after a split and refresh -- splits by themselves do not
+    // cause the shard to refresh.
+    assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));
+    assert.commandWorked(
+        st.shard0.getDB('admin').runCommand({forceRoutingTableRefresh: coll + ""}));
 
-    assert(shardAdmin.runCommand({getShardVersion: coll + ""}).ok);
+    assert.commandWorked(shardAdmin.runCommand({getShardVersion: coll + ""}));
     printjson(shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true}));
 
     // Make sure we have chunks info
     result = shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true});
+    assert.commandWorked(result);
     metadata = result.metadata;
     assert.eq(metadata.chunks.length, 2);
diff --git a/jstests/sharding/migration_failure.js b/jstests/sharding/migration_failure.js
index f49a48a0169..7e3ba438262 100644
--- a/jstests/sharding/migration_failure.js
+++ b/jstests/sharding/migration_failure.js
@@ -40,9 +40,10 @@
     assert.eq(oldVersion.t,
               newVersion.t,
               "The shard version major value should not change after a failed migration");
-    assert.eq(oldVersion.i,
+    // Split does not cause a shard routing table refresh, but the moveChunk attempt will.
+    assert.eq(2,
               newVersion.i,
-              "The shard version minor value should not change after a failed migration");
+              "The shard routing table should refresh on a failed migration and show the split");
 
     assert.commandWorked(st.shard0.getDB("admin").runCommand(
         {configureFailPoint: 'failMigrationCommit', mode: 'off'}));
diff --git a/src/mongo/db/s/split_chunk.cpp b/src/mongo/db/s/split_chunk.cpp
index 28cdab7fd83..3dea2009f94 100644
--- a/src/mongo/db/s/split_chunk.cpp
+++ b/src/mongo/db/s/split_chunk.cpp
@@ -134,105 +134,40 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
     ShardingState* shardingState = ShardingState::get(opCtx);
     std::string errmsg;
 
-    const BSONObj min = chunkRange.getMin();
-    const BSONObj max = chunkRange.getMax();
-
     //
     // Lock the collection's metadata and get highest version for the current shard
     // TODO(SERVER-25086): Remove distLock acquisition from split chunk
     //
     const std::string whyMessage(
-        str::stream() << "splitting chunk [" << min << ", " << max << ") in " << nss.toString());
+        str::stream() << "splitting chunk " << chunkRange.toString() << " in " << nss.toString());
     auto scopedDistLock = Grid::get(opCtx)->catalogClient()->getDistLockManager()->lock(
         opCtx, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);
     if (!scopedDistLock.isOK()) {
         errmsg = str::stream() << "could not acquire collection lock for " << nss.toString()
-                               << " to split chunk [" << redact(min) << "," << redact(max) << ") "
-                               << causedBy(redact(scopedDistLock.getStatus()));
-        warning() << errmsg;
-        return scopedDistLock.getStatus();
-    }
-
-    // Always check our version remotely
-    ChunkVersion shardVersion;
-    Status refreshStatus = shardingState->refreshMetadataNow(opCtx, nss, &shardVersion);
-
-    if (!refreshStatus.isOK()) {
-        errmsg = str::stream() << "splitChunk cannot split chunk "
-                               << "[" << redact(min) << "," << redact(max) << ") "
-                               << causedBy(redact(refreshStatus));
-
-        warning() << errmsg;
-        return refreshStatus;
-    }
-
-    if (shardVersion.majorVersion() == 0) {
-        // It makes no sense to split if our version is zero and we have no chunks
-        errmsg = str::stream() << "splitChunk cannot split chunk "
-                               << "[" << redact(min) << "," << redact(max) << ") "
-                               << " with zero shard version";
-
-        warning() << errmsg;
-        return {ErrorCodes::CannotSplit, errmsg};
-    }
-
-    // Even though the splitChunk command transmits a value in the operation's shardVersion
-    // field, this value does not actually contain the shard version, but the global collection
-    // version.
-    if (expectedCollectionEpoch != shardVersion.epoch()) {
-        std::string msg = str::stream() << "splitChunk cannot split chunk "
-                                        << "[" << redact(min) << "," << redact(max) << "), "
-                                        << "collection '" << nss.ns() << "' may have been dropped. "
-                                        << "current epoch: " << shardVersion.epoch()
-                                        << ", cmd epoch: " << expectedCollectionEpoch;
-        warning() << msg;
-        return {ErrorCodes::StaleEpoch, msg};
+                               << " to split chunk " << chunkRange.toString() << " "
+                               << causedBy(scopedDistLock.getStatus());
+        return {scopedDistLock.getStatus().code(), errmsg};
     }
 
-    ScopedCollectionMetadata collMetadata;
-    {
-        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
-
-        // Get collection metadata
-        collMetadata = CollectionShardingState::get(opCtx, nss.ns())->getMetadata();
-    }
-
-    // With nonzero shard version, we must have metadata
-    invariant(collMetadata);
-
-    KeyPattern shardKeyPattern(collMetadata->getKeyPattern());
-
-    // If the shard uses a hashed key, then we must make sure that the split point is of
-    // type NumberLong.
-    if (KeyPattern::isHashedKeyPattern(shardKeyPattern.toBSON())) {
+    // If the shard key is hashed, then we must make sure that the split points are of type
+    // NumberLong.
+    if (KeyPattern::isHashedKeyPattern(keyPatternObj)) {
         for (BSONObj splitKey : splitKeys) {
             BSONObjIterator it(splitKey);
             while (it.more()) {
                 BSONElement splitKeyElement = it.next();
                 if (splitKeyElement.type() != NumberLong) {
-                    errmsg = str::stream() << "splitChunk cannot split chunk [" << redact(min)
-                                           << "," << redact(max) << "), split point "
+                    errmsg = str::stream() << "splitChunk cannot split chunk "
+                                           << chunkRange.toString() << ", split point "
                                            << splitKeyElement.toString()
                                            << " must be of type "
                                              "NumberLong for hashed shard key patterns";
-                    warning() << errmsg;
                     return {ErrorCodes::CannotSplit, errmsg};
                 }
             }
         }
     }
 
-    ChunkVersion collVersion = collMetadata->getCollVersion();
-    // With nonzero shard version, we must have a coll version >= our shard version
-    invariant(collVersion >= shardVersion);
-
-    {
-        ChunkType chunkToMove;
-        chunkToMove.setMin(min);
-        chunkToMove.setMax(max);
-        uassertStatusOK(collMetadata->checkChunkIsValid(chunkToMove));
-    }
-
     // Commit the split to the config server.
     auto request =
         SplitChunkRequest(nss, shardName, expectedCollectionEpoch, chunkRange, splitKeys);
@@ -248,22 +183,6 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
         configCmdObj,
         Shard::RetryPolicy::kIdempotent);
 
-    //
-    // Refresh chunk metadata regardless of whether or not the split succeeded
-    //
-    {
-        ChunkVersion unusedShardVersion;
-        refreshStatus = shardingState->refreshMetadataNow(opCtx, nss, &unusedShardVersion);
-
-        if (!refreshStatus.isOK()) {
-            errmsg = str::stream() << "failed to refresh metadata for split chunk [" << redact(min)
-                                   << "," << redact(max) << ") " << causedBy(redact(refreshStatus));
-
-            warning() << errmsg;
-            return refreshStatus;
-        }
-    }
-
     // If we failed to get any response from the config server at all, despite retries, then we
     // should just go ahead and fail the whole operation.
     if (!cmdResponseStatus.isOK()) {
@@ -276,31 +195,41 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
 
     // Send stale epoch if epoch of request did not match epoch of collection
     if (commandStatus == ErrorCodes::StaleEpoch) {
-        std::string msg = str::stream() << "splitChunk cannot split chunk "
-                                        << "[" << redact(min) << "," << redact(max) << "), "
-                                        << "collection '" << nss.ns() << "' may have been dropped. "
-                                        << "current epoch: " << collVersion.epoch()
-                                        << ", cmd epoch: " << expectedCollectionEpoch;
-        warning() << msg;
-
-        return {commandStatus.code(), str::stream() << msg << redact(causedBy(commandStatus))};
+        return commandStatus;
     }
 
     //
-    // If _configsvrCommitChunkSplit returned an error, look at this shard's metadata to
+    // If _configsvrCommitChunkSplit returned an error, refresh and look at the metadata to
     // determine if the split actually did happen. This can happen if there's a network error
     // getting the response from the first call to _configsvrCommitChunkSplit, but it actually
     // succeeds, thus the automatic retry fails with a precondition violation, for example.
     //
-    if ((!commandStatus.isOK() || !writeConcernStatus.isOK()) &&
-        checkMetadataForSuccessfulSplitChunk(opCtx, nss, chunkRange, splitKeys)) {
+    if (!commandStatus.isOK() || !writeConcernStatus.isOK()) {
+        {
+            ChunkVersion unusedShardVersion;
+            Status refreshStatus =
+                shardingState->refreshMetadataNow(opCtx, nss, &unusedShardVersion);
+
+            if (!refreshStatus.isOK()) {
+                Status errorStatus = commandStatus.isOK() ? writeConcernStatus : commandStatus;
+                errmsg = str::stream()
+                    << "splitChunk failed for chunk " << chunkRange.toString() << ", collection '"
+                    << nss.ns() << "' due to " << errorStatus.toString()
+                    << ". Attempt to verify if the commit succeeded anyway failed due to: "
+                    << refreshStatus.toString();
+
+                warning() << redact(errmsg);
+                return {errorStatus.code(), errmsg};
+            }
+        }
 
-        LOG(1) << "splitChunk [" << redact(min) << "," << redact(max)
-               << ") has already been committed.";
-    } else if (!commandStatus.isOK()) {
-        return commandStatus;
-    } else if (!writeConcernStatus.isOK()) {
-        return writeConcernStatus;
+        if (checkMetadataForSuccessfulSplitChunk(opCtx, nss, chunkRange, splitKeys)) {
+            // Split was committed.
+        } else if (!commandStatus.isOK()) {
+            return commandStatus;
+        } else if (!writeConcernStatus.isOK()) {
+            return writeConcernStatus;
+        }
     }
 
     AutoGetCollection autoColl(opCtx, nss, MODE_IS);
@@ -322,12 +251,13 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
 
     auto backChunk = ChunkType();
     backChunk.setMin(splitKeys.back());
-    backChunk.setMax(max);
+    backChunk.setMax(chunkRange.getMax());
 
     auto frontChunk = ChunkType();
-    frontChunk.setMin(min);
+    frontChunk.setMin(chunkRange.getMin());
     frontChunk.setMax(splitKeys.front());
 
+    KeyPattern shardKeyPattern(keyPatternObj);
     if (shardKeyPattern.globalMax().woCompare(backChunk.getMax()) == 0 &&
         checkIfSingleDoc(opCtx, collection, idx, &backChunk)) {
         return boost::optional<ChunkRange>(ChunkRange(backChunk.getMin(), backChunk.getMax()));
diff --git a/src/mongo/s/catalog/sharding_catalog_client.h b/src/mongo/s/catalog/sharding_catalog_client.h
index 1285ca1f88b..0cc0181f072 100644
--- a/src/mongo/s/catalog/sharding_catalog_client.h
+++ b/src/mongo/s/catalog/sharding_catalog_client.h
@@ -240,7 +240,7 @@ public:
 
     /**
      * Applies oplog entries to the config servers.
-     * Used by mergeChunk, splitChunk, and moveChunk commands.
+     * Used by mergeChunk and splitChunk commands.
      *
      * @param updateOps: documents to write to the chunks collection.
     * @param preCondition: preconditions for applying documents.
diff --git a/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp b/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
index 6dbe0902cd0..3b5464fa63b 100644
--- a/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
+++ b/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
@@ -229,7 +229,7 @@ BSONObj makeCommitChunkApplyOpsCommand(const NamespaceString& nss,
 }  // namespace
 
 Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
-                                                const NamespaceString& ns,
+                                                const NamespaceString& nss,
                                                 const OID& requestEpoch,
                                                 const ChunkRange& range,
                                                 const std::vector<BSONObj>& splitPoints,
@@ -240,13 +240,15 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
     // move chunks on different collections to proceed in parallel
     Lock::ExclusiveLock lk(opCtx->lockState(), _kChunkOpLock);
 
-    // Get the chunk with highest version for this namespace
+    std::string errmsg;
+
+    // Get the max chunk version for this namespace.
     auto findStatus = Grid::get(opCtx)->shardRegistry()->getConfigShard()->exhaustiveFindOnConfig(
         opCtx,
         ReadPreferenceSetting{ReadPreference::PrimaryOnly},
         repl::ReadConcernLevel::kLocalReadConcern,
         NamespaceString(ChunkType::ConfigNS),
-        BSON("ns" << ns.ns()),
+        BSON("ns" << nss.ns()),
         BSON(ChunkType::lastmod << -1),
         1);
@@ -255,17 +257,22 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
     }
 
     const auto& chunksVector = findStatus.getValue().docs;
-    if (chunksVector.empty())
-        return {ErrorCodes::IllegalOperation,
-                "collection does not exist, isn't sharded, or has no chunks"};
+    if (chunksVector.empty()) {
+        errmsg = str::stream() << "splitChunk cannot split chunk " << range.toString()
+                               << ". Collection '" << nss.ns()
+                               << "' no longer either exists, is sharded, or has chunks";
+        return {ErrorCodes::IllegalOperation, errmsg};
+    }
 
     ChunkVersion collVersion = ChunkVersion::fromBSON(chunksVector.front(), ChunkType::lastmod());
 
-    // Return an error if epoch of chunk does not match epoch of request
+    // Return an error if collection epoch does not match epoch of request.
     if (collVersion.epoch() != requestEpoch) {
-        return {ErrorCodes::StaleEpoch,
-                "epoch of chunk does not match epoch of request. This most likely means "
-                "that the collection was dropped and re-created."};
+        errmsg = str::stream() << "splitChunk cannot split chunk " << range.toString()
+                               << ". Collection '" << nss.ns() << "' was dropped and re-created."
+                               << " Current epoch: " << collVersion.epoch()
+                               << ", cmd epoch: " << requestEpoch;
+        return {ErrorCodes::StaleEpoch, errmsg};
     }
 
     std::vector<ChunkType> newChunks;
@@ -323,9 +330,9 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
             // add the modified (new) chunk information as the update object
            BSONObjBuilder n(op.subobjStart("o"));
-            n.append(ChunkType::name(), ChunkType::genID(ns.ns(), startKey));
+            n.append(ChunkType::name(), ChunkType::genID(nss.ns(), startKey));
             currentMaxVersion.addToBSON(n, ChunkType::lastmod());
-            n.append(ChunkType::ns(), ns.ns());
+            n.append(ChunkType::ns(), nss.ns());
             n.append(ChunkType::min(), startKey);
             n.append(ChunkType::max(), endKey);
             n.append(ChunkType::shard(), shardName);
@@ -333,7 +340,7 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
             // add the chunk's _id as the query part of the update statement
             BSONObjBuilder q(op.subobjStart("o2"));
-            q.append(ChunkType::name(), ChunkType::genID(ns.ns(), startKey));
+            q.append(ChunkType::name(), ChunkType::genID(nss.ns(), startKey));
             q.done();
 
             updates.append(op.obj());
@@ -354,9 +361,9 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
         BSONObjBuilder b;
         b.append("ns", ChunkType::ConfigNS);
         b.append("q",
-                 BSON("query" << BSON(ChunkType::ns(ns.ns()) << ChunkType::min() << range.getMin()
-                                                             << ChunkType::max()
-                                                             << range.getMax())
+                 BSON("query" << BSON(ChunkType::ns(nss.ns()) << ChunkType::min() << range.getMin()
+                                                              << ChunkType::max()
+                                                              << range.getMax())
                               << "orderby"
                               << BSON(ChunkType::lastmod() << -1)));
         {
@@ -367,12 +374,12 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
         preCond.append(b.obj());
     }
 
-    // apply the batch of updates to remote and local metadata
+    // apply the batch of updates to local metadata.
     Status applyOpsStatus = Grid::get(opCtx)->catalogClient()->applyChunkOpsDeprecated(
         opCtx,
         updates.arr(),
         preCond.arr(),
-        ns.ns(),
+        nss.ns(),
         currentMaxVersion,
         WriteConcernOptions(),
         repl::ReadConcernLevel::kLocalReadConcern);
@@ -395,7 +402,7 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
         Grid::get(opCtx)
             ->catalogClient()
-            ->logChange(opCtx, "split", ns.ns(), logDetail.obj(), WriteConcernOptions())
+            ->logChange(opCtx, "split", nss.ns(), logDetail.obj(), WriteConcernOptions())
             .transitional_ignore();
     } else {
         BSONObj beforeDetailObj = logDetail.obj();
@@ -411,7 +418,8 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
             Grid::get(opCtx)
                 ->catalogClient()
-                ->logChange(opCtx, "multi-split", ns.ns(), chunkDetail.obj(), WriteConcernOptions())
+                ->logChange(
+                    opCtx, "multi-split", nss.ns(), chunkDetail.obj(), WriteConcernOptions())
                 .transitional_ignore();
         }
     }
@@ -495,7 +503,7 @@ Status ShardingCatalogManager::commitChunkMerge(OperationContext* opCtx,
     auto updates = buildMergeChunksApplyOpsUpdates(chunksToMerge, mergeVersion);
     auto preCond = buildMergeChunksApplyOpsPrecond(chunksToMerge, collVersion);
 
-    // apply the batch of updates to remote and local metadata
+    // apply the batch of updates to local metadata
     Status applyOpsStatus = Grid::get(opCtx)->catalogClient()->applyChunkOpsDeprecated(
         opCtx,
         updates,
diff --git a/src/mongo/s/commands/cluster_split_cmd.cpp b/src/mongo/s/commands/cluster_split_cmd.cpp
index 61c2ff04bfd..5c7a4f469fb 100644
--- a/src/mongo/s/commands/cluster_split_cmd.cpp
+++ b/src/mongo/s/commands/cluster_split_cmd.cpp
@@ -272,6 +272,8 @@ public:
                                                   ChunkRange(chunk->getMin(), chunk->getMax()),
                                                   {splitPoint}));
 
+        // This invalidation is only necessary so that auto-split can begin to track statistics for
+        // the chunks produced after the split instead of the single original chunk.
         Grid::get(opCtx)->catalogCache()->onStaleConfigError(std::move(routingInfo));
 
         return true;
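The jstests changes above capture the behavioral shift this patch makes: a committed split no longer refreshes the shard's routing table, so anything that wants the shard's own view of the new chunks must trigger a refresh explicitly. A minimal sketch of that pattern (not part of the patch), reusing the st, admin, shardAdmin, and coll names from the tests above:

// Sketch only: split via mongos, force the shard to refresh its routing table,
// then read the shard's view of the chunks. Variable names follow the jstests above.
assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));
// Without this refresh the shard would still report the pre-split metadata,
// because committing a split no longer refreshes the shard automatically.
assert.commandWorked(
    st.shard0.getDB('admin').runCommand({forceRoutingTableRefresh: coll + ""}));
var result = shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true});
assert.commandWorked(result);
assert.eq(2, result.metadata.chunks.length);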