author     Dianna Hohensee <dianna.hohensee@10gen.com>    2017-11-15 17:58:58 -0500
committer  Dianna Hohensee <dianna.hohensee@10gen.com>    2017-11-28 13:32:07 -0500
commit     c9a30dfc4dcf383131ae059b154968a302fbe17c (patch)
tree       01a1d6dd7e480e5fb51d4a2ef8d3998f2ba6cbcf
parent     85d6d1457bcf5cc2aaf650b5ba5d856a50461b52 (diff)
SERVER-27724 Minimize split chunk command routing table refreshes
-rw-r--r--   jstests/sharding/dump_coll_metadata.js                               10
-rw-r--r--   jstests/sharding/migration_failure.js                                 5
-rw-r--r--   src/mongo/db/s/split_chunk.cpp                                       148
-rw-r--r--   src/mongo/s/catalog/sharding_catalog_client.h                         2
-rw-r--r--   src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp    50
-rw-r--r--   src/mongo/s/commands/cluster_split_cmd.cpp                            2
6 files changed, 81 insertions, 136 deletions
diff --git a/jstests/sharding/dump_coll_metadata.js b/jstests/sharding/dump_coll_metadata.js
index dbce60e1290..9b38a3c416c 100644
--- a/jstests/sharding/dump_coll_metadata.js
+++ b/jstests/sharding/dump_coll_metadata.js
@@ -33,14 +33,18 @@
assert(shardAdmin.runCommand({getShardVersion: coll + "xyz", fullMetadata: true}).metadata !=
undefined);
- // Make sure we get multiple chunks after a split
- assert(admin.runCommand({split: coll + "", middle: {_id: 0}}).ok);
+ // Make sure we get multiple chunks after a split and refresh -- splits by themselves do not
+ // cause the shard to refresh.
+ assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));
+ assert.commandWorked(
+ st.shard0.getDB('admin').runCommand({forceRoutingTableRefresh: coll + ""}));
- assert(shardAdmin.runCommand({getShardVersion: coll + ""}).ok);
+ assert.commandWorked(shardAdmin.runCommand({getShardVersion: coll + ""}));
printjson(shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true}));
// Make sure we have chunks info
result = shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true});
+ assert.commandWorked(result);
metadata = result.metadata;
assert.eq(metadata.chunks.length, 2);
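
The updated test captures the new contract introduced by this commit: committing a split on the config server no longer refreshes the shard's cached routing table, so the shard must be refreshed explicitly before getShardVersion reports the new chunks. A minimal shell-style sketch of that sequence, reusing the st, admin, shardAdmin, and coll names from the test above:

    // The split commits on the config server, but the shard's in-memory
    // metadata is intentionally left stale.
    assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));

    // Force the shard to refresh (as the updated test does) so the two
    // post-split chunks become visible through getShardVersion.
    assert.commandWorked(
        st.shard0.getDB("admin").runCommand({forceRoutingTableRefresh: coll + ""}));

    var result = shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true});
    assert.commandWorked(result);
    assert.eq(2, result.metadata.chunks.length);  // both chunks now in the shard's cache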
diff --git a/jstests/sharding/migration_failure.js b/jstests/sharding/migration_failure.js
index f49a48a0169..7e3ba438262 100644
--- a/jstests/sharding/migration_failure.js
+++ b/jstests/sharding/migration_failure.js
@@ -40,9 +40,10 @@
assert.eq(oldVersion.t,
newVersion.t,
"The shard version major value should not change after a failed migration");
- assert.eq(oldVersion.i,
+ // Split does not cause a shard routing table refresh, but the moveChunk attempt will.
+ assert.eq(2,
newVersion.i,
- "The shard version minor value should not change after a failed migration");
+ "The shard routing table should refresh on a failed migration and show the split");
assert.commandWorked(st.shard0.getDB("admin").runCommand(
{configureFailPoint: 'failMigrationCommit', mode: 'off'}));
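
The assertion change reflects the same contract from the other side: the split itself leaves the shard's routing table stale, and it is the subsequent (failed) moveChunk that triggers the refresh exposing the split. A rough sketch of the sequence this test exercises; the failpoint activation and moveChunk call are assumptions about the surrounding test, not shown in the hunk above:

    // Make the migration fail at commit time (assumed setup, mirroring the
    // failpoint that the hunk above later turns off).
    assert.commandWorked(st.shard0.getDB("admin").runCommand(
        {configureFailPoint: 'failMigrationCommit', mode: 'alwaysOn'}));

    // The split commits on the config server without refreshing the shard.
    assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));

    // The migration fails, but attempting it forces the shard to refresh its
    // routing table, so the refreshed metadata now reflects the split: the
    // shard version minor value observed by the test becomes 2.
    assert.commandFailed(admin.runCommand(
        {moveChunk: coll + "", find: {_id: 0}, to: st.shard1.shardName}));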
diff --git a/src/mongo/db/s/split_chunk.cpp b/src/mongo/db/s/split_chunk.cpp
index 28cdab7fd83..3dea2009f94 100644
--- a/src/mongo/db/s/split_chunk.cpp
+++ b/src/mongo/db/s/split_chunk.cpp
@@ -134,105 +134,40 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
ShardingState* shardingState = ShardingState::get(opCtx);
std::string errmsg;
- const BSONObj min = chunkRange.getMin();
- const BSONObj max = chunkRange.getMax();
-
//
// Lock the collection's metadata and get highest version for the current shard
// TODO(SERVER-25086): Remove distLock acquisition from split chunk
//
const std::string whyMessage(
- str::stream() << "splitting chunk [" << min << ", " << max << ") in " << nss.toString());
+ str::stream() << "splitting chunk " << chunkRange.toString() << " in " << nss.toString());
auto scopedDistLock = Grid::get(opCtx)->catalogClient()->getDistLockManager()->lock(
opCtx, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);
if (!scopedDistLock.isOK()) {
errmsg = str::stream() << "could not acquire collection lock for " << nss.toString()
- << " to split chunk [" << redact(min) << "," << redact(max) << ") "
- << causedBy(redact(scopedDistLock.getStatus()));
- warning() << errmsg;
- return scopedDistLock.getStatus();
- }
-
- // Always check our version remotely
- ChunkVersion shardVersion;
- Status refreshStatus = shardingState->refreshMetadataNow(opCtx, nss, &shardVersion);
-
- if (!refreshStatus.isOK()) {
- errmsg = str::stream() << "splitChunk cannot split chunk "
- << "[" << redact(min) << "," << redact(max) << ") "
- << causedBy(redact(refreshStatus));
-
- warning() << errmsg;
- return refreshStatus;
- }
-
- if (shardVersion.majorVersion() == 0) {
- // It makes no sense to split if our version is zero and we have no chunks
- errmsg = str::stream() << "splitChunk cannot split chunk "
- << "[" << redact(min) << "," << redact(max) << ") "
- << " with zero shard version";
-
- warning() << errmsg;
- return {ErrorCodes::CannotSplit, errmsg};
- }
-
- // Even though the splitChunk command transmits a value in the operation's shardVersion
- // field, this value does not actually contain the shard version, but the global collection
- // version.
- if (expectedCollectionEpoch != shardVersion.epoch()) {
- std::string msg = str::stream() << "splitChunk cannot split chunk "
- << "[" << redact(min) << "," << redact(max) << "), "
- << "collection '" << nss.ns() << "' may have been dropped. "
- << "current epoch: " << shardVersion.epoch()
- << ", cmd epoch: " << expectedCollectionEpoch;
- warning() << msg;
- return {ErrorCodes::StaleEpoch, msg};
+ << " to split chunk " << chunkRange.toString() << " "
+ << causedBy(scopedDistLock.getStatus());
+ return {scopedDistLock.getStatus().code(), errmsg};
}
- ScopedCollectionMetadata collMetadata;
- {
- AutoGetCollection autoColl(opCtx, nss, MODE_IS);
-
- // Get collection metadata
- collMetadata = CollectionShardingState::get(opCtx, nss.ns())->getMetadata();
- }
-
- // With nonzero shard version, we must have metadata
- invariant(collMetadata);
-
- KeyPattern shardKeyPattern(collMetadata->getKeyPattern());
-
- // If the shard uses a hashed key, then we must make sure that the split point is of
- // type NumberLong.
- if (KeyPattern::isHashedKeyPattern(shardKeyPattern.toBSON())) {
+ // If the shard key is hashed, then we must make sure that the split points are of type
+ // NumberLong.
+ if (KeyPattern::isHashedKeyPattern(keyPatternObj)) {
for (BSONObj splitKey : splitKeys) {
BSONObjIterator it(splitKey);
while (it.more()) {
BSONElement splitKeyElement = it.next();
if (splitKeyElement.type() != NumberLong) {
- errmsg = str::stream() << "splitChunk cannot split chunk [" << redact(min)
- << "," << redact(max) << "), split point "
+ errmsg = str::stream() << "splitChunk cannot split chunk "
+ << chunkRange.toString() << ", split point "
<< splitKeyElement.toString()
<< " must be of type "
"NumberLong for hashed shard key patterns";
- warning() << errmsg;
return {ErrorCodes::CannotSplit, errmsg};
}
}
}
}
- ChunkVersion collVersion = collMetadata->getCollVersion();
- // With nonzero shard version, we must have a coll version >= our shard version
- invariant(collVersion >= shardVersion);
-
- {
- ChunkType chunkToMove;
- chunkToMove.setMin(min);
- chunkToMove.setMax(max);
- uassertStatusOK(collMetadata->checkChunkIsValid(chunkToMove));
- }
-
// Commit the split to the config server.
auto request =
SplitChunkRequest(nss, shardName, expectedCollectionEpoch, chunkRange, splitKeys);
@@ -248,22 +183,6 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
configCmdObj,
Shard::RetryPolicy::kIdempotent);
- //
- // Refresh chunk metadata regardless of whether or not the split succeeded
- //
- {
- ChunkVersion unusedShardVersion;
- refreshStatus = shardingState->refreshMetadataNow(opCtx, nss, &unusedShardVersion);
-
- if (!refreshStatus.isOK()) {
- errmsg = str::stream() << "failed to refresh metadata for split chunk [" << redact(min)
- << "," << redact(max) << ") " << causedBy(redact(refreshStatus));
-
- warning() << errmsg;
- return refreshStatus;
- }
- }
-
// If we failed to get any response from the config server at all, despite retries, then we
// should just go ahead and fail the whole operation.
if (!cmdResponseStatus.isOK()) {
@@ -276,31 +195,41 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
// Send stale epoch if epoch of request did not match epoch of collection
if (commandStatus == ErrorCodes::StaleEpoch) {
- std::string msg = str::stream() << "splitChunk cannot split chunk "
- << "[" << redact(min) << "," << redact(max) << "), "
- << "collection '" << nss.ns() << "' may have been dropped. "
- << "current epoch: " << collVersion.epoch()
- << ", cmd epoch: " << expectedCollectionEpoch;
- warning() << msg;
-
- return {commandStatus.code(), str::stream() << msg << redact(causedBy(commandStatus))};
+ return commandStatus;
}
//
- // If _configsvrCommitChunkSplit returned an error, look at this shard's metadata to
+ // If _configsvrCommitChunkSplit returned an error, refresh and look at the metadata to
// determine if the split actually did happen. This can happen if there's a network error
// getting the response from the first call to _configsvrCommitChunkSplit, but it actually
// succeeds, thus the automatic retry fails with a precondition violation, for example.
//
- if ((!commandStatus.isOK() || !writeConcernStatus.isOK()) &&
- checkMetadataForSuccessfulSplitChunk(opCtx, nss, chunkRange, splitKeys)) {
+ if (!commandStatus.isOK() || !writeConcernStatus.isOK()) {
+ {
+ ChunkVersion unusedShardVersion;
+ Status refreshStatus =
+ shardingState->refreshMetadataNow(opCtx, nss, &unusedShardVersion);
+
+ if (!refreshStatus.isOK()) {
+ Status errorStatus = commandStatus.isOK() ? writeConcernStatus : commandStatus;
+ errmsg = str::stream()
+ << "splitChunk failed for chunk " << chunkRange.toString() << ", collection '"
+ << nss.ns() << "' due to " << errorStatus.toString()
+ << ". Attempt to verify if the commit succeeded anyway failed due to: "
+ << refreshStatus.toString();
+
+ warning() << redact(errmsg);
+ return {errorStatus.code(), errmsg};
+ }
+ }
- LOG(1) << "splitChunk [" << redact(min) << "," << redact(max)
- << ") has already been committed.";
- } else if (!commandStatus.isOK()) {
- return commandStatus;
- } else if (!writeConcernStatus.isOK()) {
- return writeConcernStatus;
+ if (checkMetadataForSuccessfulSplitChunk(opCtx, nss, chunkRange, splitKeys)) {
+ // Split was committed.
+ } else if (!commandStatus.isOK()) {
+ return commandStatus;
+ } else if (!writeConcernStatus.isOK()) {
+ return writeConcernStatus;
+ }
}
AutoGetCollection autoColl(opCtx, nss, MODE_IS);
@@ -322,12 +251,13 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
auto backChunk = ChunkType();
backChunk.setMin(splitKeys.back());
- backChunk.setMax(max);
+ backChunk.setMax(chunkRange.getMax());
auto frontChunk = ChunkType();
- frontChunk.setMin(min);
+ frontChunk.setMin(chunkRange.getMin());
frontChunk.setMax(splitKeys.front());
+ KeyPattern shardKeyPattern(keyPatternObj);
if (shardKeyPattern.globalMax().woCompare(backChunk.getMax()) == 0 &&
checkIfSingleDoc(opCtx, collection, idx, &backChunk)) {
return boost::optional<ChunkRange>(ChunkRange(backChunk.getMin(), backChunk.getMax()));
diff --git a/src/mongo/s/catalog/sharding_catalog_client.h b/src/mongo/s/catalog/sharding_catalog_client.h
index 1285ca1f88b..0cc0181f072 100644
--- a/src/mongo/s/catalog/sharding_catalog_client.h
+++ b/src/mongo/s/catalog/sharding_catalog_client.h
@@ -240,7 +240,7 @@ public:
/**
* Applies oplog entries to the config servers.
- * Used by mergeChunk, splitChunk, and moveChunk commands.
+ * Used by mergeChunk and splitChunk commands.
*
* @param updateOps: documents to write to the chunks collection.
* @param preCondition: preconditions for applying documents.
diff --git a/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp b/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
index 6dbe0902cd0..3b5464fa63b 100644
--- a/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
+++ b/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
@@ -229,7 +229,7 @@ BSONObj makeCommitChunkApplyOpsCommand(const NamespaceString& nss,
} // namespace
Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
- const NamespaceString& ns,
+ const NamespaceString& nss,
const OID& requestEpoch,
const ChunkRange& range,
const std::vector<BSONObj>& splitPoints,
@@ -240,13 +240,15 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
// move chunks on different collections to proceed in parallel
Lock::ExclusiveLock lk(opCtx->lockState(), _kChunkOpLock);
- // Get the chunk with highest version for this namespace
+ std::string errmsg;
+
+ // Get the max chunk version for this namespace.
auto findStatus = Grid::get(opCtx)->shardRegistry()->getConfigShard()->exhaustiveFindOnConfig(
opCtx,
ReadPreferenceSetting{ReadPreference::PrimaryOnly},
repl::ReadConcernLevel::kLocalReadConcern,
NamespaceString(ChunkType::ConfigNS),
- BSON("ns" << ns.ns()),
+ BSON("ns" << nss.ns()),
BSON(ChunkType::lastmod << -1),
1);
@@ -255,17 +257,22 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
}
const auto& chunksVector = findStatus.getValue().docs;
- if (chunksVector.empty())
- return {ErrorCodes::IllegalOperation,
- "collection does not exist, isn't sharded, or has no chunks"};
+ if (chunksVector.empty()) {
+ errmsg = str::stream() << "splitChunk cannot split chunk " << range.toString()
+ << ". Collection '" << nss.ns()
+ << "' no longer either exists, is sharded, or has chunks";
+ return {ErrorCodes::IllegalOperation, errmsg};
+ }
ChunkVersion collVersion = ChunkVersion::fromBSON(chunksVector.front(), ChunkType::lastmod());
- // Return an error if epoch of chunk does not match epoch of request
+ // Return an error if collection epoch does not match epoch of request.
if (collVersion.epoch() != requestEpoch) {
- return {ErrorCodes::StaleEpoch,
- "epoch of chunk does not match epoch of request. This most likely means "
- "that the collection was dropped and re-created."};
+ errmsg = str::stream() << "splitChunk cannot split chunk " << range.toString()
+ << ". Collection '" << nss.ns() << "' was dropped and re-created."
+ << " Current epoch: " << collVersion.epoch()
+ << ", cmd epoch: " << requestEpoch;
+ return {ErrorCodes::StaleEpoch, errmsg};
}
std::vector<ChunkType> newChunks;
@@ -323,9 +330,9 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
// add the modified (new) chunk information as the update object
BSONObjBuilder n(op.subobjStart("o"));
- n.append(ChunkType::name(), ChunkType::genID(ns.ns(), startKey));
+ n.append(ChunkType::name(), ChunkType::genID(nss.ns(), startKey));
currentMaxVersion.addToBSON(n, ChunkType::lastmod());
- n.append(ChunkType::ns(), ns.ns());
+ n.append(ChunkType::ns(), nss.ns());
n.append(ChunkType::min(), startKey);
n.append(ChunkType::max(), endKey);
n.append(ChunkType::shard(), shardName);
@@ -333,7 +340,7 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
// add the chunk's _id as the query part of the update statement
BSONObjBuilder q(op.subobjStart("o2"));
- q.append(ChunkType::name(), ChunkType::genID(ns.ns(), startKey));
+ q.append(ChunkType::name(), ChunkType::genID(nss.ns(), startKey));
q.done();
updates.append(op.obj());
@@ -354,9 +361,9 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
BSONObjBuilder b;
b.append("ns", ChunkType::ConfigNS);
b.append("q",
- BSON("query" << BSON(ChunkType::ns(ns.ns()) << ChunkType::min() << range.getMin()
- << ChunkType::max()
- << range.getMax())
+ BSON("query" << BSON(ChunkType::ns(nss.ns()) << ChunkType::min() << range.getMin()
+ << ChunkType::max()
+ << range.getMax())
<< "orderby"
<< BSON(ChunkType::lastmod() << -1)));
{
@@ -367,12 +374,12 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
preCond.append(b.obj());
}
- // apply the batch of updates to remote and local metadata
+ // apply the batch of updates to local metadata.
Status applyOpsStatus = Grid::get(opCtx)->catalogClient()->applyChunkOpsDeprecated(
opCtx,
updates.arr(),
preCond.arr(),
- ns.ns(),
+ nss.ns(),
currentMaxVersion,
WriteConcernOptions(),
repl::ReadConcernLevel::kLocalReadConcern);
@@ -395,7 +402,7 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
Grid::get(opCtx)
->catalogClient()
- ->logChange(opCtx, "split", ns.ns(), logDetail.obj(), WriteConcernOptions())
+ ->logChange(opCtx, "split", nss.ns(), logDetail.obj(), WriteConcernOptions())
.transitional_ignore();
} else {
BSONObj beforeDetailObj = logDetail.obj();
@@ -411,7 +418,8 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
Grid::get(opCtx)
->catalogClient()
- ->logChange(opCtx, "multi-split", ns.ns(), chunkDetail.obj(), WriteConcernOptions())
+ ->logChange(
+ opCtx, "multi-split", nss.ns(), chunkDetail.obj(), WriteConcernOptions())
.transitional_ignore();
}
}
@@ -495,7 +503,7 @@ Status ShardingCatalogManager::commitChunkMerge(OperationContext* opCtx,
auto updates = buildMergeChunksApplyOpsUpdates(chunksToMerge, mergeVersion);
auto preCond = buildMergeChunksApplyOpsPrecond(chunksToMerge, collVersion);
- // apply the batch of updates to remote and local metadata
+ // apply the batch of updates to local metadata
Status applyOpsStatus = Grid::get(opCtx)->catalogClient()->applyChunkOpsDeprecated(
opCtx,
updates,
diff --git a/src/mongo/s/commands/cluster_split_cmd.cpp b/src/mongo/s/commands/cluster_split_cmd.cpp
index 61c2ff04bfd..5c7a4f469fb 100644
--- a/src/mongo/s/commands/cluster_split_cmd.cpp
+++ b/src/mongo/s/commands/cluster_split_cmd.cpp
@@ -272,6 +272,8 @@ public:
ChunkRange(chunk->getMin(), chunk->getMax()),
{splitPoint}));
+ // This invalidation is only necessary so that auto-split can begin to track statistics for
+ // the chunks produced after the split instead of the single original chunk.
Grid::get(opCtx)->catalogCache()->onStaleConfigError(std::move(routingInfo));
return true;