-rw-r--r--   jstests/sharding/dump_coll_metadata.js                             |  10
-rw-r--r--   jstests/sharding/migration_failure.js                              |   5
-rw-r--r--   src/mongo/db/s/split_chunk.cpp                                     | 148
-rw-r--r--   src/mongo/s/catalog/sharding_catalog_client.h                      |   2
-rw-r--r--   src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp |  50
-rw-r--r--   src/mongo/s/commands/cluster_split_cmd.cpp                         |   2
6 files changed, 81 insertions, 136 deletions
diff --git a/jstests/sharding/dump_coll_metadata.js b/jstests/sharding/dump_coll_metadata.js
index dbce60e1290..9b38a3c416c 100644
--- a/jstests/sharding/dump_coll_metadata.js
+++ b/jstests/sharding/dump_coll_metadata.js
@@ -33,14 +33,18 @@
     assert(shardAdmin.runCommand({getShardVersion: coll + "xyz", fullMetadata: true}).metadata !=
            undefined);
 
-    // Make sure we get multiple chunks after a split
-    assert(admin.runCommand({split: coll + "", middle: {_id: 0}}).ok);
+    // Make sure we get multiple chunks after a split and refresh -- splits by themselves do not
+    // cause the shard to refresh.
+    assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));
+    assert.commandWorked(
+        st.shard0.getDB('admin').runCommand({forceRoutingTableRefresh: coll + ""}));
 
-    assert(shardAdmin.runCommand({getShardVersion: coll + ""}).ok);
+    assert.commandWorked(shardAdmin.runCommand({getShardVersion: coll + ""}));
     printjson(shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true}));
 
     // Make sure we have chunks info
     result = shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true});
+    assert.commandWorked(result);
     metadata = result.metadata;
     assert.eq(metadata.chunks.length, 2);
diff --git a/jstests/sharding/migration_failure.js b/jstests/sharding/migration_failure.js
index f49a48a0169..7e3ba438262 100644
--- a/jstests/sharding/migration_failure.js
+++ b/jstests/sharding/migration_failure.js
@@ -40,9 +40,10 @@
     assert.eq(oldVersion.t,
               newVersion.t,
               "The shard version major value should not change after a failed migration");
-    assert.eq(oldVersion.i,
+    // Split does not cause a shard routing table refresh, but the moveChunk attempt will.
+    assert.eq(2,
               newVersion.i,
-              "The shard version minor value should not change after a failed migration");
+              "The shard routing table should refresh on a failed migration and show the split");
 
     assert.commandWorked(st.shard0.getDB("admin").runCommand(
         {configureFailPoint: 'failMigrationCommit', mode: 'off'}));
diff --git a/src/mongo/db/s/split_chunk.cpp b/src/mongo/db/s/split_chunk.cpp
index 28cdab7fd83..3dea2009f94 100644
--- a/src/mongo/db/s/split_chunk.cpp
+++ b/src/mongo/db/s/split_chunk.cpp
@@ -134,105 +134,40 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
     ShardingState* shardingState = ShardingState::get(opCtx);
     std::string errmsg;
 
-    const BSONObj min = chunkRange.getMin();
-    const BSONObj max = chunkRange.getMax();
-
     //
     // Lock the collection's metadata and get highest version for the current shard
     // TODO(SERVER-25086): Remove distLock acquisition from split chunk
     //
     const std::string whyMessage(
-        str::stream() << "splitting chunk [" << min << ", " << max << ") in " << nss.toString());
+        str::stream() << "splitting chunk " << chunkRange.toString() << " in " << nss.toString());
     auto scopedDistLock = Grid::get(opCtx)->catalogClient()->getDistLockManager()->lock(
         opCtx, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);
     if (!scopedDistLock.isOK()) {
         errmsg = str::stream() << "could not acquire collection lock for " << nss.toString()
-                               << " to split chunk [" << redact(min) << "," << redact(max) << ") "
-                               << causedBy(redact(scopedDistLock.getStatus()));
-        warning() << errmsg;
-        return scopedDistLock.getStatus();
-    }
-
-    // Always check our version remotely
-    ChunkVersion shardVersion;
-    Status refreshStatus = shardingState->refreshMetadataNow(opCtx, nss, &shardVersion);
-
-    if (!refreshStatus.isOK()) {
-        errmsg = str::stream() << "splitChunk cannot split chunk "
-                               << "[" << redact(min) << "," << redact(max) << ") "
-                               << causedBy(redact(refreshStatus));
-
-        warning() << errmsg;
-        return refreshStatus;
-    }
-
-    if (shardVersion.majorVersion() == 0) {
-        // It makes no sense to split if our version is zero and we have no chunks
-        errmsg = str::stream() << "splitChunk cannot split chunk "
-                               << "[" << redact(min) << "," << redact(max) << ") "
-                               << " with zero shard version";
-
-        warning() << errmsg;
-        return {ErrorCodes::CannotSplit, errmsg};
-    }
-
-    // Even though the splitChunk command transmits a value in the operation's shardVersion
-    // field, this value does not actually contain the shard version, but the global collection
-    // version.
-    if (expectedCollectionEpoch != shardVersion.epoch()) {
-        std::string msg = str::stream() << "splitChunk cannot split chunk "
-                                        << "[" << redact(min) << "," << redact(max) << "), "
-                                        << "collection '" << nss.ns() << "' may have been dropped. "
-                                        << "current epoch: " << shardVersion.epoch()
-                                        << ", cmd epoch: " << expectedCollectionEpoch;
-        warning() << msg;
-        return {ErrorCodes::StaleEpoch, msg};
+                               << " to split chunk " << chunkRange.toString() << " "
+                               << causedBy(scopedDistLock.getStatus());
+        return {scopedDistLock.getStatus().code(), errmsg};
     }
 
-    ScopedCollectionMetadata collMetadata;
-    {
-        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
-
-        // Get collection metadata
-        collMetadata = CollectionShardingState::get(opCtx, nss.ns())->getMetadata();
-    }
-
-    // With nonzero shard version, we must have metadata
-    invariant(collMetadata);
-
-    KeyPattern shardKeyPattern(collMetadata->getKeyPattern());
-
-    // If the shard uses a hashed key, then we must make sure that the split point is of
-    // type NumberLong.
-    if (KeyPattern::isHashedKeyPattern(shardKeyPattern.toBSON())) {
+    // If the shard key is hashed, then we must make sure that the split points are of type
+    // NumberLong.
+    if (KeyPattern::isHashedKeyPattern(keyPatternObj)) {
         for (BSONObj splitKey : splitKeys) {
             BSONObjIterator it(splitKey);
             while (it.more()) {
                 BSONElement splitKeyElement = it.next();
                 if (splitKeyElement.type() != NumberLong) {
-                    errmsg = str::stream() << "splitChunk cannot split chunk [" << redact(min)
-                                           << "," << redact(max) << "), split point "
+                    errmsg = str::stream() << "splitChunk cannot split chunk "
+                                           << chunkRange.toString() << ", split point "
                                            << splitKeyElement.toString()
                                            << " must be of type "
                                              "NumberLong for hashed shard key patterns";
-                    warning() << errmsg;
                     return {ErrorCodes::CannotSplit, errmsg};
                 }
             }
         }
     }
 
-    ChunkVersion collVersion = collMetadata->getCollVersion();
-    // With nonzero shard version, we must have a coll version >= our shard version
-    invariant(collVersion >= shardVersion);
-
-    {
-        ChunkType chunkToMove;
-        chunkToMove.setMin(min);
-        chunkToMove.setMax(max);
-        uassertStatusOK(collMetadata->checkChunkIsValid(chunkToMove));
-    }
-
     // Commit the split to the config server.
     auto request =
         SplitChunkRequest(nss, shardName, expectedCollectionEpoch, chunkRange, splitKeys);
@@ -248,22 +183,6 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
         configCmdObj,
         Shard::RetryPolicy::kIdempotent);
 
-    //
-    // Refresh chunk metadata regardless of whether or not the split succeeded
-    //
-    {
-        ChunkVersion unusedShardVersion;
-        refreshStatus = shardingState->refreshMetadataNow(opCtx, nss, &unusedShardVersion);
-
-        if (!refreshStatus.isOK()) {
-            errmsg = str::stream() << "failed to refresh metadata for split chunk [" << redact(min)
-                                   << "," << redact(max) << ") " << causedBy(redact(refreshStatus));
-
-            warning() << errmsg;
-            return refreshStatus;
-        }
-    }
-
     // If we failed to get any response from the config server at all, despite retries, then we
     // should just go ahead and fail the whole operation.
     if (!cmdResponseStatus.isOK()) {
@@ -276,31 +195,41 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
 
     // Send stale epoch if epoch of request did not match epoch of collection
     if (commandStatus == ErrorCodes::StaleEpoch) {
-        std::string msg = str::stream() << "splitChunk cannot split chunk "
-                                        << "[" << redact(min) << "," << redact(max) << "), "
-                                        << "collection '" << nss.ns() << "' may have been dropped. "
-                                        << "current epoch: " << collVersion.epoch()
-                                        << ", cmd epoch: " << expectedCollectionEpoch;
-        warning() << msg;
-
-        return {commandStatus.code(), str::stream() << msg << redact(causedBy(commandStatus))};
+        return commandStatus;
     }
 
     //
-    // If _configsvrCommitChunkSplit returned an error, look at this shard's metadata to
+    // If _configsvrCommitChunkSplit returned an error, refresh and look at the metadata to
     // determine if the split actually did happen. This can happen if there's a network error
     // getting the response from the first call to _configsvrCommitChunkSplit, but it actually
     // succeeds, thus the automatic retry fails with a precondition violation, for example.
     //
-    if ((!commandStatus.isOK() || !writeConcernStatus.isOK()) &&
-        checkMetadataForSuccessfulSplitChunk(opCtx, nss, chunkRange, splitKeys)) {
+    if (!commandStatus.isOK() || !writeConcernStatus.isOK()) {
+        {
+            ChunkVersion unusedShardVersion;
+            Status refreshStatus =
+                shardingState->refreshMetadataNow(opCtx, nss, &unusedShardVersion);
+
+            if (!refreshStatus.isOK()) {
+                Status errorStatus = commandStatus.isOK() ? writeConcernStatus : commandStatus;
+                errmsg = str::stream()
+                    << "splitChunk failed for chunk " << chunkRange.toString() << ", collection '"
+                    << nss.ns() << "' due to " << errorStatus.toString()
+                    << ". Attempt to verify if the commit succeeded anyway failed due to: "
+                    << refreshStatus.toString();
+
+                warning() << redact(errmsg);
+                return {errorStatus.code(), errmsg};
+            }
+        }
 
-        LOG(1) << "splitChunk [" << redact(min) << "," << redact(max)
-               << ") has already been committed.";
-    } else if (!commandStatus.isOK()) {
-        return commandStatus;
-    } else if (!writeConcernStatus.isOK()) {
-        return writeConcernStatus;
+        if (checkMetadataForSuccessfulSplitChunk(opCtx, nss, chunkRange, splitKeys)) {
+            // Split was committed.
+        } else if (!commandStatus.isOK()) {
+            return commandStatus;
+        } else if (!writeConcernStatus.isOK()) {
+            return writeConcernStatus;
+        }
     }
 
     AutoGetCollection autoColl(opCtx, nss, MODE_IS);
@@ -322,12 +251,13 @@ StatusWith<boost::optional<ChunkRange>> splitChunk(OperationContext* opCtx,
 
     auto backChunk = ChunkType();
     backChunk.setMin(splitKeys.back());
-    backChunk.setMax(max);
+    backChunk.setMax(chunkRange.getMax());
 
     auto frontChunk = ChunkType();
-    frontChunk.setMin(min);
+    frontChunk.setMin(chunkRange.getMin());
     frontChunk.setMax(splitKeys.front());
 
+    KeyPattern shardKeyPattern(keyPatternObj);
     if (shardKeyPattern.globalMax().woCompare(backChunk.getMax()) == 0 &&
         checkIfSingleDoc(opCtx, collection, idx, &backChunk)) {
         return boost::optional<ChunkRange>(ChunkRange(backChunk.getMin(), backChunk.getMax()));
diff --git a/src/mongo/s/catalog/sharding_catalog_client.h b/src/mongo/s/catalog/sharding_catalog_client.h
index 1285ca1f88b..0cc0181f072 100644
--- a/src/mongo/s/catalog/sharding_catalog_client.h
+++ b/src/mongo/s/catalog/sharding_catalog_client.h
@@ -240,7 +240,7 @@ public:
 
     /**
      * Applies oplog entries to the config servers.
-     * Used by mergeChunk, splitChunk, and moveChunk commands.
+     * Used by mergeChunk and splitChunk commands.
      *
      * @param updateOps: documents to write to the chunks collection.
     * @param preCondition: preconditions for applying documents.
diff --git a/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp b/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
index 6dbe0902cd0..3b5464fa63b 100644
--- a/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
+++ b/src/mongo/s/catalog/sharding_catalog_manager_chunk_operations.cpp
@@ -229,7 +229,7 @@ BSONObj makeCommitChunkApplyOpsCommand(const NamespaceString& nss,
 }  // namespace
 
 Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
-                                                const NamespaceString& ns,
+                                                const NamespaceString& nss,
                                                 const OID& requestEpoch,
                                                 const ChunkRange& range,
                                                 const std::vector<BSONObj>& splitPoints,
@@ -240,13 +240,15 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
     // move chunks on different collections to proceed in parallel
     Lock::ExclusiveLock lk(opCtx->lockState(), _kChunkOpLock);
 
-    // Get the chunk with highest version for this namespace
+    std::string errmsg;
+
+    // Get the max chunk version for this namespace.
     auto findStatus = Grid::get(opCtx)->shardRegistry()->getConfigShard()->exhaustiveFindOnConfig(
         opCtx,
         ReadPreferenceSetting{ReadPreference::PrimaryOnly},
         repl::ReadConcernLevel::kLocalReadConcern,
         NamespaceString(ChunkType::ConfigNS),
-        BSON("ns" << ns.ns()),
+        BSON("ns" << nss.ns()),
         BSON(ChunkType::lastmod << -1),
         1);
@@ -255,17 +257,22 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
     }
 
     const auto& chunksVector = findStatus.getValue().docs;
-    if (chunksVector.empty())
-        return {ErrorCodes::IllegalOperation,
-                "collection does not exist, isn't sharded, or has no chunks"};
+    if (chunksVector.empty()) {
+        errmsg = str::stream() << "splitChunk cannot split chunk " << range.toString()
+                               << ". Collection '" << nss.ns()
+                               << "' no longer either exists, is sharded, or has chunks";
+        return {ErrorCodes::IllegalOperation, errmsg};
+    }
 
     ChunkVersion collVersion = ChunkVersion::fromBSON(chunksVector.front(), ChunkType::lastmod());
 
-    // Return an error if epoch of chunk does not match epoch of request
+    // Return an error if collection epoch does not match epoch of request.
     if (collVersion.epoch() != requestEpoch) {
-        return {ErrorCodes::StaleEpoch,
-                "epoch of chunk does not match epoch of request. This most likely means "
-                "that the collection was dropped and re-created."};
+        errmsg = str::stream() << "splitChunk cannot split chunk " << range.toString()
+                               << ". Collection '" << nss.ns() << "' was dropped and re-created."
+                               << " Current epoch: " << collVersion.epoch()
+                               << ", cmd epoch: " << requestEpoch;
+        return {ErrorCodes::StaleEpoch, errmsg};
     }
 
     std::vector<ChunkType> newChunks;
@@ -323,9 +330,9 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
             // add the modified (new) chunk information as the update object
            BSONObjBuilder n(op.subobjStart("o"));
-            n.append(ChunkType::name(), ChunkType::genID(ns.ns(), startKey));
+            n.append(ChunkType::name(), ChunkType::genID(nss.ns(), startKey));
             currentMaxVersion.addToBSON(n, ChunkType::lastmod());
-            n.append(ChunkType::ns(), ns.ns());
+            n.append(ChunkType::ns(), nss.ns());
             n.append(ChunkType::min(), startKey);
             n.append(ChunkType::max(), endKey);
             n.append(ChunkType::shard(), shardName);
@@ -333,7 +340,7 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
             // add the chunk's _id as the query part of the update statement
             BSONObjBuilder q(op.subobjStart("o2"));
-            q.append(ChunkType::name(), ChunkType::genID(ns.ns(), startKey));
+            q.append(ChunkType::name(), ChunkType::genID(nss.ns(), startKey));
             q.done();
 
             updates.append(op.obj());
@@ -354,9 +361,9 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
         BSONObjBuilder b;
         b.append("ns", ChunkType::ConfigNS);
         b.append("q",
-                 BSON("query" << BSON(ChunkType::ns(ns.ns()) << ChunkType::min() << range.getMin()
-                                                             << ChunkType::max()
-                                                             << range.getMax())
+                 BSON("query" << BSON(ChunkType::ns(nss.ns()) << ChunkType::min() << range.getMin()
+                                                              << ChunkType::max()
+                                                              << range.getMax())
                               << "orderby"
                               << BSON(ChunkType::lastmod() << -1)));
         {
@@ -367,12 +374,12 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
         preCond.append(b.obj());
     }
 
-    // apply the batch of updates to remote and local metadata
+    // apply the batch of updates to local metadata.
     Status applyOpsStatus = Grid::get(opCtx)->catalogClient()->applyChunkOpsDeprecated(
         opCtx,
         updates.arr(),
         preCond.arr(),
-        ns.ns(),
+        nss.ns(),
         currentMaxVersion,
         WriteConcernOptions(),
         repl::ReadConcernLevel::kLocalReadConcern);
@@ -395,7 +402,7 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
         Grid::get(opCtx)
             ->catalogClient()
-            ->logChange(opCtx, "split", ns.ns(), logDetail.obj(), WriteConcernOptions())
+            ->logChange(opCtx, "split", nss.ns(), logDetail.obj(), WriteConcernOptions())
             .transitional_ignore();
     } else {
         BSONObj beforeDetailObj = logDetail.obj();
@@ -411,7 +418,8 @@ Status ShardingCatalogManager::commitChunkSplit(OperationContext* opCtx,
 
             Grid::get(opCtx)
                 ->catalogClient()
-                ->logChange(opCtx, "multi-split", ns.ns(), chunkDetail.obj(), WriteConcernOptions())
+                ->logChange(
+                    opCtx, "multi-split", nss.ns(), chunkDetail.obj(), WriteConcernOptions())
                 .transitional_ignore();
         }
     }
@@ -495,7 +503,7 @@ Status ShardingCatalogManager::commitChunkMerge(OperationContext* opCtx,
     auto updates = buildMergeChunksApplyOpsUpdates(chunksToMerge, mergeVersion);
     auto preCond = buildMergeChunksApplyOpsPrecond(chunksToMerge, collVersion);
 
-    // apply the batch of updates to remote and local metadata
+    // apply the batch of updates to local metadata
     Status applyOpsStatus = Grid::get(opCtx)->catalogClient()->applyChunkOpsDeprecated(
         opCtx,
         updates,
diff --git a/src/mongo/s/commands/cluster_split_cmd.cpp b/src/mongo/s/commands/cluster_split_cmd.cpp
index 61c2ff04bfd..5c7a4f469fb 100644
--- a/src/mongo/s/commands/cluster_split_cmd.cpp
+++ b/src/mongo/s/commands/cluster_split_cmd.cpp
@@ -272,6 +272,8 @@ public:
                                                   ChunkRange(chunk->getMin(), chunk->getMax()),
                                                   {splitPoint}));
 
+        // This invalidation is only necessary so that auto-split can begin to track statistics for
+        // the chunks produced after the split instead of the single original chunk.
         Grid::get(opCtx)->catalogCache()->onStaleConfigError(std::move(routingInfo));
 
         return true;
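The jstests changes above capture the behavioral shift this patch makes: a committed split no longer refreshes the shard's routing table, so anything that wants the shard's own view of the new chunks must trigger a refresh explicitly. A minimal sketch of that pattern (not part of the patch), reusing the st, admin, shardAdmin, and coll names from the tests above:

// Sketch only: split via mongos, force the shard to refresh its routing table,
// then read the shard's view of the chunks. Variable names follow the jstests above.
assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));
// Without this refresh the shard would still report the pre-split metadata,
// because committing a split no longer refreshes the shard automatically.
assert.commandWorked(
    st.shard0.getDB('admin').runCommand({forceRoutingTableRefresh: coll + ""}));
var result = shardAdmin.runCommand({getShardVersion: coll + "", fullMetadata: true});
assert.commandWorked(result);
assert.eq(2, result.metadata.chunks.length);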