From a945fb4697af6e2360ddcdd4eee2b73a0f42469f Mon Sep 17 00:00:00 2001
From: Kevin Pulo
Date: Wed, 21 Feb 2018 03:08:22 +0000
Subject: SERVER-29807 Make CollectionRangeDeleter wait for w:majority after
 each range

That is, wait once after the range has finished being deleted - rather than
waiting after each batch of ~100 document deletions (while yielding).

Also log before and after waiting for majority replication.
---
 src/mongo/db/s/collection_range_deleter.cpp | 100 +++++++++++++++-------------
 1 file changed, 54 insertions(+), 46 deletions(-)

diff --git a/src/mongo/db/s/collection_range_deleter.cpp b/src/mongo/db/s/collection_range_deleter.cpp
index 4ae50230f6d..6c0fc7b0bf6 100644
--- a/src/mongo/db/s/collection_range_deleter.cpp
+++ b/src/mongo/db/s/collection_range_deleter.cpp
@@ -202,67 +202,75 @@ boost::optional<Date_t> CollectionRangeDeleter::cleanUpNextRange(
             wrote = e.toStatus();
             warning() << e.what();
         }
+    }  // drop autoColl
 
-        if (!wrote.isOK() || wrote.getValue() == 0) {
-            if (wrote.isOK()) {
-                LOG(0) << "No documents remain to delete in " << nss << " range "
-                       << redact(range->toString());
-            }
-
-            stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
-            self->_pop(wrote.getStatus());
-            if (!self->_orphans.empty()) {
-                LOG(1) << "Deleting " << nss.ns() << " range "
-                       << redact(self->_orphans.front().range.toString()) << " next.";
-            }
-
-            return Date_t{};
+    if (!wrote.isOK() || wrote.getValue() == 0) {
+        if (wrote.isOK()) {
+            LOG(0) << "No documents remain to delete in " << nss << " range "
+                   << redact(range->toString());
         }
-    }  // drop autoColl
 
-    invariant(range);
-    invariantOK(wrote.getStatus());
-    invariant(wrote.getValue() > 0);
+        // Wait for majority replication even when wrote isn't OK or == 0, because it might have
+        // been OK and/or > 0 previously, and the deletions must be persistent before notifying
+        // clients in _pop().
 
-    repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);
-    const auto clientOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
+        LOG(0) << "Waiting for majority replication of local deletions in " << nss.ns() << " range "
+               << redact(range->toString());
 
-    // Wait for replication outside the lock
-    const auto status = [&] {
-        try {
-            WriteConcernResult unusedWCResult;
-            return waitForWriteConcern(opCtx, clientOpTime, kMajorityWriteConcern, &unusedWCResult);
-        } catch (const DBException& e) {
-            return e.toStatus();
-        }
-    }();
+        repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);
+        const auto clientOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
 
-    if (!status.isOK()) {
-        LOG(0) << "Error when waiting for write concern after removing " << nss << " range "
-               << redact(range->toString()) << " : " << redact(status.reason());
+        // Wait for replication outside the lock
+        const auto status = [&] {
+            try {
+                WriteConcernResult unusedWCResult;
+                return waitForWriteConcern(
+                    opCtx, clientOpTime, kMajorityWriteConcern, &unusedWCResult);
+            } catch (const DBException& e) {
+                return e.toStatus();
+            }
+        }();
 
+        // Get the lock again to finish off this range (including notifying, if necessary).
         // Don't allow lock interrupts while cleaning up.
         UninterruptibleLockGuard noInterrupt(opCtx->lockState());
         AutoGetCollection autoColl(opCtx, nss, MODE_IX);
         auto* const css = CollectionShardingState::get(opCtx, nss);
-
+        auto* const self = forTestOnly ? forTestOnly : &css->_metadataManager->_rangesToClean;
         stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
-        auto* const self = &css->_metadataManager->_rangesToClean;
-
-        // If range were already popped (e.g. by dropping nss during the waitForWriteConcern above)
-        // its notification would have been triggered, so this check suffices to ensure that it is
-        // safe to pop the range here
-        if (!notification.ready()) {
-            invariant(!self->isEmpty() && self->_orphans.front().notification == notification);
-            LOG(0) << "Abandoning deletion of latest range in " << nss.ns() << " after "
-                   << wrote.getValue() << " local deletions because of replication failure";
-            self->_pop(status);
+
+        if (!status.isOK()) {
+            LOG(0) << "Error when waiting for write concern after removing " << nss << " range "
+                   << redact(range->toString()) << " : " << redact(status.reason());
+
+            // If range were already popped (e.g. by dropping nss during the waitForWriteConcern
+            // above) its notification would have been triggered, so this check suffices to ensure
+            // that it is safe to pop the range here
+            if (!notification.ready()) {
+                invariant(!self->isEmpty() && self->_orphans.front().notification == notification);
+                LOG(0) << "Abandoning deletion of latest range in " << nss.ns() << " after local "
+                       << "deletions because of replication failure";
+                self->_pop(status);
+            }
+        } else {
+            LOG(0) << "Finished deleting documents in " << nss.ns() << " range "
+                   << redact(range->toString());
+
+            self->_pop(wrote.getStatus());
        }
-    } else {
-        LOG(1) << "Deleted " << wrote.getValue() << " documents in " << nss.ns() << " range "
-               << redact(range->toString());
+
+        if (!self->_orphans.empty()) {
+            LOG(1) << "Deleting " << nss.ns() << " range "
+                   << redact(self->_orphans.front().range.toString()) << " next.";
+        }
+
+        return Date_t{};
     }
 
+    invariant(range);
+    invariantOK(wrote.getStatus());
+    invariant(wrote.getValue() > 0);
+
     notification.abandon();
     return Date_t{};
 }
--
cgit v1.2.1
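
Aside: the control-flow change described in the commit message (one w:majority wait per finished range, instead of one after every ~100-document batch) can be pictured with the minimal standalone sketch below. The helpers deleteBatch() and waitForMajorityReplication() are illustrative placeholders standing in for _doDeletion() and waitForWriteConcern(); they are not MongoDB APIs, and the plain loops compress what the real code spreads across repeated invocations of cleanUpNextRange().

    #include <iostream>

    // Placeholder helpers, not MongoDB functions.
    int deleteBatch(int& remaining, int maxToDelete) {
        const int deleted = remaining < maxToDelete ? remaining : maxToDelete;
        remaining -= deleted;
        return deleted;  // documents removed in this pass
    }

    void waitForMajorityReplication() {
        std::cout << "  ...waiting for w:majority...\n";
    }

    int main() {
        const int maxToDelete = 100;

        // Old behaviour: wait for majority replication after every batch.
        int remaining = 250;
        while (int wrote = deleteBatch(remaining, maxToDelete)) {
            std::cout << "deleted " << wrote << " documents\n";
            waitForMajorityReplication();  // once per ~100-document batch
        }

        // New behaviour (this patch): delete all batches first, then wait once
        // for the whole range before popping/notifying waiters.
        remaining = 250;
        while (int wrote = deleteBatch(remaining, maxToDelete)) {
            std::cout << "deleted " << wrote << " documents\n";
        }
        waitForMajorityReplication();  // once per range
        std::cout << "range finished; safe to notify\n";
        return 0;
    }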