SERVER-36913 Add functionality to LockManager for repl state transitions with prepared transactions.

author: Spencer T Brody <spencer@mongodb.com> 2018-08-22 14:37:16 -0400
committer: Spencer T Brody <spencer@mongodb.com> 2018-09-18 13:20:18 -0400
commit: e65ff57e108ed69c46cc0b0ccbdd675663de2469 (patch)
tree: 010051703fd944884d99571e31f1dd45b28da8a1 /src/mongo/db/concurrency
parent: 157691ef0babb24cd1566446f5f88206f9607564 (diff)
download: mongo-e65ff57e108ed69c46cc0b0ccbdd675663de2469.tar.gz
10 files changed, 642 insertions, 208 deletions
diff --git a/src/mongo/db/concurrency/SConscript b/src/mongo/db/concurrency/SConscript
index 0ca5123d60b..9ddfa508f36 100644
--- a/src/mongo/db/concurrency/SConscript
+++ b/src/mongo/db/concurrency/SConscript
@@ -38,6 +38,7 @@ env.Library(
         'lock_manager.cpp',
         'lock_state.cpp',
         'lock_stats.cpp',
+        'replication_lock_manager_manipulator.cpp',
     ],
     LIBDEPS=[
         '$BUILD_DIR/mongo/base',
diff --git a/src/mongo/db/concurrency/lock_manager.cpp b/src/mongo/db/concurrency/lock_manager.cpp
index b18007877f8..0b93ae967fb 100644
--- a/src/mongo/db/concurrency/lock_manager.cpp
+++ b/src/mongo/db/concurrency/lock_manager.cpp
@@ -129,208 +129,6 @@ MONGO_STATIC_ASSERT((sizeof(LockRequestStatusNames) / sizeof(LockRequestStatusNa
 }  // namespace
 
 /**
- * There is one of these objects for each resource that has a lock request. Empty objects (i.e.
- * LockHead with no requests) are allowed to exist on the lock manager's hash table.
- *
- * The memory and lifetime is controlled entirely by the LockManager class.
- *
- * Not thread-safe and should only be accessed under the LockManager's bucket lock. Must be locked
- * before locking a partition, not after.
- */
-struct LockHead {
-
-    /**
-     * Used for initialization of a LockHead, which might have been retrieved from cache and also in
-     * order to keep the LockHead structure a POD.
-     */
-    void initNew(ResourceId resId) {
-        resourceId = resId;
-
-        grantedList.reset();
-        memset(grantedCounts, 0, sizeof(grantedCounts));
-        grantedModes = 0;
-
-        conflictList.reset();
-        memset(conflictCounts, 0, sizeof(conflictCounts));
-        conflictModes = 0;
-
-        conversionsCount = 0;
-        compatibleFirstCount = 0;
-    }
-
-    /**
-     * True iff there may be partitions with granted requests for this resource.
-     */
-    bool partitioned() const {
-        return !partitions.empty();
-    }
-
-    /**
-     * Locates the request corresponding to the particular locker or returns nullptr. Must be called
-     * with the bucket holding this lock head locked.
-     */
-    LockRequest* findRequest(LockerId lockerId) const {
-        // Check the granted queue first
-        for (LockRequest* it = grantedList._front; it != nullptr; it = it->next) {
-            if (it->locker->getId() == lockerId) {
-                return it;
-            }
-        }
-
-        // Check the conflict queue second
-        for (LockRequest* it = conflictList._front; it != nullptr; it = it->next) {
-            if (it->locker->getId() == lockerId) {
-                return it;
-            }
-        }
-
-        return nullptr;
-    }
-
-    /**
-     * Finish creation of request and put it on the LockHead's conflict or granted queues. Returns
-     * LOCK_WAITING for conflict case and LOCK_OK otherwise.
-     */
-    LockResult newRequest(LockRequest* request) {
-        invariant(!request->partitionedLock);
-        request->lock = this;
-
-        // We cannot set request->partitioned to false, as this might be a migration, in which case
-        // access to that field is not protected. The 'partitioned' member instead indicates if a
-        // request was initially partitioned.
-
-        // New lock request. Queue after all granted modes and after any already requested
-        // conflicting modes
-        if (conflicts(request->mode, grantedModes) ||
-            (!compatibleFirstCount && conflicts(request->mode, conflictModes))) {
-            request->status = LockRequest::STATUS_WAITING;
-
-            // Put it on the conflict queue. Conflicts are granted front to back.
-            if (request->enqueueAtFront) {
-                conflictList.push_front(request);
-            } else {
-                conflictList.push_back(request);
-            }
-
-            incConflictModeCount(request->mode);
-
-            return LOCK_WAITING;
-        }
-
-        // No conflict, new request
-        request->status = LockRequest::STATUS_GRANTED;
-
-        grantedList.push_back(request);
-        incGrantedModeCount(request->mode);
-
-        if (request->compatibleFirst) {
-            compatibleFirstCount++;
-        }
-
-        return LOCK_OK;
-    }
-
-    /**
-     * Lock each partitioned LockHead in turn, and move any (granted) intent mode requests for
-     * lock->resourceId to lock, which must itself already be locked.
-     */
-    void migratePartitionedLockHeads();
-
-    // Methods to maintain the granted queue
-    void incGrantedModeCount(LockMode mode) {
-        invariant(grantedCounts[mode] >= 0);
-        if (++grantedCounts[mode] == 1) {
-            invariant((grantedModes & modeMask(mode)) == 0);
-            grantedModes |= modeMask(mode);
-        }
-    }
-
-    void decGrantedModeCount(LockMode mode) {
-        invariant(grantedCounts[mode] >= 1);
-        if (--grantedCounts[mode] == 0) {
-            invariant((grantedModes & modeMask(mode)) == modeMask(mode));
-            grantedModes &= ~modeMask(mode);
-        }
-    }
-
-    // Methods to maintain the conflict queue
-    void incConflictModeCount(LockMode mode) {
-        invariant(conflictCounts[mode] >= 0);
-        if (++conflictCounts[mode] == 1) {
-            invariant((conflictModes & modeMask(mode)) == 0);
-            conflictModes |= modeMask(mode);
-        }
-    }
-
-    void decConflictModeCount(LockMode mode) {
-        invariant(conflictCounts[mode] >= 1);
-        if (--conflictCounts[mode] == 0) {
-            invariant((conflictModes & modeMask(mode)) == modeMask(mode));
-            conflictModes &= ~modeMask(mode);
-        }
-    }
-
-    // Id of the resource which is protected by this lock. Initialized at construction time and does
-    // not change.
-    ResourceId resourceId;
-
-    //
-    // Granted queue
-    //
-
-    // Doubly-linked list of requests, which have been granted. Newly granted requests go to
-    // the end of the queue. Conversion requests are granted from the beginning forward.
-    LockRequestList grantedList;
-
-    // Counts the grants and conversion counts for each of the supported lock modes. These
-    // counts should exactly match the aggregated modes on the granted list.
-    uint32_t grantedCounts[LockModesCount];
-
-    // Bit-mask of the granted + converting modes on the granted queue. Maintained in lock-step
-    // with the grantedCounts array.
-    uint32_t grantedModes;
-
-    //
-    // Conflict queue
-    //
-
-    // Doubly-linked list of requests, which have not been granted yet because they conflict
-    // with the set of granted modes. Requests are queued at the end of the queue and are
-    // granted from the beginning forward, which gives these locks FIFO ordering. Exceptions
-    // are high-priority locks, such as the MMAP V1 flush lock.
-    LockRequestList conflictList;
-
-    // Counts the conflicting requests for each of the lock modes. These counts should exactly
-    // match the aggregated modes on the conflicts list.
-    uint32_t conflictCounts[LockModesCount];
-
-    // Bit-mask of the conflict modes on the conflict queue. Maintained in lock-step with the
-    // conflictCounts array.
-    uint32_t conflictModes;
-
-    // References partitions that may have PartitionedLockHeads for this LockHead.
-    // Non-empty implies the lock has no conflicts and only has intent modes as grantedModes.
-    // TODO: Remove this vector and make LockHead a POD
-    std::vector<LockManager::Partition*> partitions;
-
-    //
-    // Conversion
-    //
-
-    // Counts the number of requests on the granted queue, which have requested any kind of
-    // conflicting conversion and are blocked (i.e. all requests which are currently
-    // STATUS_CONVERTING). This is an optimization for unlocking in that we do not need to
-    // check the granted queue for requests in STATUS_CONVERTING if this count is zero. This
-    // saves cycles in the regular case and only burdens the less-frequent lock upgrade case.
-    uint32_t conversionsCount;
-
-    // Counts the number of requests on the granted queue, which have requested that the policy
-    // be switched to compatible-first. As long as this value is > 0, the policy will stay
-    // compatible-first.
-    uint32_t compatibleFirstCount;
-};
-
-/**
  * The PartitionedLockHead allows optimizing the case where requests overwhelmingly use
  * the intent lock modes MODE_IS and MODE_IX, which are compatible with each other.
  * Having to use a single LockHead causes contention where none would be needed.
@@ -370,6 +168,85 @@ struct PartitionedLockHead {
     LockRequestList grantedList;
 };
 
+//
+// LockHead
+//
+void LockHead::initNew(ResourceId resId) {
+    resourceId = resId;
+
+    grantedList.reset();
+    memset(grantedCounts, 0, sizeof(grantedCounts));
+    grantedModes = 0;
+
+    conflictList.reset();
+    memset(conflictCounts, 0, sizeof(conflictCounts));
+    conflictModes = 0;
+
+    conversionsCount = 0;
+    compatibleFirstCount = 0;
+}
+
+bool LockHead::partitioned() const {
+    return !partitions.empty();
+}
+
+LockRequest* LockHead::findRequest(LockerId lockerId) const {
+    // Check the granted queue first
+    for (LockRequest* it = grantedList._front; it != nullptr; it = it->next) {
+        if (it->locker->getId() == lockerId) {
+            return it;
+        }
+    }
+
+    // Check the conflict queue second
+    for (LockRequest* it = conflictList._front; it != nullptr; it = it->next) {
+        if (it->locker->getId() == lockerId) {
+            return it;
+        }
+    }
+
+    return nullptr;
+}
+
+LockResult LockHead::newRequest(LockRequest* request) {
+    invariant(!request->partitionedLock);
+    request->lock = this;
+
+    // We cannot set request->partitioned to false, as this might be a migration, in which case
+    // access to that field is not protected. The 'partitioned' member instead indicates if a
+    // request was initially partitioned.
+
+    // New lock request. Queue after all granted modes and after any already requested
+    // conflicting modes
+    if (conflicts(request->mode, grantedModes) ||
+        (!compatibleFirstCount && conflicts(request->mode, conflictModes))) {
+        request->status = LockRequest::STATUS_WAITING;
+
+        // Put it on the conflict queue. Conflicts are granted front to back.
+        if (request->enqueueAtFront) {
+            conflictList.push_front(request);
+        } else {
+            conflictList.push_back(request);
+        }
+
+        incConflictModeCount(request->mode);
+
+        return LOCK_WAITING;
+    }
+
+    // No conflict, new request
+    request->status = LockRequest::STATUS_GRANTED;
+
+    grantedList.push_back(request);
+    incGrantedModeCount(request->mode);
+
+    if (request->compatibleFirst) {
+        compatibleFirstCount++;
+    }
+
+    return LOCK_OK;
+}
+
 void LockHead::migratePartitionedLockHeads() {
     invariant(partitioned());
 
@@ -403,6 +280,38 @@ void LockHead::migratePartitionedLockHeads() {
     }
 }
 
+void LockHead::incGrantedModeCount(LockMode mode) {
+    invariant(grantedCounts[mode] >= 0);
+    if (++grantedCounts[mode] == 1) {
+        invariant((grantedModes & modeMask(mode)) == 0);
+        grantedModes |= modeMask(mode);
+    }
+}
+
+void LockHead::decGrantedModeCount(LockMode mode) {
+    invariant(grantedCounts[mode] >= 1);
+    if (--grantedCounts[mode] == 0) {
+        invariant((grantedModes & modeMask(mode)) == modeMask(mode));
+        grantedModes &= ~modeMask(mode);
+    }
+}
+
+void LockHead::incConflictModeCount(LockMode mode) {
+    invariant(conflictCounts[mode] >= 0);
+    if (++conflictCounts[mode] == 1) {
+        invariant((conflictModes & modeMask(mode)) == 0);
+        conflictModes |= modeMask(mode);
+    }
+}
+
+void LockHead::decConflictModeCount(LockMode mode) {
+    invariant(conflictCounts[mode] >= 1);
+    if (--conflictCounts[mode] == 0) {
+        invariant((conflictModes & modeMask(mode)) == modeMask(mode));
+        conflictModes &= ~modeMask(mode);
+    }
+}
+
 //
 // LockManager
 //
@@ -1204,4 +1113,8 @@ const char* lockRequestStatusName(LockRequest::Status status) {
     return LockRequestStatusNames[status];
 }
 
+LockManager::TemporaryResourceQueue::TemporaryResourceQueue(ResourceId resourceId) {
+    _lockHead.initNew(resourceId);
+}
+
 }  // namespace mongo
diff --git a/src/mongo/db/concurrency/lock_manager.h b/src/mongo/db/concurrency/lock_manager.h
index 2956232f996..5f481b76911 100644
--- a/src/mongo/db/concurrency/lock_manager.h
+++ b/src/mongo/db/concurrency/lock_manager.h
@@ -57,6 +57,8 @@ public:
     LockManager();
     ~LockManager();
 
+    class TemporaryResourceQueue;
+
     /**
       * Acquires lock on the specified resource in the specified mode and returns the outcome
       * of the operation. See the details for LockResult for more information on what the
@@ -138,6 +140,10 @@ private:
     // The lockheads need access to the partitions
     friend struct LockHead;
 
+    // ReplicationLockManagerManipulator manipulates LockManager state directly in order to
+    // encapsulate specific logic for replication state transitions.
+    friend class ReplicationLockManagerManipulator;
+
     // These types describe the locks hash table
 
     struct LockBucket {
@@ -200,7 +206,7 @@ private:
      *
      * @param lock Lock whose grant state should be recalculated.
      * @param checkConflictQueue Whether to go through the conflict queue. This is an
-     *          optimisation in that we only need to check the conflict queue if one of the
+     *          optimization in that we only need to check the conflict queue if one of the
      *          granted modes, which was conflicting before became zero.
      */
     void _onLockModeChanged(LockHead* lock, bool checkConflictQueue);
@@ -313,4 +319,146 @@ private:
     bool _foundCycle;
 };
 
+/**
+ * There is one of these objects for each resource that has a lock request. Empty objects (i.e.
+ * LockHead with no requests) are allowed to exist on the lock manager's hash table.
+ *
+ * The memory and lifetime is controlled entirely by the LockManager class.
+ *
+ * Not thread-safe and should only be accessed under the LockManager's bucket lock. Must be locked
+ * before locking a partition, not after.
+ *
+ * This struct *should* just be a private sub-class of LockManager since it is effectively an
+ * implementation detail of LockManager.  The reason it isn't is because it is also used by
+ * ReplicationLockManagerManipulator and because it is forward-declared in lock_manager_defs.h.
+ * Nevertheless this struct should be thought of as an implementation detail of LockManager and
+ * should not be used directly except by LockManager and friend classes of LockManager that serve to
+ * extend LockManager functionality.
+ */
+struct LockHead {
+private:
+    friend class DeadlockDetector;
+    friend class LockManager;
+    friend class ReplicationLockManagerManipulator;
+
+    /**
+     * Used for initialization of a LockHead, which might have been retrieved from cache and also in
+     * order to keep the LockHead structure a POD.
+     */
+    void initNew(ResourceId resId);
+
+    /**
+     * True iff there may be partitions with granted requests for this resource.
+     */
+    bool partitioned() const;
+
+    /**
+     * Locates the request corresponding to the particular locker or returns nullptr. Must be called
+     * with the bucket holding this lock head locked.
+     */
+    LockRequest* findRequest(LockerId lockerId) const;
+
+    /**
+     * Finish creation of request and put it on the LockHead's conflict or granted queues. Returns
+     * LOCK_WAITING for conflict case and LOCK_OK otherwise.
+     */
+    LockResult newRequest(LockRequest* request);
+
+    /**
+     * Lock each partitioned LockHead in turn, and move any (granted) intent mode requests for
+     * lock->resourceId to lock, which must itself already be locked.
+     */
+    void migratePartitionedLockHeads();
+
+    // Methods to maintain the granted queue
+    void incGrantedModeCount(LockMode mode);
+
+    void decGrantedModeCount(LockMode mode);
+
+    // Methods to maintain the conflict queue
+    void incConflictModeCount(LockMode mode);
+
+    void decConflictModeCount(LockMode mode);
+
+    // Id of the resource which is protected by this lock. Initialized at construction time and does
+    // not change.
+    ResourceId resourceId;
+
+    //
+    // Granted queue
+    //
+
+    // Doubly-linked list of requests, which have been granted. Newly granted requests go to
+    // the end of the queue. Conversion requests are granted from the beginning forward.
+    LockRequestList grantedList;
+
+    // Counts the grants and conversion counts for each of the supported lock modes. These
+    // counts should exactly match the aggregated modes on the granted list.
+    uint32_t grantedCounts[LockModesCount];
+
+    // Bit-mask of the granted + converting modes on the granted queue. Maintained in lock-step
+    // with the grantedCounts array.
+    uint32_t grantedModes;
+
+    //
+    // Conflict queue
+    //
+
+    // Doubly-linked list of requests, which have not been granted yet because they conflict
+    // with the set of granted modes. Requests are queued at the end of the queue and are
+    // granted from the beginning forward, which gives these locks FIFO ordering. Exceptions
+    // are high-priority locks, such as the MMAP V1 flush lock.
+    LockRequestList conflictList;
+
+    // Counts the conflicting requests for each of the lock modes. These counts should exactly
+    // match the aggregated modes on the conflicts list.
+    uint32_t conflictCounts[LockModesCount];
+
+    // Bit-mask of the conflict modes on the conflict queue. Maintained in lock-step with the
+    // conflictCounts array.
+    uint32_t conflictModes;
+
+    // References partitions that may have PartitionedLockHeads for this LockHead.
+    // Non-empty implies the lock has no conflicts and only has intent modes as grantedModes.
+    // TODO: Remove this vector and make LockHead a POD
+    std::vector<LockManager::Partition*> partitions;
+
+    //
+    // Conversion
+    //
+
+    // Counts the number of requests on the granted queue, which have requested any kind of
+    // conflicting conversion and are blocked (i.e. all requests which are currently
+    // STATUS_CONVERTING). This is an optimization for unlocking in that we do not need to
+    // check the granted queue for requests in STATUS_CONVERTING if this count is zero. This
+    // saves cycles in the regular case and only burdens the less-frequent lock upgrade case.
+    uint32_t conversionsCount;
+
+    // Counts the number of requests on the granted queue, which have requested that the policy
+    // be switched to compatible-first. As long as this value is > 0, the policy will stay
+    // compatible-first.
+    uint32_t compatibleFirstCount;
+};
+
+/**
+ * This class wraps a LockHead and is used to represent a temporary state for a resource managed by
+ * the LockManager.  This allows us to prepare lock requests against a resource without those
+ * requests actually being present in the "true" version of the resource which is actually being
+ * managed by the LockManager.  We use this to avoid exposing LockHead, which is an implementation
+ * detail of the LockManager, while still providing a handle to state for a LockManager resource.
+ */
+class LockManager::TemporaryResourceQueue {
+public:
+    explicit TemporaryResourceQueue(ResourceId resourceId);
+
+    ResourceId getResourceId() const {
+        return _lockHead.resourceId;
+    }
+
+private:
+    friend class ReplicationLockManagerManipulator;
+
+    LockHead _lockHead;
+};
+
 }  // namespace mongo
diff --git a/src/mongo/db/concurrency/lock_state.cpp b/src/mongo/db/concurrency/lock_state.cpp
index e526931d785..576d476abae 100644
--- a/src/mongo/db/concurrency/lock_state.cpp
+++ b/src/mongo/db/concurrency/lock_state.cpp
@@ -34,6 +34,7 @@
 
 #include <vector>
 
+#include "mongo/db/concurrency/replication_lock_manager_manipulator.h"
 #include "mongo/db/namespace_string.h"
 #include "mongo/db/service_context.h"
 #include "mongo/platform/compiler.h"
@@ -120,6 +121,12 @@ AtomicUInt64 idCounter(0);
 // Partitioned global lock statistics, so we don't hit the same bucket
 PartitionedInstanceWideLockStats globalStats;
 
+// When this is set, lock acquisitions for the global resource go into the TemporaryRequestQueue
+// stored in this decoration instead of letting the LockManager look up and put the requests into
+// the true global resource state.
+OperationContext::Decoration<LockManager::TemporaryResourceQueue*> globalResourceShadow =
+    OperationContext::declareDecoration<LockManager::TemporaryResourceQueue*>();
+
 }  // namespace
 
 bool LockerImpl::_shouldDelayUnlock(ResourceId resId, LockMode mode) const {
@@ -263,7 +270,8 @@ LockerImpl::~LockerImpl() {
     invariant(!inAWriteUnitOfWork());
     invariant(_numResourcesToUnlockAtEndUnitOfWork == 0);
     invariant(_requests.empty());
-    invariant(_modeForTicket == MODE_NONE);
+    invariant(_modeForTicket == MODE_NONE,
+              str::stream() << "_modeForTicket found: " << _modeForTicket);
 
     // Reset the locking statistics so the object can be reused
     _stats.reset();
@@ -349,7 +357,7 @@ LockResult LockerImpl::_lockGlobalBegin(OperationContext* opCtx, LockMode mode,
     }
 
     LockMode actualLockMode = mode;
-    if (opCtx) {
+    if (opCtx && opCtx->getServiceContext()) {
         auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
         if (storageEngine && !storageEngine->supportsDBLocking()) {
             actualLockMode = isSharedLockMode(mode) ? MODE_S : MODE_X;
@@ -361,7 +369,7 @@ LockResult LockerImpl::_lockGlobalBegin(OperationContext* opCtx, LockMode mode,
 
     // Currently, deadlock detection does not happen inline with lock acquisition so the only
     // unsuccessful result that the lock manager would return is LOCK_WAITING.
-    invariant(result == LOCK_WAITING);
+    invariant(result == LOCK_WAITING, str::stream() << "Unexpected lock result: " << result);
 
     return result;
 }
@@ -656,6 +664,42 @@ void LockerImpl::restoreLockState(OperationContext* opCtx, const Locker::LockSna
     invariant(_modeForTicket != MODE_NONE);
 }
 
+void LockerImpl::restoreLockStateWithTemporaryGlobalResource(
+    OperationContext* opCtx,
+    const LockSnapshot& state,
+    LockManager::TemporaryResourceQueue* tempGlobalResource) {
+    invariant(tempGlobalResource->getResourceId().getType() == ResourceType::RESOURCE_GLOBAL);
+    invariant(globalResourceShadow(opCtx) == nullptr);
+
+    globalResourceShadow(opCtx) = tempGlobalResource;
+    ON_BLOCK_EXIT([&] { globalResourceShadow(opCtx) = nullptr; });
+
+    restoreLockState(opCtx, state);
+}
+
+void LockerImpl::replaceGlobalLockStateWithTemporaryGlobalResource(
+    LockManager::TemporaryResourceQueue* tempGlobalResource) {
+    invariant(tempGlobalResource->getResourceId().getType() == ResourceType::RESOURCE_GLOBAL);
+
+    // Transfer the LockRequests from tempGlobalResource into the true resource for the global lock
+    // that is managed by the LockManager.  This also removes the existing MODE_X LockRequest from
+    // the granted list for the true global resource, but we still need to delete that LockRequest
+    // and remove it from this Locker's _requests list.
+    ReplicationLockManagerManipulator(&globalLockManager)
+        .replaceGlobalLocksWithLocksFromTemporaryGlobalResource(resourceIdGlobal,
+                                                                tempGlobalResource);
+
+    // Release the ticket that this Locker was holding for the global X lock.
+    invariant(_modeForTicket == MODE_X);
+    _releaseTicket();
+    _modeForTicket = MODE_NONE;
+
+    // Now fully delete the LockRequest.
+    auto it = _requests.find(resourceIdGlobal);
+    scoped_spinlock scopedLock(_lock);
+    it.remove();
+}
+
 LockResult LockerImpl::lockBegin(OperationContext* opCtx, ResourceId resId, LockMode mode) {
     dassert(!getWaitingResource().isValid());
 
@@ -711,8 +755,21 @@ LockResult LockerImpl::lockBegin(OperationContext* opCtx, ResourceId resId, Lock
     // otherwise we might reset state if the lock becomes granted very fast.
     _notify.clear();
 
-    LockResult result = isNew ? globalLockManager.lock(resId, request, mode)
-                              : globalLockManager.convert(resId, request, mode);
+    LockResult result{LockResult::LOCK_INVALID};
+    if (resType == RESOURCE_GLOBAL && opCtx && globalResourceShadow(opCtx)) {
+        // If we're trying to lock the global resource and we have a temporary global resource
+        // installed, use the temporary resource instead of letting the LockManager look up the
+        // true resource for the global lock.
+        invariant(isNew);
+        ReplicationLockManagerManipulator(&globalLockManager)
+            .lockUncontestedTemporaryGlobalResource(globalResourceShadow(opCtx), request, mode);
+        // lockUncontestedTemporaryGlobalResource can't fail.
+        result = LockResult::LOCK_OK;
+    } else {
+        // Normal case using the true global lock head.
+        result = isNew ? globalLockManager.lock(resId, request, mode)
+                       : globalLockManager.convert(resId, request, mode);
+    }
 
     if (result == LOCK_WAITING) {
         globalStats.recordWait(_id, resId, mode);
diff --git a/src/mongo/db/concurrency/lock_state.h b/src/mongo/db/concurrency/lock_state.h
index daf86c3eb5e..8eb9ec4053c 100644
--- a/src/mongo/db/concurrency/lock_state.h
+++ b/src/mongo/db/concurrency/lock_state.h
@@ -193,6 +193,14 @@ public:
         restoreLockState(nullptr, stateToRestore);
     }
 
+    void restoreLockStateWithTemporaryGlobalResource(
+        OperationContext* opCtx,
+        const LockSnapshot& stateToRestore,
+        LockManager::TemporaryResourceQueue* tempGlobalResource) override;
+
+    void replaceGlobalLockStateWithTemporaryGlobalResource(
+        LockManager::TemporaryResourceQueue* tempGlobalResource) override;
+
     virtual void releaseTicket();
     virtual void reacquireTicket(OperationContext* opCtx);
 
diff --git a/src/mongo/db/concurrency/lock_state_test.cpp b/src/mongo/db/concurrency/lock_state_test.cpp
index 4d2da27bd94..3ee780e364b 100644
--- a/src/mongo/db/concurrency/lock_state_test.cpp
+++ b/src/mongo/db/concurrency/lock_state_test.cpp
@@ -37,6 +37,7 @@
 #include "mongo/config.h"
 #include "mongo/db/concurrency/lock_manager_test_help.h"
 #include "mongo/db/concurrency/locker.h"
+#include "mongo/db/operation_context_noop.h"
 #include "mongo/unittest/unittest.h"
 #include "mongo/util/log.h"
 #include "mongo/util/timer.h"
@@ -657,4 +658,102 @@ TEST(LockerImpl, ConvertLockPendingUnlockAndUnlock) {
     locker.unlockGlobal();
 }
 
+/**
+ * This test mimics the process that replica set state transitions go through.
+ * First a request for the global X lock is enqueued, then the existing locks held by various
+ * ongoing prepared transactions are yielded so that the global X lock request can succeed.
+ * When the state transition is complete we then need to atomically restore the locks that were
+ * yielded while dropping the global X lock that was held for the transition.
+ */
+TEST(LockerImpl, AtomicLockRestoreAndGlobalLockReleaseForReplStateTransitions) {
+    const ResourceId globalResId(RESOURCE_GLOBAL, ResourceId::SINGLETON_GLOBAL);
+    const ResourceId resIdDatabase(RESOURCE_DATABASE, "TestDB"_sd);
+    const ResourceId resIdCollection1(RESOURCE_COLLECTION, "TestDB.collection1"_sd);
+    const ResourceId resIdCollection2(RESOURCE_COLLECTION, "TestDB.collection2"_sd);
+    const ResourceId resIdCollection3(RESOURCE_COLLECTION, "TestDB.collection2"_sd);
+
+    Locker::LockSnapshot lockInfo1, lockInfo2, lockInfo3;
+    LockerImpl stepUpLocker, txnLocker1, txnLocker2, txnLocker3, randomOpLocker1, randomOpLocker2;
+    OperationContextNoop opCtx1, opCtx2, opCtx3;
+    ON_BLOCK_EXIT([&] {
+        // clean up locks on test completion.
+        stepUpLocker.unlockGlobal();
+        txnLocker1.unlockGlobal();
+        txnLocker2.unlockGlobal();
+        txnLocker3.unlockGlobal();
+        randomOpLocker1.unlockGlobal();
+        randomOpLocker2.unlockGlobal();
+    });
+
+    // Take some locks, mimicking the locks that would be held by ongoing prepared transactions.
+    txnLocker1.lockGlobal(&opCtx1, MODE_IX);
+    ASSERT_EQUALS(LOCK_OK, txnLocker1.lock(&opCtx1, resIdDatabase, MODE_IX));
+    ASSERT_EQUALS(LOCK_OK, txnLocker1.lock(&opCtx1, resIdCollection1, MODE_IX));
+    ASSERT_EQUALS(LOCK_OK, txnLocker1.lock(&opCtx1, resIdCollection2, MODE_IX));
+
+    txnLocker2.lockGlobal(&opCtx2, MODE_IX);
+    ASSERT_EQUALS(LOCK_OK, txnLocker2.lock(&opCtx2, resIdDatabase, MODE_IX));
+    ASSERT_EQUALS(LOCK_OK, txnLocker2.lock(&opCtx2, resIdCollection2, MODE_IX));
+
+    txnLocker3.lockGlobal(&opCtx3, MODE_IX);
+    ASSERT_EQUALS(LOCK_OK, txnLocker3.lock(&opCtx3, resIdDatabase, MODE_IX));
+    ASSERT_EQUALS(LOCK_OK, txnLocker3.lock(&opCtx3, resIdCollection3, MODE_IX));
+
+    // Enqueue request for global X lock in stepUpLocker, mimicking the lock held by the thread
+    // performing the state transition.
+    ASSERT_EQUALS(LockResult::LOCK_WAITING, stepUpLocker.lockGlobalBegin(MODE_X, Date_t::max()));
+
+    // Enqueue a lock request behind the pending global X request to ensure that it gets granted
+    // later when the global X lock is released.
+    ASSERT_EQUALS(LockResult::LOCK_WAITING,
+                  randomOpLocker1.lockGlobalBegin(MODE_IS, Date_t::max()));
+
+    // Yield locks on all txn threads.
+    ASSERT_TRUE(txnLocker1.saveLockStateAndUnlock(&lockInfo1));
+    ASSERT_TRUE(txnLocker2.saveLockStateAndUnlock(&lockInfo2));
+    ASSERT_TRUE(txnLocker3.saveLockStateAndUnlock(&lockInfo3));
+
+    // Ensure that stepUpLocker is now able to acquire the global X lock.
+    ASSERT_EQUALS(LockResult::LOCK_OK, stepUpLocker.lockGlobalComplete(Date_t::max()));
+
+    // Enqueue a lock request behind the global X lock to ensure that it gets granted
+    // later when the global X lock is released.
+    ASSERT_EQUALS(LockResult::LOCK_WAITING,
+                  randomOpLocker2.lockGlobalBegin(MODE_IX, Date_t::max()));
+
+    // Now we need to atomically release the global X lock from stepUpLocker and restore the lock
+    // state for the txn threads. We start by restoring the lock state from the txn threads with
+    // the global locks placed into a temporary LockHead.
+    LockManager::TemporaryResourceQueue tempGlobalResource(globalResId);
+
+    txnLocker1.restoreLockStateWithTemporaryGlobalResource(&opCtx1, lockInfo1, &tempGlobalResource);
+    txnLocker2.restoreLockStateWithTemporaryGlobalResource(&opCtx2, lockInfo2, &tempGlobalResource);
+    txnLocker3.restoreLockStateWithTemporaryGlobalResource(&opCtx3, lockInfo3, &tempGlobalResource);
+
+    // Atomically release the global X lock from stepUpLocker and move the global locks for the
+    // txn threads from the temporary LockHead into the true LockHead for the global lock.
+    stepUpLocker.replaceGlobalLockStateWithTemporaryGlobalResource(&tempGlobalResource);
+
+    // Make sure all the locks were acquired and released appropriately.
+    ASSERT_EQUALS(MODE_NONE, stepUpLocker.getLockMode(globalResId));
+
+    ASSERT_EQUALS(MODE_IX, txnLocker1.getLockMode(globalResId));
+    ASSERT_EQUALS(MODE_IX, txnLocker1.getLockMode(resIdDatabase));
+    ASSERT_EQUALS(MODE_IX, txnLocker1.getLockMode(resIdCollection1));
+    ASSERT_EQUALS(MODE_IX, txnLocker1.getLockMode(resIdCollection2));
+
+    ASSERT_EQUALS(MODE_IX, txnLocker2.getLockMode(globalResId));
+    ASSERT_EQUALS(MODE_IX, txnLocker2.getLockMode(resIdDatabase));
+    ASSERT_EQUALS(MODE_IX, txnLocker2.getLockMode(resIdCollection2));
+
+    ASSERT_EQUALS(MODE_IX, txnLocker3.getLockMode(globalResId));
+    ASSERT_EQUALS(MODE_IX, txnLocker3.getLockMode(resIdDatabase));
+    ASSERT_EQUALS(MODE_IX, txnLocker3.getLockMode(resIdCollection3));
+
+    // Make sure the pending global lock requests got granted when the global X lock was released.
+    ASSERT_EQUALS(LockResult::LOCK_OK, randomOpLocker1.lockGlobalComplete(Date_t::now()));
+    ASSERT_EQUALS(LockResult::LOCK_OK, randomOpLocker2.lockGlobalComplete(Date_t::now()));
+    ASSERT_EQUALS(MODE_IS, randomOpLocker1.getLockMode(globalResId));
+    ASSERT_EQUALS(MODE_IX, randomOpLocker2.getLockMode(globalResId));
+}
 }  // namespace mongo
diff --git a/src/mongo/db/concurrency/locker.h b/src/mongo/db/concurrency/locker.h
index 29b0ca9d364..07e45d052c5 100644
--- a/src/mongo/db/concurrency/locker.h
+++ b/src/mongo/db/concurrency/locker.h
@@ -366,6 +366,26 @@ public:
     virtual void restoreLockState(const LockSnapshot& stateToRestore) = 0;
 
     /**
+     * Works like restoreLockState but for any global locks in the state to restore, rather than
+     * restoring them into the true global lock resource owned by the LockManager,
+     * restores the global locks into the TemporaryResourceQueue for the global resource that is
+     * provided.  Locks on resources other than the global lock are restored to their true
+     * LockManager-owned resource objects.
+     */
+    virtual void restoreLockStateWithTemporaryGlobalResource(
+        OperationContext* opCtx,
+        const LockSnapshot& stateToRestore,
+        LockManager::TemporaryResourceQueue* tempGlobalResource) = 0;
+
+    /**
+     * Atomically releases the global X lock from the true global resource managed by the
+     * LockManager and transfers the locks from the 'tempGlobalResource' into the true global
+     * resource.
+     */
+    virtual void replaceGlobalLockStateWithTemporaryGlobalResource(
+        LockManager::TemporaryResourceQueue* tempGlobalResource) = 0;
+
+    /**
      * Releases the ticket associated with the Locker. This allows locks to be held without
      * contributing to reader/writer throttling.
      */
diff --git a/src/mongo/db/concurrency/locker_noop.h b/src/mongo/db/concurrency/locker_noop.h
index 232b0b5f685..957627c8124 100644
--- a/src/mongo/db/concurrency/locker_noop.h
+++ b/src/mongo/db/concurrency/locker_noop.h
@@ -178,6 +178,18 @@ public:
         MONGO_UNREACHABLE;
     }
 
+    void restoreLockStateWithTemporaryGlobalResource(
+        OperationContext* opCtx,
+        const LockSnapshot& stateToRestore,
+        LockManager::TemporaryResourceQueue* tempGlobalResource) override {
+        MONGO_UNREACHABLE;
+    }
+
+    void replaceGlobalLockStateWithTemporaryGlobalResource(
+        LockManager::TemporaryResourceQueue* tempGlobalResource) override {
+        MONGO_UNREACHABLE;
+    }
+
     virtual void releaseTicket() {
         MONGO_UNREACHABLE;
     }
diff --git a/src/mongo/db/concurrency/replication_lock_manager_manipulator.cpp b/src/mongo/db/concurrency/replication_lock_manager_manipulator.cpp
new file mode 100644
index 00000000000..009bbbe3c20
--- /dev/null
+++ b/src/mongo/db/concurrency/replication_lock_manager_manipulator.cpp
@@ -0,0 +1,105 @@
+/**
+ *    Copyright (C) 2018 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/concurrency/replication_lock_manager_manipulator.h"
+
+namespace mongo {
+
+ReplicationLockManagerManipulator::ReplicationLockManagerManipulator(LockManager* lockManager)
+    : _lockManager(lockManager) {}
+
+void ReplicationLockManagerManipulator::lockUncontestedTemporaryGlobalResource(
+    LockManager::TemporaryResourceQueue* tempGlobalResource, LockRequest* request, LockMode mode) {
+    // Sanity check that requests are not being reused without proper cleanup
+    invariant(request->status == LockRequest::STATUS_NEW);
+    invariant(request->recursiveCount == 1);
+    invariant(!request->partitioned);
+    invariant(tempGlobalResource->_lockHead.resourceId.getType() == ResourceType::RESOURCE_GLOBAL);
+
+    request->mode = mode;
+    const auto lockResult = tempGlobalResource->_lockHead.newRequest(request);
+    invariant(lockResult == LockResult::LOCK_OK);
+}
+
+void ReplicationLockManagerManipulator::replaceGlobalLocksWithLocksFromTemporaryGlobalResource(
+    ResourceId resId, LockManager::TemporaryResourceQueue* tempGlobalResource) {
+    invariant(resId.getType() == ResourceType::RESOURCE_GLOBAL);
+    invariant(tempGlobalResource->_lockHead.resourceId == resId);
+
+    LockManager::LockBucket* bucket = _lockManager->_getBucket(resId);
+    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
+    LockHead* trueGlobalLockHead = bucket->findOrInsert(resId);
+
+    LockHead* tempGlobalLockHead = &tempGlobalResource->_lockHead;
+
+    invariant(trueGlobalLockHead->grantedCounts[MODE_X] == 1);
+    invariant(trueGlobalLockHead->compatibleFirstCount == 1);
+    invariant(tempGlobalLockHead->conflictList.empty());
+
+    LockRequest* existingGlobalLockRequest = trueGlobalLockHead->grantedList._front;
+    invariant(!existingGlobalLockRequest->next);
+    invariant(existingGlobalLockRequest->mode == MODE_X);
+    invariant(existingGlobalLockRequest->status == LockRequest::Status::STATUS_GRANTED);
+
+    // Remove the existing granted MODE_X lock from the trueGlobalLockHead so it can be replaced
+    // by the locks from tempGlobalLockHead.
+    trueGlobalLockHead->grantedList.remove(existingGlobalLockRequest);
+    trueGlobalLockHead->decGrantedModeCount(existingGlobalLockRequest->mode);
+    trueGlobalLockHead->compatibleFirstCount--;
+
+    // Now iterate over the granted LockRequests in the tempGlobalLockHead and transfer them over
+    // to the trueGlobalLockHead.
+    for (LockRequest* it = tempGlobalLockHead->grantedList._front; it != nullptr;) {
+        LockRequest* next = it->next;
+
+        invariant(it->mode == MODE_IX);
+        invariant(it->status == LockRequest::Status::STATUS_GRANTED);
+        invariant(it->lock == tempGlobalLockHead);
+
+        it->lock = trueGlobalLockHead;
+        tempGlobalLockHead->grantedList.remove(it);
+        tempGlobalLockHead->decGrantedModeCount(it->mode);
+        trueGlobalLockHead->grantedList.push_back(it);
+        trueGlobalLockHead->incGrantedModeCount(it->mode);
+
+        it = next;
+    }
+    invariant(tempGlobalLockHead->grantedList.empty());
+    invariant(tempGlobalLockHead->grantedCounts[MODE_X] == 0);
+    invariant(tempGlobalLockHead->grantedModes == 0);
+
+    // Grant any pending requests against the true global lock head that can proceed now that the
+    // global X lock has been released.
+    _lockManager->_onLockModeChanged(trueGlobalLockHead, true);
+}
+
+}  // namespace mongo
diff --git a/src/mongo/db/concurrency/replication_lock_manager_manipulator.h b/src/mongo/db/concurrency/replication_lock_manager_manipulator.h
new file mode 100644
index 00000000000..44048de2eb2
--- /dev/null
+++ b/src/mongo/db/concurrency/replication_lock_manager_manipulator.h
@@ -0,0 +1,71 @@
+/**
+ *    Copyright (C) 2018 MongoDB Inc.
+ *
+ *    This program is free software: you can redistribute it and/or  modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the GNU Affero General Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/concurrency/lock_manager.h"
+
+namespace mongo {
+
+/**
+ * This friend class to the LockManager extends LockManager functionality to enable behaviors
+ * required for replication state transitions, which must atomically release the global X lock and
+ * restore locks for prepared transactions into their individual Lockers.
+ */
+class ReplicationLockManagerManipulator {
+    MONGO_DISALLOW_COPYING(ReplicationLockManagerManipulator);
+
+public:
+    explicit ReplicationLockManagerManipulator(LockManager* lockManager);
+    ~ReplicationLockManagerManipulator() = default;
+
+    /**
+     * Works like LockManager::lock() except that it only works for the Global lock and rather than
+     * looking up the true LockHead for the global resource ID, it puts the LockRequest into the
+     * given TemporaryResourceQueue, which is guaranteed not to have any conflicting locks for the
+     * given request.
+     */
+    void lockUncontestedTemporaryGlobalResource(
+        LockManager::TemporaryResourceQueue* tempGlobalResource,
+        LockRequest* request,
+        LockMode mode);
+
+    /**
+     * Takes the locks from a given TemporaryResourceQueue for the Global resource and moves them
+     * into the true LockHead for the global resource, atomically releasing the global X lock from
+     * the true LockHead for the global resource in the process.
+     */
+    void replaceGlobalLocksWithLocksFromTemporaryGlobalResource(
+        ResourceId resId, LockManager::TemporaryResourceQueue* tempGlobalResource);
+
+
+private:
+    LockManager* _lockManager;
+};
+
+}  // namespace mongo
author	Spencer T Brody <spencer@mongodb.com>	2018-08-22 14:37:16 -0400
committer	Spencer T Brody <spencer@mongodb.com>	2018-09-18 13:20:18 -0400
commit	e65ff57e108ed69c46cc0b0ccbdd675663de2469 (patch)
tree	010051703fd944884d99571e31f1dd45b28da8a1 /src/mongo/db/concurrency
parent	157691ef0babb24cd1566446f5f88206f9607564 (diff)
download	mongo-e65ff57e108ed69c46cc0b0ccbdd675663de2469.tar.gz