author     Randolph Tan <randolph@10gen.com>    2014-09-29 14:26:40 -0400
committer  Randolph Tan <randolph@10gen.com>    2014-10-02 13:29:46 -0400
commit     cfddfb4b976a38a23f319abae6021c5864fa16d9 (patch)
tree       a3596900b34984529e65808f0dc37b32e121658b
parent     84220a31cdcda093347dea33fce56bcfdc4fe9ed (diff)
SERVER-14215 Log error when ScopedDistributedLock::acquire fails to acquire lock before timeout
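
The substance of the patch is a calling-convention change: lock helpers that returned bool and filled a std::string* out-parameter now return a Status carrying code and message together. A minimal, self-contained sketch of the two conventions, using a simplified stand-in for mongo's Status class rather than the real one:

#include <iostream>
#include <string>

// Simplified stand-in for mongo::Status; the real class also carries an
// ErrorCodes::Error value and reference-counted error info.
struct Status {
    int code;             // 0 means OK
    std::string reason;
    bool isOK() const { return code == 0; }
};

// Old convention: boolean result, error message via out-parameter.
bool acquireOld(std::string* errMsg) {
    *errMsg = "lock is taken";
    return false;
}

// New convention: a single Status return carries code and message together.
Status acquireNew() {
    return {107 /* LockFailed */, "lock is taken"};
}

int main() {
    std::string errMsg;
    if (!acquireOld(&errMsg))
        std::cout << "old style: " << errMsg << "\n";

    Status s = acquireNew();
    if (!s.isOK())
        std::cout << "new style: code " << s.code << ": " << s.reason << "\n";
}

The Status form removes the dummy-string plumbing visible in the old acquire() below and lets callers branch on a machine-readable code.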
-rw-r--r--  src/mongo/base/error_codes.err                          2
-rw-r--r--  src/mongo/db/auth/authz_manager_external_state_s.cpp    6
-rw-r--r--  src/mongo/s/config_upgrade.cpp                          8
-rw-r--r--  src/mongo/s/d_merge.cpp                                 6
-rw-r--r--  src/mongo/s/d_migrate.cpp                              19
-rw-r--r--  src/mongo/s/d_split.cpp                                 8
-rw-r--r--  src/mongo/s/distlock.cpp                              119
-rw-r--r--  src/mongo/s/distlock.h                                 99
8 files changed, 131 insertions, 136 deletions
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index fc183566269..d7fb5aabd8b 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -105,6 +105,8 @@ error_code("IncompatibleAuditMetadata", 102)
error_code("NewReplicaSetConfigurationIncompatible", 103)
error_code("NodeNotElectable", 104)
error_code("IncompatibleShardingMetadata", 105)
+error_code("DistributedClockSkewed", 106)
+error_code("LockFailed", 107)
# Non-sequential error codes (for compatibility only)
error_code("NotMaster", 10107) #this comes from assert_util.h
diff --git a/src/mongo/db/auth/authz_manager_external_state_s.cpp b/src/mongo/db/auth/authz_manager_external_state_s.cpp
index cc2fdb5a83d..28f8fdbce56 100644
--- a/src/mongo/db/auth/authz_manager_external_state_s.cpp
+++ b/src/mongo/db/auth/authz_manager_external_state_s.cpp
@@ -348,11 +348,11 @@ namespace mongo {
configServer.getConnectionString(), "authorizationData"));
lockHolder->setLockMessage(why.toString());
- std::string errmsg;
- if (!lockHolder->acquire(_authzUpdateLockAcquisitionTimeoutMillis, &errmsg)) {
+ Status acquisitionStatus = lockHolder->acquire(_authzUpdateLockAcquisitionTimeoutMillis);
+ if (!acquisitionStatus.isOK()) {
warning() <<
"Error while attempting to acquire distributed lock for user modification: " <<
- errmsg << endl;
+ acquisitionStatus.toString() << endl;
return false;
}
_authzDataUpdateLock.reset(lockHolder.release());
diff --git a/src/mongo/s/config_upgrade.cpp b/src/mongo/s/config_upgrade.cpp
index bb3c7ede26d..bc5e2812a07 100644
--- a/src/mongo/s/config_upgrade.cpp
+++ b/src/mongo/s/config_upgrade.cpp
@@ -505,11 +505,9 @@ namespace mongo {
upgradeLock.setLockMessage(stream() << "upgrading config database to new format v"
<< CURRENT_CONFIG_VERSION);
- if (!upgradeLock.acquire(20 * 60 * 1000, errMsg)) {
-
- *errMsg = stream() << "could not acquire upgrade lock for config upgrade to v"
- << CURRENT_CONFIG_VERSION << causedBy(errMsg);
-
+ Status acquisitionStatus = upgradeLock.acquire(20 * 60 * 1000);
+ if (!acquisitionStatus.isOK()) {
+ *errMsg = acquisitionStatus.toString();
return false;
}
diff --git a/src/mongo/s/d_merge.cpp b/src/mongo/s/d_merge.cpp
index 9d1e728ac4f..bdfcda69b6b 100644
--- a/src/mongo/s/d_merge.cpp
+++ b/src/mongo/s/d_merge.cpp
@@ -78,11 +78,11 @@ namespace mongo {
collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from "
<< minKey << " to " << maxKey );
- if ( !collLock.tryAcquire( errMsg ) ) {
-
+ Status acquisitionStatus = collLock.tryAcquire();
+ if (!acquisitionStatus.isOK()) {
*errMsg = stream() << "could not acquire collection lock for " << nss.ns()
<< " to merge chunks in [" << minKey << "," << maxKey << ")"
- << causedBy( *errMsg );
+ << causedBy(acquisitionStatus);
warning() << *errMsg << endl;
return false;
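
causedBy is mongo's helper for chaining an underlying error onto a higher-level message. With a Status in hand, the call sites can write causedBy(acquisitionStatus) instead of the old self-referential causedBy(*errMsg), which appended the very buffer about to be overwritten. A rough stand-alone illustration of the composition, with a hypothetical causedBy of this sketch's own rather than mongo's overload set:

#include <iostream>
#include <sstream>
#include <string>

struct Status {
    int code;
    std::string reason;
    std::string toString() const {
        std::ostringstream os;
        os << "LockFailed(" << code << "): " << reason;
        return os.str();
    }
};

// Hypothetical stand-in for mongo's causedBy(): formats the underlying
// error so it can be appended to a higher-level message.
std::string causedBy(const Status& s) {
    return " :: caused by :: " + s.toString();
}

int main() {
    Status acquisitionStatus{107, "lock is currently being held by another process"};
    std::ostringstream errMsg;
    errMsg << "could not acquire collection lock for test.foo"
           << " to merge chunks in [0,100)" << causedBy(acquisitionStatus);
    std::cout << errMsg.str() << "\n";
}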
diff --git a/src/mongo/s/d_migrate.cpp b/src/mongo/s/d_migrate.cpp
index 2e735cc442a..132a5e9071b 100644
--- a/src/mongo/s/d_migrate.cpp
+++ b/src/mongo/s/d_migrate.cpp
@@ -972,13 +972,13 @@ namespace mongo {
collLock.setLockMessage(str::stream() << "migrating chunk [" << minKey << ", " << maxKey
<< ") in " << ns);
- if (!collLock.tryAcquire(&errmsg)) {
- errmsg = str::stream() << "could not acquire collection lock for " << ns
- << " to migrate chunk [" << minKey << "," << maxKey << ")"
- << causedBy(errmsg);
-
- warning() << errmsg;
+ Status acquisitionStatus = collLock.tryAcquire();
+ if (!acquisitionStatus.isOK()) {
+ errmsg = str::stream() << "could not acquire collection lock for " << ns
+ << " to migrate chunk [" << minKey << "," << maxKey << ")"
+ << causedBy(acquisitionStatus);
+ warning() << errmsg << endl;
return false;
}
@@ -1224,11 +1224,10 @@ namespace mongo {
}
// Ensure distributed lock still held
- string lockHeldMsg;
- bool lockHeld = collLock.verifyLockHeld(&lockHeldMsg);
- if ( !lockHeld ) {
+ Status lockStatus = collLock.checkStatus();
+ if (!lockStatus.isOK()) {
errmsg = str::stream() << "not entering migrate critical section because "
- << lockHeldMsg;
+ << lockStatus.toString();
warning() << errmsg << endl;
return false;
}
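
This hunk re-verifies lock ownership immediately before the migrate critical section, because a distributed lock can be forced or lazily unlocked out from under its holder. A compressed sketch of the guard pattern, with hypothetical names:

#include <iostream>
#include <string>

struct Status {
    int code;
    std::string reason;
    bool isOK() const { return code == 0; }
};

// Hypothetical lock with the post-change interface: checkStatus() asks the
// config servers whether this process still holds the lock.
struct CollectionLockSketch {
    bool forced = false;
    Status checkStatus() const {
        if (forced) return {107, "lock was forced by another process"};
        return {0, ""};
    }
};

bool enterCriticalSection(const CollectionLockSketch& collLock) {
    Status lockStatus = collLock.checkStatus();  // re-verify ownership first
    if (!lockStatus.isOK()) {
        std::cout << "not entering migrate critical section because "
                  << lockStatus.reason << "\n";
        return false;
    }
    // ... critical section work would run here ...
    return true;
}

int main() {
    CollectionLockSketch collLock;
    collLock.forced = true;  // simulate losing the lock mid-migration
    enterCriticalSection(collLock);
}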
diff --git a/src/mongo/s/d_split.cpp b/src/mongo/s/d_split.cpp
index b30aeab2c74..e745899eaf3 100644
--- a/src/mongo/s/d_split.cpp
+++ b/src/mongo/s/d_split.cpp
@@ -594,13 +594,13 @@ namespace mongo {
collLock.setLockMessage(str::stream() << "splitting chunk [" << minKey << ", " << maxKey
<< ") in " << ns);
- if (!collLock.tryAcquire(&errmsg)) {
-
+ Status acquisitionStatus = collLock.tryAcquire();
+ if (!acquisitionStatus.isOK()) {
errmsg = str::stream() << "could not acquire collection lock for " << ns
<< " to split chunk [" << minKey << "," << maxKey << ")"
- << causedBy(errmsg);
+ << causedBy(acquisitionStatus);
- warning() << errmsg;
+ warning() << errmsg << endl;
return false;
}
diff --git a/src/mongo/s/distlock.cpp b/src/mongo/s/distlock.cpp
index b4c1c11f541..a3eba8c065a 100644
--- a/src/mongo/s/distlock.cpp
+++ b/src/mongo/s/distlock.cpp
@@ -293,13 +293,11 @@ namespace mongo {
if ( _seen.count( s ) > 0 ) return s;
// Check our clock skew
- try {
- if( lock.isRemoteTimeSkewed() ) {
- throw LockException( str::stream() << "clock skew of the cluster " << conn.toString() << " is too far out of bounds to allow distributed locking." , 13650 );
- }
- }
- catch( LockException& e) {
- throw LockException( str::stream() << "error checking clock skew of cluster " << conn.toString() << causedBy( e ) , 13651);
+ if (lock.isRemoteTimeSkewed()) {
+ throw LockException(
+ str::stream() << "clock skew of the cluster " << conn.toString()
+ << " is too far out of bounds to allow distributed locking.",
+ ErrorCodes::DistributedClockSkewed);
}
boost::thread t( stdx::bind( &DistributedLockPinger::distLockPingThread, this, conn, getJSTimeVirtualThreadSkew(), processId, sleepTime) );
@@ -539,7 +537,7 @@ namespace mongo {
}
- bool DistributedLock::isLockHeld( double timeout, string* errMsg ) {
+ Status DistributedLock::checkStatus(double timeout) {
BSONObj lockObj;
try {
@@ -549,37 +547,35 @@ namespace mongo {
conn.done();
}
catch ( DBException& e ) {
- *errMsg = str::stream() << "error checking whether lock " << _name << " is held "
- << causedBy( e );
- return false;
+ return e.toStatus();
}
if ( lockObj.isEmpty() ) {
- *errMsg = str::stream() << "no lock for " << _name << " exists in the locks collection";
- return false;
+ return Status(ErrorCodes::LockFailed,
+ str::stream() << "no lock for " << _name << " exists in the locks collection");
}
if ( lockObj[LocksType::state()].numberInt() < 2 ) {
- *errMsg = str::stream() << "lock " << _name << " current state is not held ("
- << lockObj[LocksType::state()].numberInt() << ")";
- return false;
+ return Status(ErrorCodes::LockFailed,
+ str::stream() << "lock " << _name << " current state is not held ("
+ << lockObj[LocksType::state()].numberInt() << ")");
}
if ( lockObj[LocksType::process()].String() != _processId ) {
- *errMsg = str::stream() << "lock " << _name << " is currently being held by "
- << "another process ("
- << lockObj[LocksType::process()].String() << ")";
- return false;
+ return Status(ErrorCodes::LockFailed,
+ str::stream() << "lock " << _name << " is currently being held by "
+ << "another process ("
+ << lockObj[LocksType::process()].String() << ")");
}
if ( distLockPinger.willUnlockOID( lockObj[LocksType::lockID()].OID() ) ) {
- *errMsg = str::stream() << "lock " << _name << " is not held and is currently being "
- << "scheduled for lazy unlock by "
- << lockObj[LocksType::lockID()].OID();
- return false;
+ return Status(ErrorCodes::LockFailed,
+ str::stream() << "lock " << _name << " is not held and is currently being "
+ << "scheduled for lazy unlock by "
+ << lockObj[LocksType::lockID()].OID());
}
- return true;
+ return Status::OK();
}
static void logErrMsgOrWarn(const StringData& messagePrefix,
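
The rewritten DistributedLock::checkStatus above is a ladder of early returns: each failed check produces its own Status at once, and only a lock document that passes every check falls through to Status::OK(). The shape of that ladder, reduced to a stand-alone sketch (fetching the document from the config servers is omitted):

#include <iostream>
#include <string>

struct Status {
    int code;                 // 0 = OK, 107 = LockFailed
    std::string reason;
    bool isOK() const { return code == 0; }
    static Status OK() { return {0, ""}; }
};

// Fields a lock document would carry.
struct LockDoc {
    bool exists;
    int state;                // >= 2 means held
    std::string process;      // id of the owning process
};

// Each check fails fast with its own Status; success falls through to OK.
Status checkStatusSketch(const LockDoc& doc, const std::string& myProcessId) {
    if (!doc.exists)
        return {107, "no lock exists in the locks collection"};
    if (doc.state < 2)
        return {107, "lock current state is not held"};
    if (doc.process != myProcessId)
        return {107, "lock is currently being held by another process"};
    return Status::OK();
}

int main() {
    LockDoc doc{true, 1, "proc-A"};
    std::cout << checkStatusSketch(doc, "proc-A").reason << "\n";  // state check fails
}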
@@ -757,7 +753,12 @@ namespace mongo {
// and after the lock times out, we can be pretty sure the time is
// increasing at the same rate on all servers and therefore our
// timeout is accurate
- uassert( 14023, str::stream() << "remote time in cluster " << _conn.toString() << " is now skewed, cannot force lock.", !isRemoteTimeSkewed() );
+ if (isRemoteTimeSkewed()) {
+ throw LockException(
+ str::stream() << "remote time in cluster " << _conn.toString()
+ << " is now skewed, cannot force lock.",
+ ErrorCodes::DistributedClockSkewed);
+ }
// Make sure we break the lock with the correct "ts" (OID) value, otherwise
// we can overwrite a new lock inserted in the meantime.
@@ -783,6 +784,10 @@ namespace mongo {
// are required for a lock to be held.
warning() << "lock forcing " << lockName << " inconsistent" << endl;
}
+ catch (const LockException& ) {
+ // Let the exception go up and don't repackage the exception.
+ throw;
+ }
catch( std::exception& e ) {
conn.done();
throw LockException( str::stream() << "exception forcing distributed lock "
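
The new catch clause above exists so a typed LockException thrown deeper in the force path propagates as-is instead of being swallowed by the generic std::exception handler and re-wrapped into a vaguer error. The ordering rule, sketched generically (with a stand-in exception type, not the real LockException):

#include <iostream>
#include <stdexcept>
#include <string>

// Stand-in for LockException; the real one also carries an error code.
struct LockError : std::runtime_error {
    using std::runtime_error::runtime_error;
};

void breakLock() {
    // A typed failure from deeper in the force path.
    throw LockError("remote time in cluster is now skewed, cannot force lock.");
}

void forceLock() {
    try {
        breakLock();
    }
    catch (const LockError&) {
        // Already the specific type: let it go up unchanged rather than
        // repackaging it and losing the original code and message.
        throw;
    }
    catch (const std::exception& e) {
        // Anything else gets wrapped with context about what we were doing.
        throw LockError(std::string("exception forcing distributed lock")
                        + " :: caused by :: " + e.what());
    }
}

int main() {
    try { forceLock(); }
    catch (const LockError& e) { std::cout << e.what() << "\n"; }
}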
@@ -1164,7 +1169,7 @@ namespace mongo {
}
}
- bool ScopedDistributedLock::tryAcquire(string* errMsg) {
+ Status ScopedDistributedLock::tryAcquire() {
try {
_acquired = _lock.lock_try(_why,
false,
@@ -1172,45 +1177,45 @@ namespace mongo {
static_cast<double>(_socketTimeoutMillis / 1000));
}
catch (const DBException& e) {
- *errMsg = str::stream() << "error acquiring distributed lock " << _lock._name << " for "
- << _why << causedBy(e);
-
- return false;
+ return e.toStatus();
+ }
+
+ if (_acquired) {
+ return Status::OK();
}
- return _acquired;
+ return Status(ErrorCodes::LockBusy, str::stream() << "Lock for " << _why << " is taken.");
}
void ScopedDistributedLock::unlock() {
_lock.unlock(&_other);
}
- bool ScopedDistributedLock::acquire(long long waitForMillis, string* errMsg) {
-
- string dummy;
- if (!errMsg) errMsg = &dummy;
-
+ Status ScopedDistributedLock::acquire(long long waitForMillis) {
Timer timer;
Timer msgTimer;
- while (!_acquired && (waitForMillis <= 0 || timer.millis() < waitForMillis)) {
-
- string acquireErrMsg;
- _acquired = tryAcquire(&acquireErrMsg);
-
- if (_acquired) break;
+ Status lastStatus = Status::OK();
+ while (waitForMillis <= 0 || timer.millis() < waitForMillis) {
+ lastStatus = tryAcquire();
+ _acquired = lastStatus.isOK();
- // Set our error message to the last error, in case we break with !_acquired
- *errMsg = acquireErrMsg;
+ if (_acquired) {
+ verify(!_other.isEmpty());
+ return Status::OK();
+ }
if (waitForMillis == 0) break;
+ if (lastStatus.code() == ErrorCodes::DistributedClockSkewed) {
+ return lastStatus;
+ }
+
// Periodically message for debugging reasons
if (msgTimer.seconds() > 10) {
log() << "waited " << timer.seconds() << "s for distributed lock " << _lock._name
- << " for " << _why << endl;
+ << " for " << _why << ": " << lastStatus.toString() << endl;
msgTimer.reset();
}
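
The reworked acquire loop keeps the last failure around so a timeout still reports a concrete reason, bails out early on the one error retrying cannot fix (clock skew), and logs periodically while waiting. A compressed, stand-alone version of that control flow (timing and sleeping simplified; the hypothetical tryAcquireOnce fails twice, then succeeds):

#include <chrono>
#include <iostream>
#include <string>
#include <thread>

struct Status {
    int code;                 // 0 = OK, 106 = DistributedClockSkewed, 107 = LockFailed
    std::string reason;
    bool isOK() const { return code == 0; }
};

// Hypothetical single attempt: fails twice, then succeeds.
Status tryAcquireOnce() {
    static int attempts = 0;
    if (++attempts < 3) return {107, "Lock for sketch is taken."};
    return {0, ""};
}

Status acquire(long long waitForMillis, long long tryIntervalMillis) {
    using clock = std::chrono::steady_clock;
    const auto deadline = clock::now() + std::chrono::milliseconds(waitForMillis);

    Status lastStatus{0, ""};
    while (waitForMillis <= 0 || clock::now() < deadline) {
        lastStatus = tryAcquireOnce();
        if (lastStatus.isOK()) return lastStatus;       // acquired
        if (waitForMillis == 0) break;                  // single-shot mode: no waiting
        if (lastStatus.code == 106) return lastStatus;  // clock skew: retrying cannot help
        std::this_thread::sleep_for(std::chrono::milliseconds(tryIntervalMillis));
    }
    return lastStatus;  // the last failure doubles as the timeout reason
}

int main() {
    Status s = acquire(5000, 10);
    std::cout << (s.isOK() ? std::string("acquired") : "failed: " + s.reason) << "\n";
}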
@@ -1219,29 +1224,15 @@ namespace mongo {
sleepmillis(std::min(_lockTryIntervalMillis, timeRemainingMillis));
}
- if (_acquired) {
- verify(!_other.isEmpty());
- return true;
- }
-
- *errMsg = str::stream() << "could not acquire distributed lock " << _lock._name << " for "
- << _why << " after " << timer.seconds()
- << "s, other lock may be held: " << _other << causedBy(errMsg);
-
- return false;
+ return lastStatus;
}
- /**
- * Returns false if the lock is known _not_ to be held, otherwise asks the underlying
- * lock to issue a 'isLockHeld' call and returns whatever that calls does.
- */
- bool ScopedDistributedLock::verifyLockHeld(std::string* errMsg) {
+ Status ScopedDistributedLock::checkStatus() {
if (!_acquired) {
- *errMsg = "lock was never acquired";
- return false;
+ return Status(ErrorCodes::LockFailed, "lock was never acquired");
}
- return _lock.isLockHeld(static_cast<double>(_socketTimeoutMillis / 1000), errMsg);
+ return _lock.checkStatus(static_cast<double>(_socketTimeoutMillis / 1000));
}
}
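
Taken together, callers of the reworked class now follow one shape: set a message, acquire with a timeout, branch on the returned Status, and let scope exit release the lock. A hypothetical end-to-end miniature of that usage, mirroring the call sites patched above rather than the real class:

#include <iostream>
#include <string>
#include <utility>

struct Status {
    int code;
    std::string reason;
    bool isOK() const { return code == 0; }
};

// Hypothetical miniature of ScopedDistributedLock's post-change surface.
class ScopedLockSketch {
public:
    explicit ScopedLockSketch(std::string name) : _name(std::move(name)) {}
    ~ScopedLockSketch() { if (_acquired) unlock(); }  // released on scope exit

    void setLockMessage(std::string why) { _why = std::move(why); }

    Status acquire(long long waitForMillis) {
        (void)waitForMillis;  // a real implementation would retry until this deadline
        _acquired = true;     // ... after contacting the config servers
        return {0, ""};
    }

    void unlock() { _acquired = false; }

private:
    std::string _name, _why;
    bool _acquired = false;
};

int main() {
    ScopedLockSketch lock("authorizationData");
    lock.setLockMessage("updating user roles");

    Status acquisitionStatus = lock.acquire(30 * 1000);
    if (!acquisitionStatus.isOK()) {
        std::cout << "could not acquire lock: " << acquisitionStatus.reason << "\n";
        return 1;
    }
    // ... work protected by the lock runs here ...
}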
diff --git a/src/mongo/s/distlock.h b/src/mongo/s/distlock.h
index 32b08719efb..1dddd495efe 100644
--- a/src/mongo/s/distlock.h
+++ b/src/mongo/s/distlock.h
@@ -166,12 +166,11 @@ namespace mongo {
bool lock_try( const std::string& why , bool reenter = false, BSONObj * other = 0, double timeout = 0.0 );
/**
- * Returns true if we currently believe we hold this lock and it was possible to
- * confirm that, within 'timeout' seconds, if provided, with the config servers. If the
- * lock is not held or if we failed to contact the config servers within the timeout,
- * returns false.
+ * Returns OK if this lock is held (though not necessarily by this process) and
+ * that could be confirmed with the config servers within 'timeout' seconds,
+ * if provided.
*/
- bool isLockHeld( double timeout, std::string* errMsg );
+ Status checkStatus(double timeout);
/**
* Releases a previously taken lock.
@@ -243,37 +242,39 @@ namespace mongo {
class MONGO_CLIENT_API dist_lock_try {
public:
- dist_lock_try() : _lock(NULL), _got(false) {}
+ dist_lock_try() : _lock(NULL), _got(false) {}
- dist_lock_try( const dist_lock_try& that ) : _lock(that._lock), _got(that._got), _other(that._other) {
- _other.getOwned();
+ dist_lock_try( const dist_lock_try& that ) :
+ _lock(that._lock), _got(that._got), _other(that._other) {
- // Make sure the lock ownership passes to this object,
- // so we only unlock once.
- ((dist_lock_try&) that)._got = false;
- ((dist_lock_try&) that)._lock = NULL;
- ((dist_lock_try&) that)._other = BSONObj();
- }
+ _other.getOwned();
- // Needed so we can handle lock exceptions in context of lock try.
- dist_lock_try& operator=( const dist_lock_try& that ){
+ // Make sure the lock ownership passes to this object,
+ // so we only unlock once.
+ ((dist_lock_try&) that)._got = false;
+ ((dist_lock_try&) that)._lock = NULL;
+ ((dist_lock_try&) that)._other = BSONObj();
+ }
+
+ // Needed so we can handle lock exceptions in context of lock try.
+ dist_lock_try& operator=( const dist_lock_try& that ){
- if( this == &that ) return *this;
+ if( this == &that ) return *this;
- _lock = that._lock;
- _got = that._got;
- _other = that._other;
- _other.getOwned();
- _why = that._why;
+ _lock = that._lock;
+ _got = that._got;
+ _other = that._other;
+ _other.getOwned();
+ _why = that._why;
- // Make sure the lock ownership passes to this object,
- // so we only unlock once.
- ((dist_lock_try&) that)._got = false;
- ((dist_lock_try&) that)._lock = NULL;
- ((dist_lock_try&) that)._other = BSONObj();
+ // Make sure the lock ownership passes to this object,
+ // so we only unlock once.
+ ((dist_lock_try&) that)._got = false;
+ ((dist_lock_try&) that)._lock = NULL;
+ ((dist_lock_try&) that)._other = BSONObj();
- return *this;
- }
+ return *this;
+ }
dist_lock_try( DistributedLock * lock , const std::string& why, double timeout = 0.0 )
: _lock(lock), _why(why) {
@@ -288,22 +289,20 @@ namespace mongo {
}
/**
- * Returns false if the lock is known _not_ to be held, otherwise asks the underlying
- * lock to issue a 'isLockHeld' call and returns whatever that calls does.
+ * Returns a non-OK status if the lock is known _not_ to be held; otherwise
+ * forwards to the underlying lock's 'checkStatus' and returns its result.
*/
- bool isLockHeld( double timeout, std::string* errMsg) {
+ Status checkStatus(double timeout) {
if ( !_lock ) {
- *errMsg = "Lock is not currently set up";
- return false;
+ return Status(ErrorCodes::LockFailed, "Lock is not currently set up");
}
if ( !_got ) {
- *errMsg = str::stream() << "Lock " << _lock->_name << " is currently held by "
- << _other;
- return false;
+ return Status(ErrorCodes::LockFailed,
+ str::stream() << "Lock " << _lock->_name << " is currently held by "
+ << _other);
}
- return _lock->isLockHeld( timeout, errMsg );
+ return _lock->checkStatus(timeout);
}
bool got() const { return _got; }
@@ -332,10 +331,13 @@ namespace mongo {
~ScopedDistributedLock();
/**
- * Tries once to obtain a lock, and can fail with an error message.
- * Returns true if the lock was successfully acquired.
+ * Tries to obtain the lock once.
+ *
+ * Returns OK if the lock was successfully acquired.
+ * Returns ErrorCodes::DistributedClockSkewed when clock skew is detected.
+ * Returns ErrorCodes::LockBusy if the lock is already held.
*/
- bool tryAcquire(std::string* errMsg);
+ Status tryAcquire();
/**
* Tries to unlock the lock if acquired. Cannot report an error or block indefinitely
@@ -345,20 +347,23 @@ namespace mongo {
/**
* Tries multiple times to acquire the lock, using the specified lock try interval, until
- * a certain amount of time has passed. An error message is immediately returned if the
- * lock acquisition attempt fails with an error message.
+ * a certain amount of time has passed.
+ *
* waitForMillis = 0 indicates there should only be one attempt to acquire the lock, and
* no waiting.
* waitForMillis = -1 indicates we should retry indefinitely.
- * @return true if the lock was acquired
+ *
+ * Returns OK if the lock was successfully acquired.
+ * Returns ErrorCodes::DistributedClockSkewed when clock skew is detected.
+ * Returns ErrorCodes::LockBusy if the lock is already held.
*/
- bool acquire(long long waitForMillis, std::string* errMsg);
+ Status acquire(long long waitForMillis);
/**
* If lock is held, remotely verifies that the lock has not been forced as a sanity check.
- * If the lock is not held or cannot be verified, returns false with errMsg.
+ * If the lock is not held or cannot be verified, returns a non-OK status.
*/
- bool verifyLockHeld(std::string* errMsg);
+ Status checkStatus();
bool isAcquired() const {
return _acquired;