From 8d4850ae65b5ca314a540090bd94714cf327b71b Mon Sep 17 00:00:00 2001 From: Eliot Horowitz Date: Thu, 17 Feb 2011 19:59:33 -0500 Subject: detect clock skew in DistributedLock SERVER-2584 --- client/distlock.cpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'client') diff --git a/client/distlock.cpp b/client/distlock.cpp index 307ae971f73..9ec98eaed18 100644 --- a/client/distlock.cpp +++ b/client/distlock.cpp @@ -182,7 +182,7 @@ namespace mongo { { // make sure its there so we can use simple update logic below - BSONObj o = conn->findOne( _ns , _id ); + BSONObj o = conn->findOne( _ns , _id ).getOwned(); if ( o.isEmpty() ) { try { log(4) << "dist_lock inserting initial doc in " << _ns << " for lock " << _name << endl; @@ -192,32 +192,41 @@ namespace mongo { log() << "dist_lock could not insert initial doc: " << e << endl; } } - + else if ( o["state"].numberInt() > 0 ) { BSONObj lastPing = conn->findOne( lockPingNS , o["process"].wrap( "_id" ) ); if ( lastPing.isEmpty() ) { // if a lock is taken but there's no ping for it, we're in an inconsistent situation // if the lock holder (mongos or d) does not exist anymore, the lock could safely be removed // but we'd require analysis of the situation before a manual intervention - log(LL_ERROR) << "config.locks: " << _name << " lock is taken by old process? " - << "remove the following lock if the process is not active anymore: " << o << endl; + error() << "config.locks: " << _name << " lock is taken by old process? " + << "remove the following lock if the process is not active anymore: " << o << endl; *other = o; - other->getOwned(); conn.done(); return false; } - unsigned long long elapsed = jsTime() - lastPing["ping"].Date(); // in ms + unsigned long long now = jsTime(); + unsigned long long pingTime = lastPing["ping"].Date(); + + if ( now < pingTime ) { + // clock skew + warning() << "dist_lock has detected clock skew of " << ( pingTime - now ) << "ms" << endl; + *other = o; + conn.done(); + return false; + } + + unsigned long long elapsed = now - pingTime; elapsed = elapsed / ( 1000 * 60 ); // convert to minutes if ( elapsed > ( 60 * 24 * 365 * 100 ) /* 100 years */ ) { warning() << "distlock elapsed time seems impossible: " << lastPing << endl; } - + if ( elapsed <= _takeoverMinutes ) { log(1) << "dist_lock lock failed because taken by: " << o << " elapsed minutes: " << elapsed << endl; *other = o; - other->getOwned(); conn.done(); return false; } @@ -226,8 +235,8 @@ namespace mongo { conn->update( _ns , _id , BSON( "$set" << BSON( "state" << 0 ) ) ); string err = conn->getLastError(); if ( ! err.empty() ) { - log( LL_WARNING ) << "dist_lock take over from: " << o << " failed: " << err << endl; - *other = o; + warning() << "dist_lock take over from: " << o << " failed: " << err << endl; + *other = o.getOwned(); other->getOwned(); conn.done(); return false; -- cgit v1.2.1