From 4d957329dc47c05a0edfd6b85f8170508aa3321f Mon Sep 17 00:00:00 2001 From: Greg Studer Date: Fri, 9 Aug 2013 15:52:05 -0400 Subject: SERVER-10458 sanity check before critical section that all cloned docs sent --- src/mongo/s/d_migrate.cpp | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/mongo/s/d_migrate.cpp b/src/mongo/s/d_migrate.cpp index ec528226b27..85eb18294dd 100644 --- a/src/mongo/s/d_migrate.cpp +++ b/src/mongo/s/d_migrate.cpp @@ -639,6 +639,11 @@ namespace mongo { _cloneLocs.erase( dl ); } + std::size_t cloneLocsRemaining() { + scoped_spinlock lk( _trackerLocks ); + return _cloneLocs.size(); + } + long long mbUsed() const { return _memoryUsed / ( 1024 * 1024 ); } bool getInCriticalSection() const { @@ -1162,15 +1167,20 @@ namespace mongo { timing.done( 3 ); // 4. + + // Track last result from TO shard for sanity check + BSONObj res; for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day verify( !Lock::isLocked() ); // Exponential sleep backoff, up to 1024ms. Don't sleep much on the first few // iterations, since we want empty chunk migrations to be fast. sleepmillis( 1 << std::min( i , 10 ) ); + scoped_ptr<ScopedDbConnection> conn( ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) ); - BSONObj res; + bool ok; + res = BSONObj(); try { ok = conn->get()->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res ); res = res.getOwned(); @@ -1218,9 +1228,26 @@ namespace mongo { // 5. // Before we get into the critical section of the migration, let's double check - // that the config servers are reachable and the lock is in place. - log() << "About to check if it is safe to enter critical section"; + // that the docs have been cloned, the config servers are reachable, + // and the lock is in place. 
+ log() << "About to check if it is safe to enter critical section" << endl; + + // Ensure all cloned docs have actually been transferred + std::size_t locsRemaining = migrateFromStatus.cloneLocsRemaining(); + if ( locsRemaining != 0 ) { + + errmsg = + str::stream() << "moveChunk cannot enter critical section before all data is" + << " cloned, " << locsRemaining << " locs were not transferred" + << " but to-shard reported " << res; + + // Should never happen, but safe to abort before critical section + error() << errmsg << migrateLog; + dassert( false ); + return false; + } + // Ensure distributed lock still held string lockHeldMsg; bool lockHeld = dlk.isLockHeld( 30.0 /* timeout */, &lockHeldMsg ); if ( !lockHeld ) { -- cgit v1.2.1