diff options
author | Alberto Lerner <alerner@10gen.com> | 2010-12-13 16:11:05 -0500 |
---|---|---|
committer | Alberto Lerner <alerner@10gen.com> | 2010-12-13 16:11:16 -0500 |
commit | 22242889332ff65589ecea6262e33016b98cc8f2 (patch) | |
tree | 43a96815cf6ab1e9bec8f43f5a49b4256622387d /s/balance.cpp | |
parent | 98f5d5396a9eadb7e9ccb88875e8213fc0a51607 (diff) | |
download | mongo-22242889332ff65589ecea6262e33016b98cc8f2.tar.gz |
SERVER-2213 balancer tolerates servers being down at mongos startup
Diffstat (limited to 's/balance.cpp')
-rw-r--r-- | s/balance.cpp | 62 |
1 files changed, 40 insertions, 22 deletions
diff --git a/s/balance.cpp b/s/balance.cpp index 9d6391492fa..6e4a922b1e9 100644 --- a/s/balance.cpp +++ b/s/balance.cpp @@ -1,4 +1,4 @@ -// balance.cpp +//@file balance.cpp /** * Copyright (C) 2008 10gen Inc. @@ -81,19 +81,6 @@ namespace mongo { return movedCount; } - void Balancer::_ping(){ - assert( _myid.size() && _started ); - try { - ScopedDbConnection conn( configServer.getPrimary() ); - _ping( conn.conn() ); - conn.done(); - } - catch ( std::exception& e ){ - log() << "bare ping failed: " << e.what() << endl; - } - - } - void Balancer::_ping( DBClientBase& conn ){ WriteConcern w = conn.getWriteConcern(); conn.setWriteConcern( W_NONE ); @@ -219,22 +206,53 @@ namespace mongo { } } - void Balancer::run(){ + bool Balancer::_init() { + try { + + log() << "about to contact config servers and shards" << endl; + + // contact the config server and refresh shard information + // checks that each shard is indeed a different process (no hostname mixup) + // these checks are redundant in that they're redone at every new round but we want to do them initially here + // so to catch any problem soon + Shard::reloadShardInfo(); + _checkOIDs(); + + log() << "config servers and shards contacted successfully" << endl; - { // init stuff, don't want to do at static init StringBuilder buf; buf << getHostNameCached() << ":" << cmdLine.port; _myid = buf.str(); - log() << "balancer myid: " << _myid << endl; - _started = time(0); - Shard::reloadShardInfo(); + log() << "balancer id: " << _myid << " started at " << time_t_to_String_short(_started) << endl; + + return true; + + } catch ( std::exception& e ) { + + log( LL_WARNING ) << "could not initialize balancer, please check that all shards and config servers are up" << endl; + return false; + + } + } + + void Balancer::run(){ + + // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely + while ( ! inShutdown() ) { + + if ( ! 
_init() ) { + log() << "will retry to initialize balancer in one minute" << endl; + sleepsecs( 60 ); + continue; + } + + break; } - - _ping(); - _checkOIDs(); + // getConnectionString and the constructor of a DistributedLock do not throw, which is what we expect + // while on the balancer thread ConnectionString config = configServer.getConnectionString(); DistributedLock balanceLock( config , "balancer" ); |