diff options
author | Randolph Tan <randolph@10gen.com> | 2014-10-27 14:53:04 -0400 |
---|---|---|
committer | Randolph Tan <randolph@10gen.com> | 2014-10-29 13:03:38 -0400 |
commit | fbbb0d2a1d845728cd714272199a652573e2f27d (patch) | |
tree | ec2286f7c8e93582a7306c059fbe4c5261cbb03a /src/mongo/s | |
parent | 27c5131ff96d66ecc7ac8f7eee0ba72a7aa3c890 (diff) | |
download | mongo-fbbb0d2a1d845728cd714272199a652573e2f27d.tar.gz |
SERVER-15593 Initial autosplit heuristics are very aggressive when config servers are down
Diffstat (limited to 'src/mongo/s')
-rw-r--r-- | src/mongo/s/chunk.cpp | 10 | ||||
-rw-r--r-- | src/mongo/s/commands_admin.cpp | 2 | ||||
-rw-r--r-- | src/mongo/s/config.cpp | 37 | ||||
-rw-r--r-- | src/mongo/s/config.h | 12 |
4 files changed, 48 insertions, 13 deletions
```diff
diff --git a/src/mongo/s/chunk.cpp b/src/mongo/s/chunk.cpp
index a8b70aa162a..addc11b79a4 100644
--- a/src/mongo/s/chunk.cpp
+++ b/src/mongo/s/chunk.cpp
@@ -538,6 +538,14 @@ namespace mongo {
 }
 TicketHolderReleaser releaser( &(getManager()->_splitHeuristics._splitTickets) );
+ if (!configServer.allUp(true)) {
+ LOG(1) << "not performing auto-split because not all config servers are up";
+
+ // Back off indirectly by resetting _dataWritten.
+ _dataWritten = 0;
+ return false;
+ }
+
 // this is a bit ugly
 // we need it so that mongos blocks for the writes to actually be committed
 // this does mean mongos has more back pressure than mongod alone
@@ -1354,7 +1362,7 @@ namespace mongo {
 uassert( 13331 , "collection's metadata is undergoing changes. Please try again." , dlk.got() );
- uassert( 10174 , "config servers not all up" , configServer.allUp() );
+ uassert(10174, "config servers not all up", configServer.allUp(false));
 set<Shard> seen;
diff --git a/src/mongo/s/commands_admin.cpp b/src/mongo/s/commands_admin.cpp
index 7c978405f61..f899eb66224 100644
--- a/src/mongo/s/commands_admin.cpp
+++ b/src/mongo/s/commands_admin.cpp
@@ -96,7 +96,7 @@ namespace mongo {
 bool okForConfigChanges( string& errmsg ) {
 string e;
- if ( ! configServer.allUp(e) ) {
+ if (!configServer.allUp(false, e)) {
 errmsg = str::stream() << "not all config servers are up: " << e;
 return false;
 }
diff --git a/src/mongo/s/config.cpp b/src/mongo/s/config.cpp
index 078a5de0bb1..e9ba0d91b60 100644
--- a/src/mongo/s/config.cpp
+++ b/src/mongo/s/config.cpp
@@ -198,7 +198,9 @@ namespace mongo {
 vector<Shard>* initShards) {
 uassert( 8042 , "db doesn't have sharding enabled" , _shardingEnabled );
- uassert( 13648 , str::stream() << "can't shard collection because not all config servers are up" , configServer.allUp() );
+ uassert(13648,
+ str::stream() << "can't shard collection because not all config servers are up",
+ configServer.allUp(false));
 ChunkManagerPtr manager;
@@ -648,7 +650,7 @@ namespace mongo {
 configServer.logChange( "dropDatabase.start" , _name , BSONObj() );
 // 1
- if ( ! configServer.allUp( errmsg ) ) {
+ if (!configServer.allUp(false, errmsg)) {
 LOG(1) << "\t DBConfig::dropDatabase not all up" << endl;
 return 0;
 }
@@ -667,7 +669,7 @@ namespace mongo {
 return false;
 }
- if ( ! configServer.allUp( errmsg ) ) {
+ if (!configServer.allUp(false, errmsg)) {
 log() << "error removing from config server even after checking!" << endl;
 return 0;
 }
@@ -995,21 +997,38 @@ namespace mongo {
 return true;
 }
- bool ConfigServer::allUp() {
+ bool ConfigServer::allUp(bool localCheckOnly) {
 string errmsg;
- return allUp( errmsg );
+ return allUp(localCheckOnly, errmsg);
 }
- bool ConfigServer::allUp( string& errmsg ) {
+ bool ConfigServer::allUp(bool localCheckOnly, string& errmsg) {
 try {
 ScopedDbConnection conn(_primary.getConnString(), 30.0);
+
+ // Note: SyncClusterConnection is different from normal connection types in
+ // that it can be instantiated even if all the config servers are down.
+ if (!conn->isStillConnected()) {
+ errmsg = str::stream() << "Not all config servers "
+ << _primary.toString() << " are reachable";
+ LOG(1) << errmsg;
+ return false;
+ }
+
+ if (localCheckOnly) {
+ return true;
+ }
+
+ // Note: For SyncClusterConnection, gle will only be sent to the first
+ // node, and it is not even guaranteed to be invoked.
 conn->getLastError();
 conn.done();
 return true;
 }
- catch ( DBException& ) {
- log() << "ConfigServer::allUp : " << _primary.toString() << " seems down!" << endl;
- errmsg = _primary.toString() + " seems down";
+ catch (const DBException& excep) {
+ errmsg = str::stream() << "Not all config servers "
+ << _primary.toString() << " are reachable"
+ << causedBy(excep);
 return false;
 }
diff --git a/src/mongo/s/config.h b/src/mongo/s/config.h
index fceb3f6b219..78810ed8653 100644
--- a/src/mongo/s/config.h
+++ b/src/mongo/s/config.h
@@ -240,8 +240,16 @@ namespace mongo {
 */
 bool checkHostsAreUnique( const std::vector<std::string>& configHosts, std::string* errmsg );
- bool allUp();
- bool allUp( std::string& errmsg );
+ /**
+ * Checks if all config servers are up.
+ *
+ * If localCheckOnly is true, only check if the socket is still open with no errors.
+ * Otherwise, also send a getLastError command with recv timeout.
+ *
+ * TODO: fix this - SERVER-15811
+ */
+ bool allUp(bool localCheckOnly);
+ bool allUp(bool localCheckOnly, std::string& errmsg);
 int dbConfigVersion();
 int dbConfigVersion( DBClientBase& conn );
```