author     Greg Studer <greg@10gen.com>    2013-06-12 11:52:56 -0400
committer  Dan Pasette <dan@10gen.com>     2013-06-19 19:41:13 -0400
commit     42980b6c626bcf130cc72bc441285fd8bb0f17f3 (patch)
tree       e2978322427dfe71ba448f09c681ad8c7476b14c
parent     af0b49cbb0ca6888d6f133081b7d41464bf4135f (diff)
download   mongo-42980b6c626bcf130cc72bc441285fd8bb0f17f3.tar.gz
SERVER-9909 balancer reload idle sharded coll data, and don't abort round
Conflicts:
src/mongo/s/grid.cpp
-rw-r--r--  jstests/sharding/fair_balancer_round.js |  39
-rw-r--r--  src/mongo/s/balance.cpp                 | 109
-rw-r--r--  src/mongo/s/grid.cpp                    |   6
3 files changed, 110 insertions, 44 deletions
diff --git a/jstests/sharding/fair_balancer_round.js b/jstests/sharding/fair_balancer_round.js
new file mode 100644
index 00000000000..0f1631b80e7
--- /dev/null
+++ b/jstests/sharding/fair_balancer_round.js
@@ -0,0 +1,39 @@
+//
+// Tests that a balancer round loads newly sharded collection data
+//
+
+var options = {separateConfig : true, mongosOptions : {verbose : 1}};
+
+var st = new ShardingTest({shards : 2, mongos : 2, other : options});
+
+// Stop balancer initially
+st.stopBalancer();
+
+var mongos = st.s0;
+var staleMongos = st.s1;
+var coll = mongos.getCollection("foo.bar");
+
+// Stop first mongos from balancing
+assert(mongos.adminCommand({configureFailPoint : "neverBalance", mode : "alwaysOn"}).ok);
+
+// Shard collection through first mongos
+assert(mongos.adminCommand({enableSharding : coll.getDB() + ""}).ok);
+assert(mongos.adminCommand({shardCollection : coll + "", key : {_id : 1}}).ok);
+
+// Create a bunch of chunks
+var numSplits = 20;
+for ( var i = 0; i < numSplits; i++) {
+    assert(mongos.adminCommand({split : coll + "", middle : {_id : i}}).ok);
+}
+
+// Start balancer, which lets the stale mongos balance
+st.startBalancer();
+
+// Make sure we eventually start moving chunks
+assert.soon(function() {
+    return mongos.getCollection("config.changelog").count({what : /moveChunk/}) > 0;
+}, "no balance happened", 5 * 60 * 1000);
+
+jsTest.log("DONE!");
+
+st.stop();
diff --git a/src/mongo/s/balance.cpp b/src/mongo/s/balance.cpp
index 13f764f608d..de72b25dada 100644
--- a/src/mongo/s/balance.cpp
+++ b/src/mongo/s/balance.cpp
@@ -52,58 +52,70 @@ namespace mongo {
         for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
             const CandidateChunk& chunkInfo = *it->get();
 
-            DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
-            verify( cfg );
+            // Changes to metadata, borked metadata, and connectivity problems should cause us to
+            // abort this chunk move, but shouldn't cause us to abort the entire round of chunks.
+            // TODO: Handle all these things more cleanly, since they're expected problems
+            try {
 
-            ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
-            verify( cm );
+                DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
+                verify( cfg );
 
-            ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
-            if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
-                // likely a split happened somewhere
-                cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
+                // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
+                // tried to do so once.
+                ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
                 verify( cm );
 
-                c = cm->findIntersectingChunk( chunkInfo.chunk.min );
+                ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
                 if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
-                    log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
+                    // likely a split happened somewhere
+                    cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
+                    verify( cm );
+
+                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );
+                    if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
+                        log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
+                        continue;
+                    }
+                }
+
+                BSONObj res;
+                if (c->moveAndCommit(Shard::make(chunkInfo.to),
+                                     Chunk::MaxChunkSize,
+                                     secondaryThrottle,
+                                     waitForDelete,
+                                     res)) {
+                    movedCount++;
                     continue;
                 }
-            }
 
-            BSONObj res;
-            if (c->moveAndCommit(Shard::make(chunkInfo.to),
-                                 Chunk::MaxChunkSize,
-                                 secondaryThrottle,
-                                 waitForDelete,
-                                 res)) {
-                movedCount++;
-                continue;
-            }
+                // the move requires acquiring the collection metadata's lock, which can fail
+                log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
+                      << " chunk: " << chunkInfo.chunk << endl;
 
-            // the move requires acquiring the collection metadata's lock, which can fail
-            log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
-                  << " chunk: " << chunkInfo.chunk << endl;
+                if ( res["chunkTooBig"].trueValue() ) {
+                    // reload just to be safe
+                    cm = cfg->getChunkManager( chunkInfo.ns );
+                    verify( cm );
+                    c = cm->findIntersectingChunk( chunkInfo.chunk.min );
 
-            if ( res["chunkTooBig"].trueValue() ) {
-                // reload just to be safe
-                cm = cfg->getChunkManager( chunkInfo.ns );
-                verify( cm );
-                c = cm->findIntersectingChunk( chunkInfo.chunk.min );
-
-                log() << "forcing a split because migrate failed for size reasons" << endl;
-
-                res = BSONObj();
-                c->singleSplit( true , res );
-                log() << "forced split results: " << res << endl;
-
-                if ( ! res["ok"].trueValue() ) {
-                    log() << "marking chunk as jumbo: " << c->toString() << endl;
-                    c->markAsJumbo();
-                    // we increment moveCount so we do another round right away
-                    movedCount++;
-                }
+                    log() << "forcing a split because migrate failed for size reasons" << endl;
+
+                    res = BSONObj();
+                    c->singleSplit( true , res );
+                    log() << "forced split results: " << res << endl;
+                    if ( ! res["ok"].trueValue() ) {
+                        log() << "marking chunk as jumbo: " << c->toString() << endl;
+                        c->markAsJumbo();
+                        // we increment moveCount so we do another round right away
+                        movedCount++;
+                    }
+
+                }
+            }
+            catch( const DBException& ex ) {
+                warning() << "could not move chunk " << chunkInfo.chunk.toString()
+                          << ", continuing balancing round" << causedBy( ex ) << endl;
             }
         }
@@ -295,9 +307,18 @@ namespace mongo {
             cursor.reset();
 
             DBConfigPtr cfg = grid.getDBConfig( ns );
-            verify( cfg );
-            ChunkManagerPtr cm = cfg->getChunkManager( ns );
-            verify( cm );
+            if ( !cfg ) {
+                warning() << "could not load db config to balance " << ns << " collection" << endl;
+                continue;
+            }
+
+            // This line reloads the chunk manager once if this process doesn't know the collection
+            // is sharded yet.
+            ChunkManagerPtr cm = cfg->getChunkManagerIfExists( ns, true );
+            if ( !cm ) {
+                warning() << "could not load chunks to balance " << ns << " collection" << endl;
+                continue;
+            }
 
             // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
             bool didAnySplits = false;
diff --git a/src/mongo/s/grid.cpp b/src/mongo/s/grid.cpp
index ef6046c558b..4e48113c05f 100644
--- a/src/mongo/s/grid.cpp
+++ b/src/mongo/s/grid.cpp
@@ -30,11 +30,14 @@
 #include "mongo/s/type_database.h"
 #include "mongo/s/type_settings.h"
 #include "mongo/s/type_shard.h"
+#include "mongo/util/fail_point_service.h"
 #include "mongo/util/startup_test.h"
 #include "mongo/util/stringutils.h"
 
 namespace mongo {
 
+    MONGO_FP_DECLARE(neverBalance);
+
     DBConfigPtr Grid::getDBConfig( string database , bool create , const string& shardNameHint ) {
         {
             string::size_type i = database.find( "." );
@@ -471,6 +474,9 @@ namespace mongo {
      */
     bool Grid::shouldBalance( const string& ns, BSONObj* balancerDocOut ) const {
 
+        // Allow disabling the balancer for testing
+        if ( MONGO_FAIL_POINT(neverBalance) ) return false;
+
         scoped_ptr<ScopedDbConnection> conn(
                 ScopedDbConnection::getInternalScopedDbConnection(
                         configServer.getPrimary().getConnString(), 30));
 
         BSONObj balancerDoc;
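
Usage note: the new neverBalance fail point in grid.cpp is driven through the generic
configureFailPoint admin command, exactly as fair_balancer_round.js does. A minimal shell
sketch follows; the "off" mode used to re-enable balancing is the standard fail point
command form and is an assumption here, since the test itself only ever turns the fail
point on:

    // Keep this mongos from starting balancer rounds (mirrors fair_balancer_round.js;
    // "mongos" is a shell connection to the mongos process)
    assert(mongos.adminCommand({configureFailPoint : "neverBalance", mode : "alwaysOn"}).ok);

    // Re-enable balancing rounds on this mongos (assumed standard "off" mode)
    assert(mongos.adminCommand({configureFailPoint : "neverBalance", mode : "off"}).ok);

    // Verify migrations are happening, using the same config.changelog signal
    // the test's assert.soon polls for
    assert(mongos.getCollection("config.changelog").count({what : /moveChunk/}) > 0);

Because MONGO_FAIL_POINT(neverBalance) is checked inside Grid::shouldBalance, the fail
point only suppresses balancing on the mongos where it is set; other mongos processes
(the "stale" one in the test) keep balancing, which is what exercises the new reload
path in _doBalanceRound.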