author    Greg Studer <greg@10gen.com>  2013-06-12 11:52:56 -0400
committer Dan Pasette <dan@10gen.com>   2013-06-19 19:41:13 -0400
commit    42980b6c626bcf130cc72bc441285fd8bb0f17f3 (patch)
tree      e2978322427dfe71ba448f09c681ad8c7476b14c
parent    af0b49cbb0ca6888d6f133081b7d41464bf4135f (diff)
download  mongo-42980b6c626bcf130cc72bc441285fd8bb0f17f3.tar.gz
SERVER-9909 balancer reload idle sharded coll data, and don't abort round
Conflicts: src/mongo/s/grid.cpp
-rw-r--r--  jstests/sharding/fair_balancer_round.js |  39
-rw-r--r--  src/mongo/s/balance.cpp                 | 109
-rw-r--r--  src/mongo/s/grid.cpp                    |   6
3 files changed, 110 insertions(+), 44 deletions(-)
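
For anyone exercising the change by hand: the patch declares a neverBalance failpoint in src/mongo/s/grid.cpp, and the new test toggles it with the standard configureFailPoint admin command (which generally requires the server to be started with test commands enabled). A minimal shell sketch, assuming `mongos` is a connection to the router you want to keep out of balancing:

// Pin this mongos: its Grid::shouldBalance() will now always return false.
assert.commandWorked(mongos.adminCommand(
    {configureFailPoint : "neverBalance", mode : "alwaysOn"}));

// ... run the scenario; only other mongos processes can balance ...

// Release the failpoint when done.
assert.commandWorked(mongos.adminCommand(
    {configureFailPoint : "neverBalance", mode : "off"}));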
diff --git a/jstests/sharding/fair_balancer_round.js b/jstests/sharding/fair_balancer_round.js
new file mode 100644
index 00000000000..0f1631b80e7
--- /dev/null
+++ b/jstests/sharding/fair_balancer_round.js
@@ -0,0 +1,39 @@
+//
+// Tests that a balancer round loads newly sharded collection data
+//
+
+var options = {separateConfig : true, mongosOptions : {verbose : 1}};
+
+var st = new ShardingTest({shards : 2, mongos : 2, other : options});
+
+// Stop balancer initially
+st.stopBalancer();
+
+var mongos = st.s0;
+var staleMongos = st.s1;
+var coll = mongos.getCollection("foo.bar");
+
+// Stop first mongos from balancing
+assert(mongos.adminCommand({configureFailPoint : "neverBalance", mode : "alwaysOn"}).ok);
+
+// Shard collection through first mongos
+assert(mongos.adminCommand({enableSharding : coll.getDB() + ""}).ok);
+assert(mongos.adminCommand({shardCollection : coll + "", key : {_id : 1}}).ok);
+
+// Create a bunch of chunks
+var numSplits = 20;
+for ( var i = 0; i < numSplits; i++) {
+ assert(mongos.adminCommand({split : coll + "", middle : {_id : i}}).ok);
+}
+
+// Start balancer, which lets the stale mongos balance
+st.startBalancer();
+
+// Make sure we eventually start moving chunks
+assert.soon(function() {
+ return mongos.getCollection("config.changelog").count({what : /moveChunk/}) > 0;
+}, "no balance happened", 5 * 60 * 1000);
+
+jsTest.log("DONE!");
+
+st.stop();
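
The shape of this test is the point: the collection is sharded and split through s0 only, so s1 (staleMongos) has never loaded that collection's metadata. With the neverBalance failpoint pinning s0, any moveChunk entries that appear in config.changelog can only have come from the stale mongos, which demonstrates that a balancer round now picks up sharded collections it has not seen before instead of skipping them.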
diff --git a/src/mongo/s/balance.cpp b/src/mongo/s/balance.cpp
index 13f764f608d..de72b25dada 100644
--- a/src/mongo/s/balance.cpp
+++ b/src/mongo/s/balance.cpp
@@ -52,58 +52,70 @@ namespace mongo {
for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
const CandidateChunk& chunkInfo = *it->get();
- DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
- verify( cfg );
+ // Changes to metadata, borked metadata, and connectivity problems should cause us to
+ // abort this chunk move, but shouldn't cause us to abort the entire round of chunks.
+ // TODO: Handle all these things more cleanly, since they're expected problems
+ try {
- ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
- verify( cm );
+ DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );
+ verify( cfg );
- ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
- if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
- // likely a split happened somewhere
- cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
+ // NOTE: We purposely do not reload metadata here, since _doBalanceRound already
+ // tried to do so once.
+ ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
verify( cm );
- c = cm->findIntersectingChunk( chunkInfo.chunk.min );
+ ChunkPtr c = cm->findIntersectingChunk( chunkInfo.chunk.min );
if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
- log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
+ // likely a split happened somewhere
+ cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
+ verify( cm );
+
+ c = cm->findIntersectingChunk( chunkInfo.chunk.min );
+ if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
+ log() << "chunk mismatch after reload, ignoring will retry issue " << chunkInfo.chunk.toString() << endl;
+ continue;
+ }
+ }
+
+ BSONObj res;
+ if (c->moveAndCommit(Shard::make(chunkInfo.to),
+ Chunk::MaxChunkSize,
+ secondaryThrottle,
+ waitForDelete,
+ res)) {
+ movedCount++;
continue;
}
- }
- BSONObj res;
- if (c->moveAndCommit(Shard::make(chunkInfo.to),
- Chunk::MaxChunkSize,
- secondaryThrottle,
- waitForDelete,
- res)) {
- movedCount++;
- continue;
- }
+ // the move requires acquiring the collection metadata's lock, which can fail
+ log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
+ << " chunk: " << chunkInfo.chunk << endl;
- // the move requires acquiring the collection metadata's lock, which can fail
- log() << "balancer move failed: " << res << " from: " << chunkInfo.from << " to: " << chunkInfo.to
- << " chunk: " << chunkInfo.chunk << endl;
+ if ( res["chunkTooBig"].trueValue() ) {
+ // reload just to be safe
+ cm = cfg->getChunkManager( chunkInfo.ns );
+ verify( cm );
+ c = cm->findIntersectingChunk( chunkInfo.chunk.min );
- if ( res["chunkTooBig"].trueValue() ) {
- // reload just to be safe
- cm = cfg->getChunkManager( chunkInfo.ns );
- verify( cm );
- c = cm->findIntersectingChunk( chunkInfo.chunk.min );
-
- log() << "forcing a split because migrate failed for size reasons" << endl;
-
- res = BSONObj();
- c->singleSplit( true , res );
- log() << "forced split results: " << res << endl;
-
- if ( ! res["ok"].trueValue() ) {
- log() << "marking chunk as jumbo: " << c->toString() << endl;
- c->markAsJumbo();
- // we increment moveCount so we do another round right away
- movedCount++;
- }
+ log() << "forcing a split because migrate failed for size reasons" << endl;
+
+ res = BSONObj();
+ c->singleSplit( true , res );
+ log() << "forced split results: " << res << endl;
+ if ( ! res["ok"].trueValue() ) {
+ log() << "marking chunk as jumbo: " << c->toString() << endl;
+ c->markAsJumbo();
+ // we increment moveCount so we do another round right away
+ movedCount++;
+ }
+
+ }
+ }
+            catch( const DBException& ex ) {
+ warning() << "could not move chunk " << chunkInfo.chunk.toString()
+ << ", continuing balancing round" << causedBy( ex ) << endl;
}
}
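
The net effect of this hunk is narrower than the churn suggests: the whole per-chunk body is wrapped in a try/catch, so a DBException raised while loading metadata for, or migrating, one chunk logs a warning and falls through to the next candidate instead of aborting the entire balancing round; the retry-after-split and jumbo-marking logic is unchanged apart from its indentation.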
@@ -295,9 +307,18 @@ namespace mongo {
cursor.reset();
DBConfigPtr cfg = grid.getDBConfig( ns );
- verify( cfg );
- ChunkManagerPtr cm = cfg->getChunkManager( ns );
- verify( cm );
+ if ( !cfg ) {
+ warning() << "could not load db config to balance " << ns << " collection" << endl;
+ continue;
+ }
+
+ // This line reloads the chunk manager once if this process doesn't know the collection
+ // is sharded yet.
+ ChunkManagerPtr cm = cfg->getChunkManagerIfExists( ns, true );
+ if ( !cm ) {
+ warning() << "could not load chunks to balance " << ns << " collection" << endl;
+ continue;
+ }
// loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
bool didAnySplits = false;
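
A related knob, not used by this patch: a mongos can be told by hand to drop its cached routing metadata, which forces the same kind of reload that the getChunkManagerIfExists( ns, true ) call above performs once per round. A shell sketch, assuming staleMongos is a connection to the out-of-date router:

// flushRouterConfig clears the cached routing table; the next operation
// against a sharded collection reloads it from the config servers.
assert.commandWorked(staleMongos.adminCommand({flushRouterConfig : 1}));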
diff --git a/src/mongo/s/grid.cpp b/src/mongo/s/grid.cpp
index ef6046c558b..4e48113c05f 100644
--- a/src/mongo/s/grid.cpp
+++ b/src/mongo/s/grid.cpp
@@ -30,11 +30,14 @@
#include "mongo/s/type_database.h"
#include "mongo/s/type_settings.h"
#include "mongo/s/type_shard.h"
+#include "mongo/util/fail_point_service.h"
#include "mongo/util/startup_test.h"
#include "mongo/util/stringutils.h"
namespace mongo {
+ MONGO_FP_DECLARE(neverBalance);
+
DBConfigPtr Grid::getDBConfig( string database , bool create , const string& shardNameHint ) {
{
string::size_type i = database.find( "." );
@@ -471,6 +474,9 @@ namespace mongo {
*/
bool Grid::shouldBalance( const string& ns, BSONObj* balancerDocOut ) const {
+ // Allow disabling the balancer for testing
+ if ( MONGO_FAIL_POINT(neverBalance) ) return false;
+
scoped_ptr<ScopedDbConnection> conn( ScopedDbConnection::getInternalScopedDbConnection(
configServer.getPrimary().getConnString(), 30));
BSONObj balancerDoc;
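
For context on what shouldBalance() reads when the failpoint is not set: the balancer's on/off state is kept in a settings document on the config servers, visible from any mongos. A sketch, assuming `mongos` is any router connection:

// The balancer settings document; { ..., stopped : true } means disabled.
var balancerDoc = mongos.getDB("config").settings.findOne({_id : "balancer"});
printjson(balancerDoc);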