summaryrefslogtreecommitdiff
path: root/db/repl
diff options
context:
space:
mode:
authorKristina <kristina@10gen.com>2010-12-22 18:42:08 -0500
committerKristina <kristina@10gen.com>2010-12-22 18:43:08 -0500
commit52a46867c7b862edf871df4a27f49858ff535802 (patch)
tree860b5e9de4753fd75eb91c626f9a66795da56aab /db/repl
parent0c14ad831cb7578eb0402015d845f4e77c374c1b (diff)
downloadmongo-52a46867c7b862edf871df4a27f49858ff535802.tar.gz
sync against secondaries if primary is unavailable SERVER-2158
Diffstat (limited to 'db/repl')
-rw-r--r--db/repl/rs.h2
-rw-r--r--db/repl/rs_sync.cpp129
2 files changed, 91 insertions, 40 deletions
diff --git a/db/repl/rs.h b/db/repl/rs.h
index f0250d484f4..b41807e3920 100644
--- a/db/repl/rs.h
+++ b/db/repl/rs.h
@@ -348,6 +348,8 @@ namespace mongo {
unsigned _syncRollback(OplogReader& r);
void syncRollback(OplogReader& r);
void syncFixUp(HowToFixUp& h, OplogReader& r);
+ bool _getOplogReader(OplogReader& r, string& hn);
+ bool _isStale(OplogReader& r, const string& hn);
public:
void syncThread();
};
diff --git a/db/repl/rs_sync.cpp b/db/repl/rs_sync.cpp
index 0d8358b0385..935dea2dc51 100644
--- a/db/repl/rs_sync.cpp
+++ b/db/repl/rs_sync.cpp
@@ -176,38 +176,87 @@ namespace mongo {
return golive;
}
+ /**
+ * Checks whether our last written op is older than the entry fetched from hn's oplog (treated as its oldest op).
+ * When it is, logs error RS102, sets the heartbeat message, changes state to RS_RECOVERING and calls sleepsecs(120).
+ *
+ * @param r the oplog reader used to query the oplog on hn
+ * @param hn the hostname (for log messages)
+ *
+ * @return true if we are stale compared to the oplog on hn, false otherwise
+ */
+ bool ReplSetImpl::_isStale(OplogReader& r, const string& hn) {
+ BSONObj remoteOldestOp = r.findOne(rsoplog, Query()); // single entry fetched with an empty query
+ OpTime ts = remoteOldestOp["ts"]._opTime(); // timestamp of that remote entry
+ DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
+ else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
+ DEV {
+ // debugging sync1.js...
+ log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet our state: " << state().toString() << rsLog;
+ }
+ if( lastOpTimeWritten < ts ) { // our last op predates everything hn still has: cannot resume from hn
+ log() << "replSet error RS102 too stale to catch up, at least from " << hn << rsLog;
+ log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
+ log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
+ log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
+ sethbmsg("error RS102 too stale to catch up");
+ changeState(MemberState::RS_RECOVERING);
+ sleepsecs(120); // NOTE(review): runs on every stale result, so each stale candidate costs the caller this wait
+ return true;
+ }
+ return false; // hn's oplog still reaches back to our last written op
+ }
+
+ /**
+ * Tries to connect the oplog reader to a potential sync source. If
+ * successful, it checks that we are not stale compared to this source.
+ * Note: the staleness check (_isStale) may change our state and sleep before this returns false.
+ * @param r reader to connect to hn
+ * @param hn hostname to try
+ *
+ * @return true if the connection succeeds and we are not stale relative to hn, otherwise false.
+ */
+ bool ReplSetImpl::_getOplogReader(OplogReader& r, string& hn) {
+ if( !r.connect(hn) ) {
+ log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
+ return false; // could not connect to hn
+ }
+ if( _isStale(r, hn)) {
+ return false; // connected, but too stale to sync from hn
+ }
+ return true; // r is connected to hn and hn is usable as a sync source
+ }
+
/* tail the primary's oplog. ok to return, will be re-called. */
void ReplSetImpl::syncTail() {
// todo : locking vis a vis the mgr...
-
- const Member *primary = box.getPrimary();
- if( primary == 0 ) return;
- string hn = primary->h().toString();
OplogReader r;
- if( !r.connect(primary->h().toString()) ) {
- log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
- return;
- }
-
- /* first make sure we are not hopelessly out of sync by being very stale. */
- {
- BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
- OpTime ts = remoteOldestOp["ts"]._opTime();
- DEV log() << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
- else log(3) << "replSet remoteOldestOp: " << ts.toStringLong() << rsLog;
- DEV {
- // debugging sync1.js...
- log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog;
- log() << "replSet our state: " << state().toString() << rsLog;
+ string hn;
+
+ const Member *target = box.getPrimary();
+ // if we cannot reach the master but someone else is more up-to-date
+ // than we are, sync from them.
+ if( target == 0 ) {
+ Member *max = 0;
+ for(Member *m = head(); m; m=m->next()) {
+ hn = m->h().toString();
+ if (m->hbinfo().up() &&
+ (!max || m->hbinfo().opTime > max->hbinfo().opTime) &&
+ _getOplogReader(r, hn)) {
+ max = m;
+ break;
+ }
+ }
+
+ if (max && max->hbinfo().opTime > lastOpTimeWritten) {
+ target = max;
+ }
+ else {
+ return;
}
- if( lastOpTimeWritten < ts ) {
- log() << "replSet error RS102 too stale to catch up, at least from primary: " << hn << rsLog;
- log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog;
- log() << "replSet oldest at " << hn << " : " << ts.toStringLong() << rsLog;
- log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
- sethbmsg("error RS102 too stale to catch up");
- changeState(MemberState::RS_RECOVERING);
- sleepsecs(120);
+ } else {
+ hn = target->h().toString();
+ if (!_getOplogReader(r, hn)) {
return;
}
}
@@ -257,7 +306,7 @@ namespace mongo {
long long h = o["h"].numberLong();
if( ts != lastOpTimeWritten || h != lastH ) {
log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << endl;
- log() << "replset primary's GTE: " << ts.toStringPretty() << endl;
+ log() << "replset source's GTE: " << ts.toStringPretty() << endl;
syncRollback(r);
return;
}
@@ -289,9 +338,11 @@ namespace mongo {
/* todo: too stale capability */
}
-
- if( box.getPrimary() != primary )
+
+ if( target->hbinfo().hbstate != MemberState::RS_PRIMARY &&
+ target->hbinfo().hbstate != MemberState::RS_SECONDARY ) {
return;
+ }
}
if( !r.more() )
break;
@@ -320,8 +371,10 @@ namespace mongo {
sleepsecs(6);
if( time(0) >= waitUntil )
break;
- if( box.getPrimary() != primary )
+ if( target->hbinfo().hbstate != MemberState::RS_PRIMARY &&
+ target->hbinfo().hbstate != MemberState::RS_SECONDARY ) {
break;
+ }
if( myConfig().slaveDelay != sd ) // reconf
break;
}
@@ -336,9 +389,8 @@ namespace mongo {
/* if we have become primary, we dont' want to apply things from elsewhere
anymore. assumePrimary is in the db lock so we are safe as long as
we check after we locked above. */
- if( box.getPrimary() != primary ) {
- if( box.getState().primary() )
- log(0) << "replSet stopping syncTail we are now primary" << rsLog;
+ if( box.getState().primary() ) {
+ log(0) << "replSet stopping syncTail we are now primary" << rsLog;
return;
}
@@ -353,8 +405,10 @@ namespace mongo {
// TODO : reuse our connection to the primary.
return;
}
- if( box.getPrimary() != primary )
+ if( target->hbinfo().hbstate != MemberState::RS_PRIMARY &&
+ target->hbinfo().hbstate != MemberState::RS_SECONDARY ) {
return;
+ }
// looping back is ok because this is a tailable cursor
}
}
@@ -370,11 +424,6 @@ namespace mongo {
return;
}
- /* later, we can sync from up secondaries if we want. tbd. */
- if( sp.primary == 0 ) {
- return;
- }
-
/* do we have anything at all? */
if( lastOpTimeWritten.isNull() ) {
syncDoInitialSync();