diff options
author | Wenbin Zhu <wenbin.zhu@mongodb.com> | 2021-09-02 02:08:11 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-10-01 18:38:54 +0000 |
commit | eea1bcb22a3d897cccd8f923350df380db203732 (patch) | |
tree | f197017ad64a71214e5f9da52d7bb5a9e47f2038 | |
parent | 2fd7e5d8f09ffdc22ebbd229cf40b1c6dda1c8fc (diff) | |
download | mongo-eea1bcb22a3d897cccd8f923350df380db203732.tar.gz |
SERVER-58988 Avoid sync source selection cycle during primary catchup.
(cherry picked from commit b46acdbba8ec51810b6f402dbe18ed7ea98fd13d)
-rw-r--r-- | src/mongo/db/repl/initial_syncer_test.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog_fetcher_test.cpp | 35 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.cpp | 27 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_v1_test.cpp | 95 | ||||
-rw-r--r-- | src/mongo/rpc/metadata/oplog_query_metadata.cpp | 20 | ||||
-rw-r--r-- | src/mongo/rpc/metadata/oplog_query_metadata.h | 12 | ||||
-rw-r--r-- | src/mongo/rpc/metadata/oplog_query_metadata_test.cpp | 16 |
7 files changed, 182 insertions, 26 deletions
diff --git a/src/mongo/db/repl/initial_syncer_test.cpp b/src/mongo/db/repl/initial_syncer_test.cpp index 9717227343a..32b6e4189da 100644 --- a/src/mongo/db/repl/initial_syncer_test.cpp +++ b/src/mongo/db/repl/initial_syncer_test.cpp @@ -578,7 +578,8 @@ RemoteCommandResponse makeCursorResponse(CursorId cursorId, int rbid = 1) { OpTime futureOpTime(Timestamp(1000, 1000), 1000); Date_t futureWallTime = Date_t() + Seconds(futureOpTime.getSecs()); - rpc::OplogQueryMetadata oqMetadata({futureOpTime, futureWallTime}, futureOpTime, rbid, 0, 0); + rpc::OplogQueryMetadata oqMetadata( + {futureOpTime, futureWallTime}, futureOpTime, rbid, 0, 0, ""); BSONObjBuilder bob; { diff --git a/src/mongo/db/repl/oplog_fetcher_test.cpp b/src/mongo/db/repl/oplog_fetcher_test.cpp index 7c4a47fbcb8..121a4234566 100644 --- a/src/mongo/db/repl/oplog_fetcher_test.cpp +++ b/src/mongo/db/repl/oplog_fetcher_test.cpp @@ -276,6 +276,7 @@ protected: static const int rbid = 2; static const int primaryIndex = 2; static const int syncSourceIndex = 2; + static const std::string syncSourceHost; static const rpc::OplogQueryMetadata oqMetadata; static const rpc::OplogQueryMetadata staleOqMetadata; static const rpc::ReplSetMetadata replSetMetadata; @@ -325,13 +326,19 @@ protected: const int OplogFetcherTest::rbid; const OpTime OplogFetcherTest::remoteNewerOpTime = OpTime(Timestamp(1000, 1), 2); -const rpc::OplogQueryMetadata OplogFetcherTest::oqMetadata = rpc::OplogQueryMetadata( - {staleOpTime, staleWallTime}, remoteNewerOpTime, rbid, primaryIndex, syncSourceIndex); +const std::string OplogFetcherTest::syncSourceHost = ""; +const rpc::OplogQueryMetadata OplogFetcherTest::oqMetadata = + rpc::OplogQueryMetadata({staleOpTime, staleWallTime}, + remoteNewerOpTime, + rbid, + primaryIndex, + syncSourceIndex, + syncSourceHost); const OpTime OplogFetcherTest::staleOpTime = OpTime(Timestamp(1, 1), 0); const Date_t OplogFetcherTest::staleWallTime = Date_t() + Seconds(staleOpTime.getSecs()); const rpc::OplogQueryMetadata OplogFetcherTest::staleOqMetadata = rpc::OplogQueryMetadata( - {staleOpTime, staleWallTime}, staleOpTime, rbid, primaryIndex, syncSourceIndex); + {staleOpTime, staleWallTime}, staleOpTime, rbid, primaryIndex, syncSourceIndex, syncSourceHost); const rpc::ReplSetMetadata OplogFetcherTest::replSetMetadata = rpc::ReplSetMetadata( 1, OpTimeAndWallTime(), OpTime(), 1, 0, OID(), primaryIndex, syncSourceIndex, false); @@ -847,9 +854,12 @@ TEST_F(OplogFetcherTest, ValidMetadataWithInResponseShouldBeForwardedToProcessMe TEST_F(OplogFetcherTest, MetadataAndBatchAreNotProcessedWhenSyncSourceRollsBack) { CursorId cursorId = 22LL; auto entry = makeNoopOplogEntry(lastFetched); - - rpc::OplogQueryMetadata oplogQueryMetadata( - {staleOpTime, staleWallTime}, remoteNewerOpTime, rbid + 1, primaryIndex, syncSourceIndex); + rpc::OplogQueryMetadata oplogQueryMetadata({staleOpTime, staleWallTime}, + remoteNewerOpTime, + rbid + 1, + primaryIndex, + syncSourceIndex, + syncSourceHost); auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oplogQueryMetadata); ASSERT_EQUALS(ErrorCodes::InvalidSyncSource, @@ -874,9 +884,12 @@ TEST_F(OplogFetcherTest, MetadataAndBatchAreNotProcessedWhenSyncSourceIsBehind) TEST_F(OplogFetcherTest, MetadataAndBatchAreNotProcessedWhenSyncSourceIsNotAhead) { CursorId cursorId = 22LL; auto entry = makeNoopOplogEntry(lastFetched); - - rpc::OplogQueryMetadata oplogQueryMetadata( - {staleOpTime, staleWallTime}, lastFetched, rbid, primaryIndex, syncSourceIndex); + rpc::OplogQueryMetadata oplogQueryMetadata({staleOpTime, staleWallTime}, + lastFetched, + rbid, + primaryIndex, + syncSourceIndex, + syncSourceHost); auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oplogQueryMetadata); ASSERT_EQUALS(ErrorCodes::InvalidSyncSource, @@ -922,7 +935,7 @@ TEST_F(OplogFetcherTest, MetadataAndBatchAreProcessedWhenSyncSourceIsNotAheadWithoutRequiringFresherSyncSource) { CursorId cursorId = 0LL; rpc::OplogQueryMetadata oplogQueryMetadata( - {staleOpTime, staleWallTime}, lastFetched, rbid, 2, 2); + {staleOpTime, staleWallTime}, lastFetched, rbid, 2, 2, syncSourceHost); auto metadataObj = makeOplogBatchMetadata(replSetMetadata, oplogQueryMetadata); auto entry = makeNoopOplogEntry(lastFetched); @@ -1687,7 +1700,7 @@ TEST_F(OplogFetcherTest, FailedSyncSourceCheckWithBothMetadatasStopsTheOplogFetc TEST_F(OplogFetcherTest, FailedSyncSourceCheckWithSyncSourceHavingNoSyncSourceStopsTheOplogFetcher) { rpc::OplogQueryMetadata oplogQueryMetadata( - {staleOpTime, staleWallTime}, remoteNewerOpTime, rbid, primaryIndex, -1); + {staleOpTime, staleWallTime}, remoteNewerOpTime, rbid, primaryIndex, -1, syncSourceHost); testSyncSourceChecking(replSetMetadata, oplogQueryMetadata); // Sync source "hasSyncSource" is derived from metadata. diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index 6513448deb9..1030898b642 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -2969,6 +2969,7 @@ bool TopologyCoordinator::shouldChangeSyncSource(const HostAndPort& currentSourc fassert(4612000, !currentSourceOpTime.isNull()); int syncSourceIndex = oqMetadata.getSyncSourceIndex(); + std::string syncSourceHost = oqMetadata.getSyncSourceHost(); // A 4.2 sync source's primaryIndex is unreliable, because we don't know what config version the // index is valid for. Prefer the new 4.4 field isPrimary. // TODO(SERVER-47125): Require isPrimary and stop using primaryIndex. @@ -3012,6 +3013,29 @@ bool TopologyCoordinator::shouldChangeSyncSource(const HostAndPort& currentSourc return true; } + // Change sync source if our sync source is also syncing from us when we are in primary + // catchup mode, forming a sync source selection cycle, and the sync source is not ahead + // of us. This is to prevent a deadlock situation. See SERVER-58988 for details. + // When checking the sync source, we use syncSourceHost if it is set, otherwise fall back + // to use syncSourceIndex. The difference is that syncSourceIndex might not point to the + // node that we think of because it was inferred from the sender node, which could have + // a different config. This is acceptable since we are just choosing a different sync + // source if that happens and reconfigs are rare. + bool isSyncingFromMe = !syncSourceHost.empty() + ? syncSourceHost == _selfMemberData().getHostAndPort().toString() + : syncSourceIndex == _selfIndex; + + if (isSyncingFromMe && _currentPrimaryIndex == _selfIndex && + currentSourceOpTime <= myLastOpTime) { + LOGV2(5898800, + "Choosing new sync source because we are in primary catchup but our current sync " + "source is also syncing from us but is not ahead of us", + "syncSource"_attr = currentSource, + "lastFetchedOpTime"_attr = myLastOpTime, + "syncSourceLatestOplogOpTime"_attr = currentSourceOpTime); + return true; + } + if (MONGO_unlikely(disableMaxSyncSourceLagSecs.shouldFail())) { LOGV2( 21833, @@ -3075,7 +3099,8 @@ rpc::OplogQueryMetadata TopologyCoordinator::prepareOplogQueryMetadata(int rbid) getMyLastAppliedOpTime(), rbid, _currentPrimaryIndex, - _rsConfig.findMemberIndexByHostAndPort(getSyncSourceAddress())); + _rsConfig.findMemberIndexByHostAndPort(getSyncSourceAddress()), + getSyncSourceAddress().toString()); } void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesArgs& args, diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index 476adf62f90..9736e8b9aac 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -237,9 +237,14 @@ protected: OplogQueryMetadata makeOplogQueryMetadata(OpTime lastAppliedOpTime = OpTime(), int primaryIndex = -1, int syncSourceIndex = -1, + std::string syncSourceHost = "", Date_t lastCommittedWall = Date_t()) { - return OplogQueryMetadata( - {OpTime(), lastCommittedWall}, lastAppliedOpTime, -1, primaryIndex, syncSourceIndex); + return OplogQueryMetadata({OpTime(), lastCommittedWall}, + lastAppliedOpTime, + -1, + primaryIndex, + syncSourceIndex, + syncSourceHost); } HeartbeatResponseAction receiveUpHeartbeat(const HostAndPort& member, @@ -4070,6 +4075,92 @@ TEST_F(HeartbeatResponseTestV1, now())); } +TEST_F(HeartbeatResponseTestV1, ShouldChangeSyncSourceWhenSyncSourceFormsCycleAndWeArePrimary) { + // In this test, the TopologyCoordinator will tell us change our sync source away from "host2" + // when it is not ahead of us and it selects us to be its sync source, forming a sync source + // cycle and we are currently in primary catchup. + setSelfMemberState(MemberState::RS_PRIMARY); + OpTime election = OpTime(); + OpTime syncSourceOpTime = OpTime(Timestamp(400, 0), 0); + + // Set lastOpTimeFetched to be same as the sync source's OpTime. + OpTime lastOpTimeFetched = OpTime(Timestamp(400, 0), 0); + setMyOpTime(lastOpTimeFetched); + + // Show we like host2 while it is not syncing from us. + HeartbeatResponseAction nextAction = receiveUpHeartbeat( + HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, election, syncSourceOpTime); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource( + HostAndPort("host2"), + makeReplSetMetadata(OpTime() /* visibleOpTime */, false /* isPrimary */), + makeOplogQueryMetadata(syncSourceOpTime, + -1 /* primaryIndex */, + 2 /* syncSourceIndex */, + "host3:27017" /* syncSourceHost */), + now())); + + // Show that we also like host2 while we are not primary. + nextAction = receiveUpHeartbeat( + HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, election, syncSourceOpTime); + ASSERT_NO_ACTION(nextAction.getAction()); + getTopoCoord().setPrimaryIndex(2); + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource( + HostAndPort("host2"), + makeReplSetMetadata(OpTime() /* visibleOpTime */, false /* isPrimary */), + // Sync source is also syncing from us. + makeOplogQueryMetadata(syncSourceOpTime, + -1 /* primaryIndex */, + 0 /* syncSourceIndex */, + "host1:27017" /* syncSourceHost */), + now())); + + // Show that we also like host2 while it has some progress beyond our own. + getTopoCoord().setPrimaryIndex(0); + OpTime olderThanSyncSourceOpTime = OpTime(Timestamp(300, 0), 0); + topoCoordSetMyLastAppliedOpTime(olderThanSyncSourceOpTime, now(), true); + nextAction = receiveUpHeartbeat( + HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, election, syncSourceOpTime); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource( + HostAndPort("host2"), + makeReplSetMetadata(OpTime() /* visibleOpTime */, false /* isPrimary */), + // Sync source is also syncing from us. + makeOplogQueryMetadata(syncSourceOpTime, + -1 /* primaryIndex */, + 0 /* syncSourceIndex */, + "host1:27017" /* syncSourceHost */), + now())); + + // Show that we do not like host2 it forms a sync source selection cycle with us and we + // are primary and it lacks progress beyond our own. + setMyOpTime(lastOpTimeFetched); + nextAction = receiveUpHeartbeat( + HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, election, syncSourceOpTime); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( + HostAndPort("host2"), + makeReplSetMetadata(OpTime() /* visibleOpTime */, false /* isPrimary */), + // Sync source is also syncing from us. + makeOplogQueryMetadata(syncSourceOpTime, + -1 /* primaryIndex */, + 0 /* syncSourceIndex */, + "host1:27017" /* syncSourceHost */), + now())); + + // Show that we still do not like it when syncSourceHost is not set, but we can rely on + // syncSourceIndex to decide if a sync source selection cycle has been formed. + nextAction = receiveUpHeartbeat( + HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, election, syncSourceOpTime); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource( + HostAndPort("host2"), + makeReplSetMetadata(OpTime() /* visibleOpTime */, false /* isPrimary */), + // Sync source is also syncing from us. + makeOplogQueryMetadata(syncSourceOpTime, -1 /* primaryIndex */, 0 /* syncSourceIndex */), + now())); +} + TEST_F(HeartbeatResponseTestV1, ShouldNotChangeSyncSourceWhenFresherMemberIsDown) { // In this test, the TopologyCoordinator should not tell us to change sync sources away from // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind diff --git a/src/mongo/rpc/metadata/oplog_query_metadata.cpp b/src/mongo/rpc/metadata/oplog_query_metadata.cpp index 65d600dab00..f810662ded7 100644 --- a/src/mongo/rpc/metadata/oplog_query_metadata.cpp +++ b/src/mongo/rpc/metadata/oplog_query_metadata.cpp @@ -50,6 +50,7 @@ const char kLastCommittedWallFieldName[] = "lastCommittedWall"; const char kLastOpAppliedFieldName[] = "lastOpApplied"; const char kPrimaryIndexFieldName[] = "primaryIndex"; const char kSyncSourceIndexFieldName[] = "syncSourceIndex"; +const char kSyncSourceHostFieldName[] = "syncSourceHost"; const char kRBIDFieldName[] = "rbid"; } // unnamed namespace @@ -60,12 +61,14 @@ OplogQueryMetadata::OplogQueryMetadata(OpTimeAndWallTime lastOpCommitted, OpTime lastOpApplied, int rbid, int currentPrimaryIndex, - int currentSyncSourceIndex) + int currentSyncSourceIndex, + std::string currentSyncSourceHost) : _lastOpCommitted(std::move(lastOpCommitted)), _lastOpApplied(std::move(lastOpApplied)), _rbid(rbid), _currentPrimaryIndex(currentPrimaryIndex), - _currentSyncSourceIndex(currentSyncSourceIndex) {} + _currentSyncSourceIndex(currentSyncSourceIndex), + _currentSyncSourceHost(currentSyncSourceHost) {} StatusWith<OplogQueryMetadata> OplogQueryMetadata::readFromMetadata(const BSONObj& metadataObj) { BSONElement oqMetadataElement; @@ -86,6 +89,14 @@ StatusWith<OplogQueryMetadata> OplogQueryMetadata::readFromMetadata(const BSONOb if (!status.isOK()) return status; + std::string syncSourceHost; + status = bsonExtractStringField(oqMetadataObj, kSyncSourceHostFieldName, &syncSourceHost); + // SyncSourceHost might not be set in older versions, checking NoSuchKey error + // for backward compatibility. + // TODO SERVER-59732: Remove the compatibility check once 6.0 is released. + if (!status.isOK() && status.code() != ErrorCodes::NoSuchKey) + return status; + long long rbid; status = bsonExtractIntegerField(oqMetadataObj, kRBIDFieldName, &rbid); if (!status.isOK()) @@ -110,7 +121,8 @@ StatusWith<OplogQueryMetadata> OplogQueryMetadata::readFromMetadata(const BSONOb if (!status.isOK()) return status; - return OplogQueryMetadata(lastOpCommitted, lastOpApplied, rbid, primaryIndex, syncSourceIndex); + return OplogQueryMetadata( + lastOpCommitted, lastOpApplied, rbid, primaryIndex, syncSourceIndex, syncSourceHost); } Status OplogQueryMetadata::writeToMetadata(BSONObjBuilder* builder) const { @@ -121,6 +133,7 @@ Status OplogQueryMetadata::writeToMetadata(BSONObjBuilder* builder) const { oqMetadataBuilder.append(kRBIDFieldName, _rbid); oqMetadataBuilder.append(kPrimaryIndexFieldName, _currentPrimaryIndex); oqMetadataBuilder.append(kSyncSourceIndexFieldName, _currentSyncSourceIndex); + oqMetadataBuilder.append(kSyncSourceHostFieldName, _currentSyncSourceHost); oqMetadataBuilder.doneFast(); return Status::OK(); @@ -131,6 +144,7 @@ std::string OplogQueryMetadata::toString() const { output << "OplogQueryMetadata"; output << " Primary Index: " << _currentPrimaryIndex; output << " Sync Source Index: " << _currentSyncSourceIndex; + output << " Sync Source Host: " << _currentSyncSourceHost; output << " RBID: " << _rbid; output << " Last Op Committed: " << _lastOpCommitted.toString(); output << " Last Op Applied: " << _lastOpApplied.toString(); diff --git a/src/mongo/rpc/metadata/oplog_query_metadata.h b/src/mongo/rpc/metadata/oplog_query_metadata.h index 92ce4e24827..cc0ced48e41 100644 --- a/src/mongo/rpc/metadata/oplog_query_metadata.h +++ b/src/mongo/rpc/metadata/oplog_query_metadata.h @@ -57,7 +57,8 @@ public: repl::OpTime lastOpApplied, int rbid, int currentPrimaryIndex, - int currentSyncSourceIndex); + int currentSyncSourceIndex, + std::string currentSyncSourceHost); /** * format: @@ -104,6 +105,14 @@ public: } /** + * Returns the host of the sync source of the sender. + * Returns empty string if it has no sync source. + */ + std::string getSyncSourceHost() const { + return _currentSyncSourceHost; + } + + /** * Returns the current rbid of the sender. */ int getRBID() const { @@ -121,6 +130,7 @@ private: int _rbid = -1; int _currentPrimaryIndex = kNoPrimary; int _currentSyncSourceIndex = -1; + std::string _currentSyncSourceHost; }; } // namespace rpc diff --git a/src/mongo/rpc/metadata/oplog_query_metadata_test.cpp b/src/mongo/rpc/metadata/oplog_query_metadata_test.cpp index bb267ccac59..c0cc8969349 100644 --- a/src/mongo/rpc/metadata/oplog_query_metadata_test.cpp +++ b/src/mongo/rpc/metadata/oplog_query_metadata_test.cpp @@ -43,7 +43,7 @@ TEST(ReplResponseMetadataTest, OplogQueryMetadataRoundtrip) { OpTime opTime1(Timestamp(1234, 100), 5); Date_t committedWall = Date_t() + Seconds(opTime1.getSecs()); OpTime opTime2(Timestamp(7777, 101), 6); - OplogQueryMetadata metadata({opTime1, committedWall}, opTime2, 6, 12, -1); + OplogQueryMetadata metadata({opTime1, committedWall}, opTime2, 6, 12, -1, ""); ASSERT_EQ(opTime1, metadata.getLastOpCommitted().opTime); ASSERT_EQ(committedWall, metadata.getLastOpCommitted().wallTime); @@ -52,12 +52,14 @@ TEST(ReplResponseMetadataTest, OplogQueryMetadataRoundtrip) { BSONObjBuilder builder; metadata.writeToMetadata(&builder).transitional_ignore(); - BSONObj expectedObj(BSON( - kOplogQueryMetadataFieldName << BSON( - "lastOpCommitted" << BSON("ts" << opTime1.getTimestamp() << "t" << opTime1.getTerm()) - << "lastCommittedWall" << committedWall << "lastOpApplied" - << BSON("ts" << opTime2.getTimestamp() << "t" << opTime2.getTerm()) - << "rbid" << 6 << "primaryIndex" << 12 << "syncSourceIndex" << -1))); + BSONObj expectedObj( + BSON(kOplogQueryMetadataFieldName + << BSON("lastOpCommitted" + << BSON("ts" << opTime1.getTimestamp() << "t" << opTime1.getTerm()) + << "lastCommittedWall" << committedWall << "lastOpApplied" + << BSON("ts" << opTime2.getTimestamp() << "t" << opTime2.getTerm()) << "rbid" + << 6 << "primaryIndex" << 12 << "syncSourceIndex" << -1 << "syncSourceHost" + << ""))); BSONObj serializedObj = builder.obj(); ASSERT_BSONOBJ_EQ(expectedObj, serializedObj); |