diff options
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/repl/collection_cloner.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/repl/data_replicator.cpp | 24 | ||||
-rw-r--r-- | src/mongo/db/repl/data_replicator.h | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/data_replicator_test.cpp | 36 | ||||
-rw-r--r-- | src/mongo/db/repl/database_cloner.cpp | 7 | ||||
-rw-r--r-- | src/mongo/db/repl/databases_cloner.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/rs_initialsync.cpp | 2 |
8 files changed, 53 insertions, 48 deletions
diff --git a/src/mongo/db/repl/collection_cloner.cpp b/src/mongo/db/repl/collection_cloner.cpp index 83eed5ca3a7..36ff30a37a0 100644 --- a/src/mongo/db/repl/collection_cloner.cpp +++ b/src/mongo/db/repl/collection_cloner.cpp @@ -37,6 +37,7 @@ #include "mongo/db/namespace_string.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/storage_interface_mock.h" +#include "mongo/db/server_parameters.h" #include "mongo/util/assert_util.h" #include "mongo/util/destructor_guard.h" #include "mongo/util/log.h" @@ -52,10 +53,10 @@ using UniqueLock = stdx::unique_lock<stdx::mutex>; // The batchSize to use for the query to get all documents from the collection. // 16MB max batch size / 12 byte min doc size * 10 (for good measure) = batchSize to use. const auto batchSize = (16 * 1024 * 1024) / 12 * 10; -// The number of retries for the listIndexes commands. -const size_t numListIndexesRetries = 1; -// The number of retries for the find command, which gets the data. -const size_t numFindRetries = 3; +// The number of attempts for the listIndexes commands. +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncListIndexesAttempts, int, 3); +// The number of attempts for the find command, which gets the data. +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncCollectionFindAttempts, int, 3); } // namespace CollectionCloner::CollectionCloner(executor::TaskExecutor* executor, @@ -86,7 +87,7 @@ CollectionCloner::CollectionCloner(executor::TaskExecutor* executor, rpc::ServerSelectionMetadata(true, boost::none).toBSON(), RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::makeRetryPolicy( - numListIndexesRetries, + numInitialSyncListIndexesAttempts, executor::RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::kAllRetriableErrors)), _findFetcher( @@ -104,7 +105,7 @@ CollectionCloner::CollectionCloner(executor::TaskExecutor* executor, rpc::ServerSelectionMetadata(true, boost::none).toBSON(), RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::makeRetryPolicy( - numFindRetries, + numInitialSyncCollectionFindAttempts, executor::RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::kAllRetriableErrors)), diff --git a/src/mongo/db/repl/data_replicator.cpp b/src/mongo/db/repl/data_replicator.cpp index 554b22ceacd..299b86dd45c 100644 --- a/src/mongo/db/repl/data_replicator.cpp +++ b/src/mongo/db/repl/data_replicator.cpp @@ -52,6 +52,7 @@ #include "mongo/db/repl/rollback_checker.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/sync_source_selector.h" +#include "mongo/db/server_parameters.h" #include "mongo/executor/task_executor.h" #include "mongo/rpc/metadata/repl_set_metadata.h" #include "mongo/rpc/metadata/server_selection_metadata.h" @@ -67,8 +68,6 @@ namespace mongo { namespace repl { -const std::size_t kInitialSyncMaxConnectRetries = 10; - // Failpoint for initial sync MONGO_FP_DECLARE(failInitialSyncWithBadHost); @@ -98,8 +97,11 @@ using QueryResponseStatus = StatusWith<Fetcher::QueryResponse>; using UniqueLock = stdx::unique_lock<stdx::mutex>; using LockGuard = stdx::lock_guard<stdx::mutex>; -// The number of retries for the find command. -const size_t numFindRetries = 3; +// The number of attempts to connect to a sync source. +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncConnectAttempts, int, 10); + +// The number of attempts to call find on the remote oplog. +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncOplogFindAttempts, int, 3); Counter64 initialSyncFailedAttempts; Counter64 initialSyncFailures; @@ -431,7 +433,7 @@ void DataReplicator::slavesHaveProgressed() { } } -StatusWith<Timestamp> DataReplicator::resync(OperationContext* txn, std::size_t maxRetries) { +StatusWith<Timestamp> DataReplicator::resync(OperationContext* txn, std::size_t maxAttempts) { _shutdown(txn); // Drop databases and do initialSync(); CBHStatus cbh = scheduleWork(_exec, [this](OperationContext* txn, const CallbackArgs& cbData) { @@ -444,7 +446,7 @@ StatusWith<Timestamp> DataReplicator::resync(OperationContext* txn, std::size_t _exec->wait(cbh.getValue()); - auto status = doInitialSync(txn, maxRetries); + auto status = doInitialSync(txn, maxAttempts); if (status.isOK()) { return status.getValue().opTime.getTimestamp(); } else { @@ -646,7 +648,7 @@ Status DataReplicator::_runInitialSyncAttempt_inlock(OperationContext* txn, } StatusWith<OpTimeWithHash> DataReplicator::doInitialSync(OperationContext* txn, - std::size_t maxRetries) { + std::size_t maxAttempts) { if (!txn) { std::string msg = "Initial Sync attempted but no OperationContext*, so aborting."; error() << msg; @@ -688,7 +690,7 @@ StatusWith<OpTimeWithHash> DataReplicator::doInitialSync(OperationContext* txn, _storage->setInitialSyncFlag(txn); lk.lock(); - _stats.maxFailedInitialSyncAttempts = maxRetries + 1; + _stats.maxFailedInitialSyncAttempts = maxAttempts; _stats.failedInitialSyncAttempts = 0; while (_stats.failedInitialSyncAttempts < _stats.maxFailedInitialSyncAttempts) { Status attemptErrorStatus(Status::OK()); @@ -707,14 +709,14 @@ StatusWith<OpTimeWithHash> DataReplicator::doInitialSync(OperationContext* txn, if (attemptErrorStatus.isOK()) { if (_syncSource.empty()) { - for (std::size_t i = 0; i < kInitialSyncMaxConnectRetries; ++i) { + for (int i = 0; i < numInitialSyncConnectAttempts; ++i) { attemptErrorStatus = _ensureGoodSyncSource_inlock(); if (attemptErrorStatus.isOK()) { break; } LOG(1) << "Error getting sync source: '" << attemptErrorStatus.toString() << "', trying again in " << _opts.syncSourceRetryWait << ". Attempt " - << i + 1 << " of " << kInitialSyncMaxConnectRetries; + << i + 1 << " of " << numInitialSyncConnectAttempts.load(); sleepmillis(durationCount<Milliseconds>(_opts.syncSourceRetryWait)); } } @@ -826,7 +828,7 @@ void DataReplicator::_onDataClonerFinish(const Status& status, HostAndPort syncS rpc::ServerSelectionMetadata(true, boost::none).toBSON(), RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::makeRetryPolicy( - numFindRetries, + numInitialSyncOplogFindAttempts, executor::RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::kAllRetriableErrors)); Status scheduleStatus = _lastOplogEntryFetcher->schedule(); diff --git a/src/mongo/db/repl/data_replicator.h b/src/mongo/db/repl/data_replicator.h index fe21b3f226f..68427600943 100644 --- a/src/mongo/db/repl/data_replicator.h +++ b/src/mongo/db/repl/data_replicator.h @@ -66,9 +66,6 @@ using UniqueLock = stdx::unique_lock<stdx::mutex>; } // namespace - -extern const std::size_t kInitialSyncMaxConnectRetries; - // TODO: Remove forward declares once we remove rs_initialsync.cpp and other dependents. // Failpoint which fails initial sync and leaves an oplog entry in the buffer. MONGO_FP_FORWARD_DECLARE(failInitSyncWithBufferedEntriesLeft); @@ -232,14 +229,14 @@ public: void slavesHaveProgressed(); // Just like initialSync but can be called any time. - StatusWith<Timestamp> resync(OperationContext* txn, std::size_t maxRetries); + StatusWith<Timestamp> resync(OperationContext* txn, std::size_t maxAttempts); /** - * Does an initial sync, with up to 'kInitialSyncMaxRetries' retries. + * Does an initial sync, with the provided number of attempts. * * This should be the first method called after construction (see class comment). */ - StatusWith<OpTimeWithHash> doInitialSync(OperationContext* txn, std::size_t maxRetries); + StatusWith<OpTimeWithHash> doInitialSync(OperationContext* txn, std::size_t maxAttempts); DataReplicatorState getState() const; diff --git a/src/mongo/db/repl/data_replicator_test.cpp b/src/mongo/db/repl/data_replicator_test.cpp index 7e611129e73..e51912b7878 100644 --- a/src/mongo/db/repl/data_replicator_test.cpp +++ b/src/mongo/db/repl/data_replicator_test.cpp @@ -419,14 +419,14 @@ TEST_F(DataReplicatorTest, StartOk) { TEST_F(DataReplicatorTest, CannotInitialSyncAfterStart) { auto txn = makeOpCtx(); ASSERT_EQ(getDR().start(txn.get()).code(), ErrorCodes::OK); - ASSERT_EQ(getDR().doInitialSync(txn.get(), 0), ErrorCodes::AlreadyInitialized); + ASSERT_EQ(getDR().doInitialSync(txn.get(), 1), ErrorCodes::AlreadyInitialized); } // Used to run a Initial Sync in a separate thread, to avoid blocking test execution. class InitialSyncBackgroundRunner { public: - InitialSyncBackgroundRunner(DataReplicator* dr, std::size_t maxRetries) - : _dr(dr), _maxRetries(maxRetries) {} + InitialSyncBackgroundRunner(DataReplicator* dr, std::size_t maxAttempts) + : _dr(dr), _maxAttempts(maxAttempts) {} ~InitialSyncBackgroundRunner() { if (_thread) { @@ -479,7 +479,7 @@ private: _condVar.notify_all(); lk.unlock(); - auto result = _dr->doInitialSync(txn.get(), _maxRetries); // blocking + auto result = _dr->doInitialSync(txn.get(), _maxAttempts); // blocking lk.lock(); _result = result; @@ -489,7 +489,7 @@ private: StatusWith<OpTimeWithHash> _result{ErrorCodes::NotYetInitialized, "InitialSync not started."}; DataReplicator* _dr; - const std::size_t _maxRetries; + const std::size_t _maxAttempts; std::unique_ptr<stdx::thread> _thread; stdx::condition_variable _condVar; }; @@ -534,9 +534,9 @@ protected: _responses = resps; } - void startSync(std::size_t maxRetries) { + void startSync(std::size_t maxAttempts) { DataReplicator* dr = &(getDR()); - _isbr.reset(new InitialSyncBackgroundRunner(dr, maxRetries)); + _isbr.reset(new InitialSyncBackgroundRunner(dr, maxAttempts)); _isbr->run(); } @@ -742,7 +742,7 @@ TEST_F(InitialSyncTest, Complete) { auto txn = makeOpCtx(); ASSERT_FALSE(getStorage().getInitialSyncFlag(txn.get())); - startSync(0); + startSync(1); // Play first response to ensure data replicator has entered initial sync state. setResponses({responses.begin(), responses.begin() + 1}); @@ -831,7 +831,7 @@ TEST_F(InitialSyncTest, LastOpTimeShouldBeSetEvenIfNoOperationsAreAppliedAfterCl auto txn = makeOpCtx(); ASSERT_FALSE(getStorage().getInitialSyncFlag(txn.get())); - startSync(0); + startSync(1); // Play first response to ensure data replicator has entered initial sync state. setResponses({responses.begin(), responses.begin() + 1}); @@ -873,7 +873,7 @@ TEST_F(InitialSyncTest, Failpoint) { _myLastOpTime = opTime1; _memberState = MemberState::RS_SECONDARY; - startSync(0); + startSync(1); verifySync(getNet(), ErrorCodes::InvalidSyncSource); } @@ -906,7 +906,7 @@ TEST_F(InitialSyncTest, FailsOnClone) { {"replSetGetRBID", fromjson(str::stream() << "{ok: 1, rbid:1}")}, }; - startSync(0); + startSync(1); setResponses(responses); playResponses(); verifySync(getNet(), ErrorCodes::FailedToParse); @@ -964,7 +964,7 @@ TEST_F(InitialSyncTest, FailOnRollback) { {"replSetGetRBID", fromjson(str::stream() << "{ok: 1, rbid:2}")}, }; - startSync(0); + startSync(1); numGetMoreOplogEntriesMax = 5; setResponses(responses); playResponses(); @@ -1022,7 +1022,7 @@ TEST_F(InitialSyncTest, DataReplicatorPassesThroughRollbackCheckerScheduleError) // shutting the executor down. }; - startSync(0); + startSync(1); numGetMoreOplogEntriesMax = 5; setResponses(responses); playResponses(); @@ -1049,7 +1049,7 @@ TEST_F(InitialSyncTest, DataReplicatorPassesThroughOplogFetcherFailure) { << ", op:'i', o:{_id:1, a:1}}]}}")}, }; - startSync(0); + startSync(1); setResponses(responses); playResponses(); @@ -1140,7 +1140,7 @@ TEST_F(InitialSyncTest, OplogOutOfOrderOnOplogFetchFinish) { // Applier starts ... }; - startSync(0); + startSync(1); numGetMoreOplogEntriesMax = 10; setResponses({responses.begin(), responses.end() - 4}); @@ -1206,7 +1206,7 @@ TEST_F(InitialSyncTest, InitialSyncStateIsResetAfterFailure) { // Applier starts ... }; - startSync(1); + startSync(2); numGetMoreOplogEntriesMax = 6; setResponses(responses); @@ -1323,7 +1323,7 @@ TEST_F(InitialSyncTest, GetInitialSyncProgressReturnsCorrectProgress) { // Applier starts ... }; - startSync(1); + startSync(2); // Play first 2 responses to ensure data replicator has started the oplog fetcher. setResponses({failedResponses.begin(), failedResponses.begin() + 2}); @@ -1500,7 +1500,7 @@ TEST_F(InitialSyncTest, DataReplicatorCreatesNewApplierForNextBatchBeforeDestroy {"replSetGetRBID", getRollbackIdResponse}, }; - startSync(0); + startSync(1); setResponses(responses); playResponses(); diff --git a/src/mongo/db/repl/database_cloner.cpp b/src/mongo/db/repl/database_cloner.cpp index a7b8bda03cc..7dbac25dee0 100644 --- a/src/mongo/db/repl/database_cloner.cpp +++ b/src/mongo/db/repl/database_cloner.cpp @@ -40,6 +40,7 @@ #include "mongo/db/catalog/collection_options.h" #include "mongo/db/commands/list_collections_filter.h" #include "mongo/db/repl/storage_interface.h" +#include "mongo/db/server_parameters.h" #include "mongo/rpc/metadata/server_selection_metadata.h" #include "mongo/stdx/functional.h" #include "mongo/util/assert_util.h" @@ -58,8 +59,8 @@ using UniqueLock = stdx::unique_lock<stdx::mutex>; const char* kNameFieldName = "name"; const char* kOptionsFieldName = "options"; -// The number of retries for the listCollections commands. -const int numListCollectionsRetries = 1; +// The number of attempts for the listCollections commands. +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncListCollectionsAttempts, int, 3); /** * Default listCollections predicate. @@ -115,7 +116,7 @@ DatabaseCloner::DatabaseCloner(executor::TaskExecutor* executor, rpc::ServerSelectionMetadata(true, boost::none).toBSON(), RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::makeRetryPolicy( - numListCollectionsRetries, + numInitialSyncListCollectionsAttempts, executor::RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::kAllRetriableErrors)), _startCollectionCloner([](CollectionCloner& cloner) { return cloner.startup(); }) { diff --git a/src/mongo/db/repl/databases_cloner.cpp b/src/mongo/db/repl/databases_cloner.cpp index f92ca3c9215..0527a82dfd0 100644 --- a/src/mongo/db/repl/databases_cloner.cpp +++ b/src/mongo/db/repl/databases_cloner.cpp @@ -40,6 +40,7 @@ #include "mongo/db/catalog/collection_options.h" #include "mongo/db/client.h" #include "mongo/db/repl/storage_interface.h" +#include "mongo/db/server_parameters.h" #include "mongo/rpc/get_status_from_command_result.h" #include "mongo/rpc/metadata/server_selection_metadata.h" #include "mongo/stdx/functional.h" @@ -58,7 +59,8 @@ using Response = executor::RemoteCommandResponse; using LockGuard = stdx::lock_guard<stdx::mutex>; using UniqueLock = stdx::unique_lock<stdx::mutex>; -const size_t numListDatabasesRetries = 1; +// The number of attempts for the listDatabases commands. +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncListDatabasesAttempts, int, 3); } // namespace @@ -187,7 +189,7 @@ Status DatabasesCloner::startup() { listDBsReq, stdx::bind(&DatabasesCloner::_onListDatabaseFinish, this, stdx::placeholders::_1), RemoteCommandRetryScheduler::makeRetryPolicy( - numListDatabasesRetries, + numInitialSyncListDatabasesAttempts, executor::RemoteCommandRequest::kNoTimeout, RemoteCommandRetryScheduler::kAllRetriableErrors)); auto s = _listDBsScheduler->startup(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 9afd1253a35..09df360bf92 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -100,7 +100,7 @@ namespace { const char kLocalDB[] = "local"; -MONGO_EXPORT_STARTUP_SERVER_PARAMETER(numInitialSyncRetries, int, 9); +MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncAttempts, int, 10); void lockAndCall(stdx::unique_lock<stdx::mutex>* lk, const stdx::function<void()>& fn) { if (!lk->owns_lock()) { @@ -602,7 +602,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* txn, _storage); lk.unlock(); - const auto status = _dr->doInitialSync(txn, numInitialSyncRetries); + const auto status = _dr->doInitialSync(txn, numInitialSyncAttempts); // If it is interrupted by resync, we do not need to cleanup the DataReplicator. if (status == ErrorCodes::ShutdownInProgress) { return; diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp index 5c93123bdd4..d4a2116b532 100644 --- a/src/mongo/db/repl/rs_initialsync.cpp +++ b/src/mongo/db/repl/rs_initialsync.cpp @@ -72,6 +72,8 @@ namespace { using std::list; using std::string; +const auto kInitialSyncMaxConnectRetries = 10; + /** * Truncates the oplog (removes any documents) and resets internal variables that were * originally initialized or affected by using values from the oplog at startup time. These |