summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJudah Schvimer <judah@mongodb.com>2016-09-01 09:56:38 -0400
committerJudah Schvimer <judah@mongodb.com>2016-09-01 09:56:38 -0400
commit0dcbbe8777de4b279d095c37c9060210c6a66b29 (patch)
treed8540741f49405af6eb01861d821453f56db4cb0
parent4a3f671a1178ecf9769e6eeef05a74ea4f4765b5 (diff)
downloadmongo-0dcbbe8777de4b279d095c37c9060210c6a66b29.tar.gz
SERVER-25874 Add server parameter configuration for initial sync metadata retry params
-rw-r--r--src/mongo/db/repl/collection_cloner.cpp13
-rw-r--r--src/mongo/db/repl/data_replicator.cpp24
-rw-r--r--src/mongo/db/repl/data_replicator.h9
-rw-r--r--src/mongo/db/repl/data_replicator_test.cpp36
-rw-r--r--src/mongo/db/repl/database_cloner.cpp7
-rw-r--r--src/mongo/db/repl/databases_cloner.cpp6
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp4
-rw-r--r--src/mongo/db/repl/rs_initialsync.cpp2
8 files changed, 53 insertions, 48 deletions
diff --git a/src/mongo/db/repl/collection_cloner.cpp b/src/mongo/db/repl/collection_cloner.cpp
index 83eed5ca3a7..36ff30a37a0 100644
--- a/src/mongo/db/repl/collection_cloner.cpp
+++ b/src/mongo/db/repl/collection_cloner.cpp
@@ -37,6 +37,7 @@
#include "mongo/db/namespace_string.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/storage_interface_mock.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/destructor_guard.h"
#include "mongo/util/log.h"
@@ -52,10 +53,10 @@ using UniqueLock = stdx::unique_lock<stdx::mutex>;
// The batchSize to use for the query to get all documents from the collection.
// 16MB max batch size / 12 byte min doc size * 10 (for good measure) = batchSize to use.
const auto batchSize = (16 * 1024 * 1024) / 12 * 10;
-// The number of retries for the listIndexes commands.
-const size_t numListIndexesRetries = 1;
-// The number of retries for the find command, which gets the data.
-const size_t numFindRetries = 3;
+// The number of attempts for the listIndexes commands.
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncListIndexesAttempts, int, 3);
+// The number of attempts for the find command, which gets the data.
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncCollectionFindAttempts, int, 3);
} // namespace
CollectionCloner::CollectionCloner(executor::TaskExecutor* executor,
@@ -86,7 +87,7 @@ CollectionCloner::CollectionCloner(executor::TaskExecutor* executor,
rpc::ServerSelectionMetadata(true, boost::none).toBSON(),
RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::makeRetryPolicy(
- numListIndexesRetries,
+ numInitialSyncListIndexesAttempts,
executor::RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::kAllRetriableErrors)),
_findFetcher(
@@ -104,7 +105,7 @@ CollectionCloner::CollectionCloner(executor::TaskExecutor* executor,
rpc::ServerSelectionMetadata(true, boost::none).toBSON(),
RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::makeRetryPolicy(
- numFindRetries,
+ numInitialSyncCollectionFindAttempts,
executor::RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::kAllRetriableErrors)),
diff --git a/src/mongo/db/repl/data_replicator.cpp b/src/mongo/db/repl/data_replicator.cpp
index 554b22ceacd..299b86dd45c 100644
--- a/src/mongo/db/repl/data_replicator.cpp
+++ b/src/mongo/db/repl/data_replicator.cpp
@@ -52,6 +52,7 @@
#include "mongo/db/repl/rollback_checker.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/sync_source_selector.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/executor/task_executor.h"
#include "mongo/rpc/metadata/repl_set_metadata.h"
#include "mongo/rpc/metadata/server_selection_metadata.h"
@@ -67,8 +68,6 @@
namespace mongo {
namespace repl {
-const std::size_t kInitialSyncMaxConnectRetries = 10;
-
// Failpoint for initial sync
MONGO_FP_DECLARE(failInitialSyncWithBadHost);
@@ -98,8 +97,11 @@ using QueryResponseStatus = StatusWith<Fetcher::QueryResponse>;
using UniqueLock = stdx::unique_lock<stdx::mutex>;
using LockGuard = stdx::lock_guard<stdx::mutex>;
-// The number of retries for the find command.
-const size_t numFindRetries = 3;
+// The number of attempts to connect to a sync source.
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncConnectAttempts, int, 10);
+
+// The number of attempts to call find on the remote oplog.
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncOplogFindAttempts, int, 3);
Counter64 initialSyncFailedAttempts;
Counter64 initialSyncFailures;
@@ -431,7 +433,7 @@ void DataReplicator::slavesHaveProgressed() {
}
}
-StatusWith<Timestamp> DataReplicator::resync(OperationContext* txn, std::size_t maxRetries) {
+StatusWith<Timestamp> DataReplicator::resync(OperationContext* txn, std::size_t maxAttempts) {
_shutdown(txn);
// Drop databases and do initialSync();
CBHStatus cbh = scheduleWork(_exec, [this](OperationContext* txn, const CallbackArgs& cbData) {
@@ -444,7 +446,7 @@ StatusWith<Timestamp> DataReplicator::resync(OperationContext* txn, std::size_t
_exec->wait(cbh.getValue());
- auto status = doInitialSync(txn, maxRetries);
+ auto status = doInitialSync(txn, maxAttempts);
if (status.isOK()) {
return status.getValue().opTime.getTimestamp();
} else {
@@ -646,7 +648,7 @@ Status DataReplicator::_runInitialSyncAttempt_inlock(OperationContext* txn,
}
StatusWith<OpTimeWithHash> DataReplicator::doInitialSync(OperationContext* txn,
- std::size_t maxRetries) {
+ std::size_t maxAttempts) {
if (!txn) {
std::string msg = "Initial Sync attempted but no OperationContext*, so aborting.";
error() << msg;
@@ -688,7 +690,7 @@ StatusWith<OpTimeWithHash> DataReplicator::doInitialSync(OperationContext* txn,
_storage->setInitialSyncFlag(txn);
lk.lock();
- _stats.maxFailedInitialSyncAttempts = maxRetries + 1;
+ _stats.maxFailedInitialSyncAttempts = maxAttempts;
_stats.failedInitialSyncAttempts = 0;
while (_stats.failedInitialSyncAttempts < _stats.maxFailedInitialSyncAttempts) {
Status attemptErrorStatus(Status::OK());
@@ -707,14 +709,14 @@ StatusWith<OpTimeWithHash> DataReplicator::doInitialSync(OperationContext* txn,
if (attemptErrorStatus.isOK()) {
if (_syncSource.empty()) {
- for (std::size_t i = 0; i < kInitialSyncMaxConnectRetries; ++i) {
+ for (int i = 0; i < numInitialSyncConnectAttempts; ++i) {
attemptErrorStatus = _ensureGoodSyncSource_inlock();
if (attemptErrorStatus.isOK()) {
break;
}
LOG(1) << "Error getting sync source: '" << attemptErrorStatus.toString()
<< "', trying again in " << _opts.syncSourceRetryWait << ". Attempt "
- << i + 1 << " of " << kInitialSyncMaxConnectRetries;
+ << i + 1 << " of " << numInitialSyncConnectAttempts.load();
sleepmillis(durationCount<Milliseconds>(_opts.syncSourceRetryWait));
}
}
@@ -826,7 +828,7 @@ void DataReplicator::_onDataClonerFinish(const Status& status, HostAndPort syncS
rpc::ServerSelectionMetadata(true, boost::none).toBSON(),
RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::makeRetryPolicy(
- numFindRetries,
+ numInitialSyncOplogFindAttempts,
executor::RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::kAllRetriableErrors));
Status scheduleStatus = _lastOplogEntryFetcher->schedule();
diff --git a/src/mongo/db/repl/data_replicator.h b/src/mongo/db/repl/data_replicator.h
index fe21b3f226f..68427600943 100644
--- a/src/mongo/db/repl/data_replicator.h
+++ b/src/mongo/db/repl/data_replicator.h
@@ -66,9 +66,6 @@ using UniqueLock = stdx::unique_lock<stdx::mutex>;
} // namespace
-
-extern const std::size_t kInitialSyncMaxConnectRetries;
-
// TODO: Remove forward declares once we remove rs_initialsync.cpp and other dependents.
// Failpoint which fails initial sync and leaves an oplog entry in the buffer.
MONGO_FP_FORWARD_DECLARE(failInitSyncWithBufferedEntriesLeft);
@@ -232,14 +229,14 @@ public:
void slavesHaveProgressed();
// Just like initialSync but can be called any time.
- StatusWith<Timestamp> resync(OperationContext* txn, std::size_t maxRetries);
+ StatusWith<Timestamp> resync(OperationContext* txn, std::size_t maxAttempts);
/**
- * Does an initial sync, with up to 'kInitialSyncMaxRetries' retries.
+ * Does an initial sync, with the provided number of attempts.
*
* This should be the first method called after construction (see class comment).
*/
- StatusWith<OpTimeWithHash> doInitialSync(OperationContext* txn, std::size_t maxRetries);
+ StatusWith<OpTimeWithHash> doInitialSync(OperationContext* txn, std::size_t maxAttempts);
DataReplicatorState getState() const;
diff --git a/src/mongo/db/repl/data_replicator_test.cpp b/src/mongo/db/repl/data_replicator_test.cpp
index 7e611129e73..e51912b7878 100644
--- a/src/mongo/db/repl/data_replicator_test.cpp
+++ b/src/mongo/db/repl/data_replicator_test.cpp
@@ -419,14 +419,14 @@ TEST_F(DataReplicatorTest, StartOk) {
TEST_F(DataReplicatorTest, CannotInitialSyncAfterStart) {
auto txn = makeOpCtx();
ASSERT_EQ(getDR().start(txn.get()).code(), ErrorCodes::OK);
- ASSERT_EQ(getDR().doInitialSync(txn.get(), 0), ErrorCodes::AlreadyInitialized);
+ ASSERT_EQ(getDR().doInitialSync(txn.get(), 1), ErrorCodes::AlreadyInitialized);
}
// Used to run a Initial Sync in a separate thread, to avoid blocking test execution.
class InitialSyncBackgroundRunner {
public:
- InitialSyncBackgroundRunner(DataReplicator* dr, std::size_t maxRetries)
- : _dr(dr), _maxRetries(maxRetries) {}
+ InitialSyncBackgroundRunner(DataReplicator* dr, std::size_t maxAttempts)
+ : _dr(dr), _maxAttempts(maxAttempts) {}
~InitialSyncBackgroundRunner() {
if (_thread) {
@@ -479,7 +479,7 @@ private:
_condVar.notify_all();
lk.unlock();
- auto result = _dr->doInitialSync(txn.get(), _maxRetries); // blocking
+ auto result = _dr->doInitialSync(txn.get(), _maxAttempts); // blocking
lk.lock();
_result = result;
@@ -489,7 +489,7 @@ private:
StatusWith<OpTimeWithHash> _result{ErrorCodes::NotYetInitialized, "InitialSync not started."};
DataReplicator* _dr;
- const std::size_t _maxRetries;
+ const std::size_t _maxAttempts;
std::unique_ptr<stdx::thread> _thread;
stdx::condition_variable _condVar;
};
@@ -534,9 +534,9 @@ protected:
_responses = resps;
}
- void startSync(std::size_t maxRetries) {
+ void startSync(std::size_t maxAttempts) {
DataReplicator* dr = &(getDR());
- _isbr.reset(new InitialSyncBackgroundRunner(dr, maxRetries));
+ _isbr.reset(new InitialSyncBackgroundRunner(dr, maxAttempts));
_isbr->run();
}
@@ -742,7 +742,7 @@ TEST_F(InitialSyncTest, Complete) {
auto txn = makeOpCtx();
ASSERT_FALSE(getStorage().getInitialSyncFlag(txn.get()));
- startSync(0);
+ startSync(1);
// Play first response to ensure data replicator has entered initial sync state.
setResponses({responses.begin(), responses.begin() + 1});
@@ -831,7 +831,7 @@ TEST_F(InitialSyncTest, LastOpTimeShouldBeSetEvenIfNoOperationsAreAppliedAfterCl
auto txn = makeOpCtx();
ASSERT_FALSE(getStorage().getInitialSyncFlag(txn.get()));
- startSync(0);
+ startSync(1);
// Play first response to ensure data replicator has entered initial sync state.
setResponses({responses.begin(), responses.begin() + 1});
@@ -873,7 +873,7 @@ TEST_F(InitialSyncTest, Failpoint) {
_myLastOpTime = opTime1;
_memberState = MemberState::RS_SECONDARY;
- startSync(0);
+ startSync(1);
verifySync(getNet(), ErrorCodes::InvalidSyncSource);
}
@@ -906,7 +906,7 @@ TEST_F(InitialSyncTest, FailsOnClone) {
{"replSetGetRBID", fromjson(str::stream() << "{ok: 1, rbid:1}")},
};
- startSync(0);
+ startSync(1);
setResponses(responses);
playResponses();
verifySync(getNet(), ErrorCodes::FailedToParse);
@@ -964,7 +964,7 @@ TEST_F(InitialSyncTest, FailOnRollback) {
{"replSetGetRBID", fromjson(str::stream() << "{ok: 1, rbid:2}")},
};
- startSync(0);
+ startSync(1);
numGetMoreOplogEntriesMax = 5;
setResponses(responses);
playResponses();
@@ -1022,7 +1022,7 @@ TEST_F(InitialSyncTest, DataReplicatorPassesThroughRollbackCheckerScheduleError)
// shutting the executor down.
};
- startSync(0);
+ startSync(1);
numGetMoreOplogEntriesMax = 5;
setResponses(responses);
playResponses();
@@ -1049,7 +1049,7 @@ TEST_F(InitialSyncTest, DataReplicatorPassesThroughOplogFetcherFailure) {
<< ", op:'i', o:{_id:1, a:1}}]}}")},
};
- startSync(0);
+ startSync(1);
setResponses(responses);
playResponses();
@@ -1140,7 +1140,7 @@ TEST_F(InitialSyncTest, OplogOutOfOrderOnOplogFetchFinish) {
// Applier starts ...
};
- startSync(0);
+ startSync(1);
numGetMoreOplogEntriesMax = 10;
setResponses({responses.begin(), responses.end() - 4});
@@ -1206,7 +1206,7 @@ TEST_F(InitialSyncTest, InitialSyncStateIsResetAfterFailure) {
// Applier starts ...
};
- startSync(1);
+ startSync(2);
numGetMoreOplogEntriesMax = 6;
setResponses(responses);
@@ -1323,7 +1323,7 @@ TEST_F(InitialSyncTest, GetInitialSyncProgressReturnsCorrectProgress) {
// Applier starts ...
};
- startSync(1);
+ startSync(2);
// Play first 2 responses to ensure data replicator has started the oplog fetcher.
setResponses({failedResponses.begin(), failedResponses.begin() + 2});
@@ -1500,7 +1500,7 @@ TEST_F(InitialSyncTest, DataReplicatorCreatesNewApplierForNextBatchBeforeDestroy
{"replSetGetRBID", getRollbackIdResponse},
};
- startSync(0);
+ startSync(1);
setResponses(responses);
playResponses();
diff --git a/src/mongo/db/repl/database_cloner.cpp b/src/mongo/db/repl/database_cloner.cpp
index a7b8bda03cc..7dbac25dee0 100644
--- a/src/mongo/db/repl/database_cloner.cpp
+++ b/src/mongo/db/repl/database_cloner.cpp
@@ -40,6 +40,7 @@
#include "mongo/db/catalog/collection_options.h"
#include "mongo/db/commands/list_collections_filter.h"
#include "mongo/db/repl/storage_interface.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/rpc/metadata/server_selection_metadata.h"
#include "mongo/stdx/functional.h"
#include "mongo/util/assert_util.h"
@@ -58,8 +59,8 @@ using UniqueLock = stdx::unique_lock<stdx::mutex>;
const char* kNameFieldName = "name";
const char* kOptionsFieldName = "options";
-// The number of retries for the listCollections commands.
-const int numListCollectionsRetries = 1;
+// The number of attempts for the listCollections commands.
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncListCollectionsAttempts, int, 3);
/**
* Default listCollections predicate.
@@ -115,7 +116,7 @@ DatabaseCloner::DatabaseCloner(executor::TaskExecutor* executor,
rpc::ServerSelectionMetadata(true, boost::none).toBSON(),
RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::makeRetryPolicy(
- numListCollectionsRetries,
+ numInitialSyncListCollectionsAttempts,
executor::RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::kAllRetriableErrors)),
_startCollectionCloner([](CollectionCloner& cloner) { return cloner.startup(); }) {
diff --git a/src/mongo/db/repl/databases_cloner.cpp b/src/mongo/db/repl/databases_cloner.cpp
index f92ca3c9215..0527a82dfd0 100644
--- a/src/mongo/db/repl/databases_cloner.cpp
+++ b/src/mongo/db/repl/databases_cloner.cpp
@@ -40,6 +40,7 @@
#include "mongo/db/catalog/collection_options.h"
#include "mongo/db/client.h"
#include "mongo/db/repl/storage_interface.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/rpc/get_status_from_command_result.h"
#include "mongo/rpc/metadata/server_selection_metadata.h"
#include "mongo/stdx/functional.h"
@@ -58,7 +59,8 @@ using Response = executor::RemoteCommandResponse;
using LockGuard = stdx::lock_guard<stdx::mutex>;
using UniqueLock = stdx::unique_lock<stdx::mutex>;
-const size_t numListDatabasesRetries = 1;
+// The number of attempts for the listDatabases commands.
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncListDatabasesAttempts, int, 3);
} // namespace
@@ -187,7 +189,7 @@ Status DatabasesCloner::startup() {
listDBsReq,
stdx::bind(&DatabasesCloner::_onListDatabaseFinish, this, stdx::placeholders::_1),
RemoteCommandRetryScheduler::makeRetryPolicy(
- numListDatabasesRetries,
+ numInitialSyncListDatabasesAttempts,
executor::RemoteCommandRequest::kNoTimeout,
RemoteCommandRetryScheduler::kAllRetriableErrors));
auto s = _listDBsScheduler->startup();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 9afd1253a35..09df360bf92 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -100,7 +100,7 @@ namespace {
const char kLocalDB[] = "local";
-MONGO_EXPORT_STARTUP_SERVER_PARAMETER(numInitialSyncRetries, int, 9);
+MONGO_EXPORT_SERVER_PARAMETER(numInitialSyncAttempts, int, 10);
void lockAndCall(stdx::unique_lock<stdx::mutex>* lk, const stdx::function<void()>& fn) {
if (!lk->owns_lock()) {
@@ -602,7 +602,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* txn,
_storage);
lk.unlock();
- const auto status = _dr->doInitialSync(txn, numInitialSyncRetries);
+ const auto status = _dr->doInitialSync(txn, numInitialSyncAttempts);
// If it is interrupted by resync, we do not need to cleanup the DataReplicator.
if (status == ErrorCodes::ShutdownInProgress) {
return;
diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp
index 5c93123bdd4..d4a2116b532 100644
--- a/src/mongo/db/repl/rs_initialsync.cpp
+++ b/src/mongo/db/repl/rs_initialsync.cpp
@@ -72,6 +72,8 @@ namespace {
using std::list;
using std::string;
+const auto kInitialSyncMaxConnectRetries = 10;
+
/**
* Truncates the oplog (removes any documents) and resets internal variables that were
* originally initialized or affected by using values from the oplog at startup time. These