diff options
author | Judah Schvimer <judah@mongodb.com> | 2017-07-11 10:14:16 -0400 |
---|---|---|
committer | Judah Schvimer <judah@mongodb.com> | 2017-07-17 11:13:08 -0400 |
commit | cbd74166d8930657f63e8e90791d276e15c3d3cb (patch) | |
tree | 0188f8c485433ffb622ba35d4cf7d9cbc7c93a15 | |
parent | a109a23f12a957e127814cd9bd82ce18f8437f9b (diff) | |
download | mongo-cbd74166d8930657f63e8e90791d276e15c3d3cb.tar.gz |
SERVER-27581 Only fetch missing documents on update oplog entries during initial
sync
(cherry picked from commit f33e9a715cf3eb4d29d8d5c5a926e5edacf9f63d)
-rw-r--r-- | jstests/replsets/initial_sync_update_missing_doc1.js | 5 | ||||
-rw-r--r-- | jstests/replsets/initial_sync_update_missing_doc2.js | 4 | ||||
-rw-r--r-- | src/mongo/base/error_codes.err | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/master_slave.cpp | 24 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/sync_tail.cpp | 94 | ||||
-rw-r--r-- | src/mongo/db/repl/sync_tail.h | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/sync_tail_test.cpp | 50 | ||||
-rw-r--r-- | src/mongo/dbtests/repltests.cpp | 12 |
9 files changed, 84 insertions, 121 deletions
diff --git a/jstests/replsets/initial_sync_update_missing_doc1.js b/jstests/replsets/initial_sync_update_missing_doc1.js index d2d3c6aed58..4421b702bc2 100644 --- a/jstests/replsets/initial_sync_update_missing_doc1.js +++ b/jstests/replsets/initial_sync_update_missing_doc1.js @@ -54,10 +54,9 @@ {configureFailPoint: 'initialSyncHangBeforeCopyingDatabases', mode: 'off'})); checkLog.contains(secondary, 'update of non-mod failed'); - checkLog.contains(secondary, 'adding missing object'); + checkLog.contains(secondary, 'Fetching missing document'); checkLog.contains( secondary, 'initial sync - initialSyncHangBeforeGettingMissingDocument fail point enabled'); - var res = assert.commandWorked(secondary.adminCommand({replSetGetStatus: 1, initialSync: 1})); assert.eq(res.initialSyncStatus.fetchedMissingDocs, 0); var firstOplogEnd = res.initialSyncStatus.initialSyncOplogEnd; @@ -68,7 +67,7 @@ assert.commandWorked(secondary.getDB('admin').runCommand( {configureFailPoint: 'initialSyncHangBeforeGettingMissingDocument', mode: 'off'})); checkLog.contains(secondary, - 'missing object not found on source. presumably deleted later in oplog'); + 'Missing document not found on source; presumably deleted later in oplog.'); checkLog.contains(secondary, 'initial sync done'); replSet.awaitReplication(); diff --git a/jstests/replsets/initial_sync_update_missing_doc2.js b/jstests/replsets/initial_sync_update_missing_doc2.js index 9418757468c..6809b355a77 100644 --- a/jstests/replsets/initial_sync_update_missing_doc2.js +++ b/jstests/replsets/initial_sync_update_missing_doc2.js @@ -54,7 +54,7 @@ {configureFailPoint: 'initialSyncHangBeforeCopyingDatabases', mode: 'off'})); checkLog.contains(secondary, 'update of non-mod failed'); - checkLog.contains(secondary, 'adding missing object'); + checkLog.contains(secondary, 'Fetching missing document'); checkLog.contains( secondary, 'initial sync - initialSyncHangBeforeGettingMissingDocument fail point enabled'); @@ -70,7 +70,7 @@ assert.commandWorked(secondary.getDB('admin').runCommand( {configureFailPoint: 'initialSyncHangBeforeGettingMissingDocument', mode: 'off'})); - checkLog.contains(secondary, 'inserted missing doc:'); + checkLog.contains(secondary, 'Inserted missing document'); secondary.getDB('test').setLogLevel(0, 'replication'); checkLog.contains(secondary, 'initial sync done'); diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err index c090a203f77..c8def6bd695 100644 --- a/src/mongo/base/error_codes.err +++ b/src/mongo/base/error_codes.err @@ -201,6 +201,7 @@ error_code("ChunkRangeCleanupPending", 200) error_code("CannotBuildIndexKeys", 201) error_code("NetworkInterfaceExceededTimeLimit", 202) error_code("TooManyLocks", 208) +error_code("UpdateOperationFailed", 218) # Non-sequential error codes (for compatibility only) error_code("SocketException", 9001) diff --git a/src/mongo/db/repl/master_slave.cpp b/src/mongo/db/repl/master_slave.cpp index 05faad12599..96cfe58efdc 100644 --- a/src/mongo/db/repl/master_slave.cpp +++ b/src/mongo/db/repl/master_slave.cpp @@ -645,15 +645,7 @@ bool ReplSource::handleDuplicateDbName(OperationContext* txn, void ReplSource::applyCommand(OperationContext* txn, const BSONObj& op) { try { Status status = applyCommand_inlock(txn, op, true); - if (!status.isOK()) { - SyncTail sync(nullptr, SyncTail::MultiSyncApplyFunc()); - sync.setHostname(hostName); - if (sync.shouldRetry(txn, op)) { - uassert(28639, - "Failure retrying initial sync update", - applyCommand_inlock(txn, op, true).isOK()); - } - } + uassert(28639, "Failure applying initial sync command", status.isOK()); } catch (UserException& e) { log() << "sync: caught user assertion " << redact(e) << " while applying op: " << redact(op) << endl; @@ -669,13 +661,17 @@ void ReplSource::applyOperation(OperationContext* txn, Database* db, const BSONO try { Status status = applyOperation_inlock(txn, db, op); if (!status.isOK()) { + uassert(15914, + "Failure applying initial sync operation", + status == ErrorCodes::UpdateOperationFailed); + + // In initial sync, update operations can cause documents to be missed during + // collection cloning. As a result, it is possible that a document that we need to + // update is not present locally. In that case we fetch the document from the + // sync source. SyncTail sync(nullptr, SyncTail::MultiSyncApplyFunc()); sync.setHostname(hostName); - if (sync.shouldRetry(txn, op)) { - uassert(15914, - "Failure retrying initial sync update", - applyOperation_inlock(txn, db, op).isOK()); - } + sync.fetchAndInsertMissingDocument(txn, op); } } catch (UserException& e) { log() << "sync: caught user assertion " << redact(e) << " while applying op: " << redact(op) diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 4868d7f4a75..69bc38cd3e5 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -916,7 +916,7 @@ Status applyOperation_inlock(OperationContext* txn, // was a simple { _id : ... } update criteria string msg = str::stream() << "failed to apply update: " << redact(op); error() << msg; - return Status(ErrorCodes::OperationFailed, msg); + return Status(ErrorCodes::UpdateOperationFailed, msg); } // Need to check to see if it isn't present so we can exit early with a // failure. Note that adds some overhead for this extra check in some cases, @@ -932,7 +932,7 @@ Status applyOperation_inlock(OperationContext* txn, Helpers::findOne(txn, collection, updateCriteria, false).isNull())) { string msg = str::stream() << "couldn't find doc: " << redact(op); error() << msg; - return Status(ErrorCodes::OperationFailed, msg); + return Status(ErrorCodes::UpdateOperationFailed, msg); } // Otherwise, it's present; zero objects were updated because of additional @@ -944,7 +944,7 @@ Status applyOperation_inlock(OperationContext* txn, if (!upsert) { string msg = str::stream() << "update of non-mod failed: " << redact(op); error() << msg; - return Status(ErrorCodes::OperationFailed, msg); + return Status(ErrorCodes::UpdateOperationFailed, msg); } } } diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp index edb82cc82a4..8c0c0229508 100644 --- a/src/mongo/db/repl/sync_tail.cpp +++ b/src/mongo/db/repl/sync_tail.cpp @@ -935,17 +935,10 @@ OldThreadPool* SyncTail::getWriterPool() { return _writerPool.get(); } -BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) { +BSONObj SyncTail::getMissingDoc(OperationContext* txn, const BSONObj& o) { OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query? const char* ns = o.getStringField("ns"); - // capped collections - Collection* collection = db->getCollection(ns); - if (collection && collection->isCapped()) { - log() << "missing doc, but this is okay for a capped collection (" << ns << ")"; - return BSONObj(); - } - if (MONGO_FAIL_POINT(initialSyncHangBeforeGettingMissingDocument)) { log() << "initial sync - initialSyncHangBeforeGettingMissingDocument fail point enabled. " "Blocking until fail point is disabled."; @@ -1004,43 +997,52 @@ BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONO str::stream() << "Can no longer connect to initial sync source: " << _hostname); } -bool SyncTail::shouldRetry(OperationContext* txn, const BSONObj& o) { +bool SyncTail::fetchAndInsertMissingDocument(OperationContext* txn, const BSONObj& o) { const NamespaceString nss(o.getStringField("ns")); + + { + // If the document is in a capped collection then it's okay for it to be missing. + AutoGetCollectionForRead autoColl(txn, nss); + Collection* const collection = autoColl.getCollection(); + if (collection && collection->isCapped()) { + log() << "Not fetching missing document in capped collection (" << nss << ")"; + return false; + } + } + + log() << "Fetching missing document: " << redact(o); + BSONObj missingObj = getMissingDoc(txn, o); + + if (missingObj.isEmpty()) { + log() << "Missing document not found on source; presumably deleted later in oplog. o first " + "field: " + << o.getObjectField("o").firstElementFieldName() + << ", o2: " << redact(o.getObjectField("o2")); + + return false; + } + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { // Take an X lock on the database in order to preclude other modifications. // Also, the database might not exist yet, so create it. AutoGetOrCreateDb autoDb(txn, nss.db(), MODE_X); Database* const db = autoDb.getDb(); - // we don't have the object yet, which is possible on initial sync. get it. - log() << "adding missing object" << endl; // rare enough we can log + WriteUnitOfWork wunit(txn); - BSONObj missingObj = getMissingDoc(txn, db, o); + Collection* const coll = db->getOrCreateCollection(txn, nss.toString()); + invariant(coll); - if (missingObj.isEmpty()) { - log() << "missing object not found on source." - " presumably deleted later in oplog"; - log() << "o2: " << redact(o.getObjectField("o2")); - log() << "o firstfield: " << o.getObjectField("o").firstElementFieldName(); + OpDebug* const nullOpDebug = nullptr; + Status status = coll->insertDocument(txn, missingObj, nullOpDebug, true); + uassert(15917, + str::stream() << "Failed to insert missing document: " << status.toString(), + status.isOK()); - return false; - } else { - WriteUnitOfWork wunit(txn); - - Collection* const coll = db->getOrCreateCollection(txn, nss.toString()); - invariant(coll); + LOG(1) << "Inserted missing document: " << redact(missingObj); - OpDebug* const nullOpDebug = nullptr; - Status status = coll->insertDocument(txn, missingObj, nullOpDebug, true); - uassert(15917, - str::stream() << "failed to insert missing doc: " << status.toString(), - status.isOK()); - - LOG(1) << "inserted missing doc: " << redact(missingObj); - - wunit.commit(); - return true; - } + wunit.commit(); + return true; } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "InsertRetry", nss.ns()); @@ -1196,27 +1198,19 @@ Status multiInitialSyncApply_noAbort(OperationContext* txn, try { const Status s = SyncTail::syncApply(txn, entry.raw, inSteadyStateReplication); if (!s.isOK()) { - // Don't retry on commands. - if (entry.isCommand()) { - error() << "Error applying command (" << redact(entry.raw) - << "): " << redact(s); + // In initial sync, update operations can cause documents to be missed during + // collection cloning. As a result, it is possible that a document that we need to + // update is not present locally. In that case we fetch the document from the + // sync source. + if (s != ErrorCodes::UpdateOperationFailed) { + error() << "Error applying operation: " << redact(s) << " (" + << redact(entry.raw) << ")"; return s; } // We might need to fetch the missing docs from the sync source. fetchCount->fetchAndAdd(1); - if (st->shouldRetry(txn, entry.raw)) { - const Status s2 = SyncTail::syncApply(txn, entry.raw, inSteadyStateReplication); - if (!s2.isOK()) { - severe() << "Error applying operation (" << redact(entry.raw) - << "): " << redact(s2); - return s2; - } - } - - // If shouldRetry() returns false, fall through. - // This can happen if the document that was moved and missed by Cloner - // subsequently got deleted and no longer exists on the Sync Target at all + st->fetchAndInsertMissingDocument(txn, entry.raw); } } catch (const DBException& e) { // SERVER-24927 If we have a NamespaceNotFound exception, then this document will be diff --git a/src/mongo/db/repl/sync_tail.h b/src/mongo/db/repl/sync_tail.h index ec133599bd3..1ad240c7890 100644 --- a/src/mongo/db/repl/sync_tail.h +++ b/src/mongo/db/repl/sync_tail.h @@ -200,12 +200,15 @@ public: /** * Fetch a single document referenced in the operation from the sync source. */ - virtual BSONObj getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o); + virtual BSONObj getMissingDoc(OperationContext* txn, const BSONObj& o); /** - * If applyOperation_inlock should be called again after an update fails. + * If an update fails, fetches the missing document and inserts it into the local collection. + * + * Returns true if the document was fetched and inserted successfully. */ - virtual bool shouldRetry(OperationContext* txn, const BSONObj& o); + virtual bool fetchAndInsertMissingDocument(OperationContext* txn, const BSONObj& o); + void setHostname(const std::string& hostname); /** diff --git a/src/mongo/db/repl/sync_tail_test.cpp b/src/mongo/db/repl/sync_tail_test.cpp index d73a7593829..1ea383ff9b3 100644 --- a/src/mongo/db/repl/sync_tail_test.cpp +++ b/src/mongo/db/repl/sync_tail_test.cpp @@ -91,19 +91,19 @@ protected: class SyncTailWithLocalDocumentFetcher : public SyncTail { public: SyncTailWithLocalDocumentFetcher(const BSONObj& document); - BSONObj getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) override; + BSONObj getMissingDoc(OperationContext* txn, const BSONObj& o) override; private: BSONObj _document; }; /** - * Testing-only SyncTail that checks the operation context in shouldRetry(). + * Testing-only SyncTail that checks the operation context in fetchAndInsertMissingDocument(). */ class SyncTailWithOperationContextChecker : public SyncTail { public: SyncTailWithOperationContextChecker(); - bool shouldRetry(OperationContext* txn, const BSONObj& o) override; + bool fetchAndInsertMissingDocument(OperationContext* txn, const BSONObj& o) override; }; void SyncTailTest::setUp() { @@ -142,16 +142,15 @@ void SyncTailTest::tearDown() { SyncTailWithLocalDocumentFetcher::SyncTailWithLocalDocumentFetcher(const BSONObj& document) : SyncTail(nullptr, SyncTail::MultiSyncApplyFunc(), nullptr), _document(document) {} -BSONObj SyncTailWithLocalDocumentFetcher::getMissingDoc(OperationContext*, - Database*, - const BSONObj&) { +BSONObj SyncTailWithLocalDocumentFetcher::getMissingDoc(OperationContext*, const BSONObj&) { return _document; } SyncTailWithOperationContextChecker::SyncTailWithOperationContextChecker() : SyncTail(nullptr, SyncTail::MultiSyncApplyFunc(), nullptr) {} -bool SyncTailWithOperationContextChecker::shouldRetry(OperationContext* txn, const BSONObj&) { +bool SyncTailWithOperationContextChecker::fetchAndInsertMissingDocument(OperationContext* txn, + const BSONObj&) { ASSERT_FALSE(txn->writesAreReplicated()); ASSERT_FALSE(txn->lockState()->shouldConflictWithSecondaryBatchApplication()); ASSERT_TRUE(documentValidationDisabled(txn)); @@ -869,8 +868,7 @@ TEST_F(SyncTailTest, MultiInitialSyncApplyDisablesDocumentValidationWhileApplyin ASSERT_EQUALS(fetchCount.load(), 1U); } -TEST_F(SyncTailTest, - MultiInitialSyncApplyDoesNotRetryFailedUpdateIfDocumentIsMissingFromSyncSource) { +TEST_F(SyncTailTest, MultiInitialSyncApplyIgnoresUpdateOperationIfDocumentIsMissingFromSyncSource) { BSONObj emptyDoc; SyncTailWithLocalDocumentFetcher syncTail(emptyDoc); NamespaceString nss("local." + _agent.getSuiteName() + "_" + _agent.getTestName()); @@ -938,10 +936,11 @@ TEST_F(SyncTailTest, MultiInitialSyncApplySkipsIndexCreationOnNamespaceNotFound) ASSERT_FALSE(AutoGetCollectionForRead(_opCtx.get(), badNss).getCollection()); } -TEST_F(SyncTailTest, MultiInitialSyncApplyRetriesFailedUpdateIfDocumentIsAvailableFromSyncSource) { +TEST_F(SyncTailTest, + MultiInitialSyncApplyFetchesMissingDocumentIfDocumentIsAvailableFromSyncSource) { SyncTailWithLocalDocumentFetcher syncTail(BSON("_id" << 0 << "x" << 1)); NamespaceString nss("local." + _agent.getSuiteName() + "_" + _agent.getTestName()); - auto updatedDocument = BSON("_id" << 0 << "x" << 2); + auto updatedDocument = BSON("_id" << 0 << "x" << 1); auto op = makeUpdateDocumentOplogEntry( {Timestamp(Seconds(1), 0), 1LL}, nss, BSON("_id" << 0), updatedDocument); MultiApplier::OperationPtrs ops = {&op}; @@ -958,35 +957,6 @@ TEST_F(SyncTailTest, MultiInitialSyncApplyRetriesFailedUpdateIfDocumentIsAvailab ASSERT_EQUALS(ErrorCodes::CollectionIsEmpty, iter->next().getStatus()); } -TEST_F(SyncTailTest, MultiInitialSyncApplyPassesThroughSyncApplyErrorAfterFailingToRetryBadOp) { - SyncTailWithLocalDocumentFetcher syncTail(BSON("_id" << 0 << "x" << 1)); - NamespaceString nss("local." + _agent.getSuiteName() + "_" + _agent.getTestName()); - OplogEntry op(BSON("op" - << "x" - << "ns" - << nss.ns())); - MultiApplier::OperationPtrs ops = {&op}; - AtomicUInt32 fetchCount(0); - ASSERT_EQUALS(ErrorCodes::BadValue, - multiInitialSyncApply_noAbort(_opCtx.get(), &ops, &syncTail, &fetchCount)); - ASSERT_EQUALS(fetchCount.load(), 1U); -} - -TEST_F(SyncTailTest, MultiInitialSyncApplyPassesThroughShouldSyncTailRetryError) { - SyncTail syncTail(nullptr, SyncTail::MultiSyncApplyFunc(), nullptr); - NamespaceString nss("local." + _agent.getSuiteName() + "_" + _agent.getTestName()); - auto op = makeUpdateDocumentOplogEntry( - {Timestamp(Seconds(1), 0), 1LL}, nss, BSON("_id" << 0), BSON("_id" << 0 << "x" << 2)); - ASSERT_THROWS_CODE(syncTail.shouldRetry(_opCtx.get(), op.raw), - mongo::UserException, - ErrorCodes::FailedToParse); - MultiApplier::OperationPtrs ops = {&op}; - AtomicUInt32 fetchCount(0); - ASSERT_EQUALS(ErrorCodes::FailedToParse, - multiInitialSyncApply_noAbort(_opCtx.get(), &ops, &syncTail, &fetchCount)); - ASSERT_EQUALS(fetchCount.load(), 1U); -} - class IdempotencyTest : public SyncTailTest { protected: OplogEntry createCollection(); diff --git a/src/mongo/dbtests/repltests.cpp b/src/mongo/dbtests/repltests.cpp index e10f536c21f..3a7c31a5eda 100644 --- a/src/mongo/dbtests/repltests.cpp +++ b/src/mongo/dbtests/repltests.cpp @@ -1381,7 +1381,7 @@ public: bool returnEmpty; SyncTest() : SyncTail(nullptr, SyncTail::MultiSyncApplyFunc()), returnEmpty(false) {} virtual ~SyncTest() {} - virtual BSONObj getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) { + virtual BSONObj getMissingDoc(OperationContext* txn, const BSONObj& o) { if (returnEmpty) { BSONObj o; return o; @@ -1393,7 +1393,7 @@ public: } }; -class ShouldRetry : public Base { +class FetchAndInsertMissingDocument : public Base { public: void run() { bool threw = false; @@ -1414,7 +1414,7 @@ public: badSource.setHostname("localhost:123"); OldClientContext ctx(&_txn, ns()); - badSource.getMissingDoc(&_txn, ctx.db(), o); + badSource.getMissingDoc(&_txn, o); } catch (DBException&) { threw = true; } @@ -1422,7 +1422,7 @@ public: // now this should succeed SyncTest t; - verify(t.shouldRetry(&_txn, o)); + verify(t.fetchAndInsertMissingDocument(&_txn, o)); verify(!_client .findOne(ns(), BSON("_id" @@ -1431,7 +1431,7 @@ public: // force it not to find an obj t.returnEmpty = true; - verify(!t.shouldRetry(&_txn, o)); + verify(!t.fetchAndInsertMissingDocument(&_txn, o)); } }; @@ -1498,7 +1498,7 @@ public: add<DeleteOpIsIdBased>(); add<DatabaseIgnorerBasic>(); add<DatabaseIgnorerUpdate>(); - add<ShouldRetry>(); + add<FetchAndInsertMissingDocument>(); } }; |