diff options
author | Judah Schvimer <judah@mongodb.com> | 2017-12-21 11:03:42 -0500 |
---|---|---|
committer | Judah Schvimer <judah@mongodb.com> | 2017-12-21 11:03:42 -0500 |
commit | 1038fbaffdbfc4390e3ae0a56b3cbb0dc6e2870e (patch) | |
tree | c72ea5a5df30e02d18e17ff77ae400dbf89e2ce7 | |
parent | 5610da9e3e13ec1bc7a092945acf15b0bb8b01b8 (diff) | |
download | mongo-1038fbaffdbfc4390e3ae0a56b3cbb0dc6e2870e.tar.gz |
SERVER-30538 check that checkpointTimestamp is correct in
PeriodicKillSecondaries hook
-rw-r--r-- | buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py | 56 |
1 file changed, 48 insertions, 8 deletions
diff --git a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py index 2283ef2c39a..aa60a0a94f6 100644 --- a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py +++ b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py @@ -210,6 +210,7 @@ class PeriodicKillSecondaries(interface.CustomBehavior): client = secondary.mongo_client() minvalid_doc = client.local["replset.minvalid"].find_one() oplog_truncate_after_doc = client.local["replset.oplogTruncateAfterPoint"].find_one() + checkpoint_timestamp_doc = client.local["replset.checkpointTimestamp"].find_one() latest_oplog_doc = client.local["oplog.rs"].find_one( sort=[("$natural", pymongo.DESCENDING)]) @@ -231,17 +232,56 @@ class PeriodicKillSecondaries(interface.CustomBehavior): oplog_truncate_after_ts = oplog_truncate_after_doc.get( "oplogTruncateAfterPoint", null_ts) + # The "checkpointTimestamp" document may not exist at startup. If so, we default + # it to null. + checkpoint_timestamp = null_ts + if checkpoint_timestamp_doc is not None: + checkpoint_timestamp = checkpoint_timestamp_doc.get("checkpointTimestamp") + if checkpoint_timestamp is None: + raise errors.ServerFailure( + "Checkpoint timestamp document had no 'checkpointTimestamp'" + "field: {}".format(checkpoint_timestamp_doc)) + + # checkpointTimestamp <= top of oplog + # If the oplog is empty, the checkpoint timestamp should also be null. 
+ if not checkpoint_timestamp <= latest_oplog_entry_ts: + raise errors.ServerFailure( + "The condition checkpointTimestamp <= top of oplog ({} <= {}) doesn't hold:" + " checkpointTimestamp document={}, latest oplog entry={}".format( + checkpoint_timestamp, latest_oplog_entry_ts, checkpoint_timestamp_doc, + latest_oplog_doc)) + if minvalid_doc is not None: applied_through_ts = minvalid_doc.get("begin", {}).get("ts", null_ts) minvalid_ts = minvalid_doc.get("ts", null_ts) - # This hook never runs upgrades, so it should never have an "oplogDeleteFromPoint". - if minvalid_doc.get("oplogDeleteFromPoint") is not None: + # The "appliedThrough" value should always equal the "checkpointTimestamp". + # The writes to "appliedThrough" are given the timestamp of the end of the batch, + # and batch boundaries are the only valid timestamps in which we could take + # checkpoints, so if you see a non-null applied through in a stable checkpoint it + # must be at the same timestamp as the checkpoint. + if (checkpoint_timestamp != null_ts + and applied_through_ts != null_ts + and (not checkpoint_timestamp == applied_through_ts)): raise errors.ServerFailure( - "The condition oplogDeleteFromPoint != null doesn't hold:" - " minValid document={}, oplogTruncateAfterPoint document={}," - " last oplog entry={}".format( - minvalid_doc, oplog_truncate_after_doc, latest_oplog_doc)) + "The condition checkpointTimestamp ({}) == appliedThrough ({})" + " doesn't hold: minValid document={}," + " checkpointTimestamp document={}, last oplog entry={}".format( + checkpoint_timestamp, applied_through_ts, minvalid_doc, + checkpoint_timestamp_doc, latest_oplog_doc)) + + if applied_through_ts == null_ts: + # We clear "appliedThrough" to represent having applied through the top of the + # oplog in PRIMARY state or immediately after "rollback via refetch". 
+ # If we are using a storage engine that supports "recover to a checkpoint," + # then we will have a "checkpointTimestamp" and we should use that as our + # "appliedThrough" (similarly to why we assert their equality above). + # If both are null, then we are in PRIMARY state on a storage engine that does + # not support "recover to a checkpoint" or in RECOVERING immediately after + # "rollback via refetch". Since we do not update "minValid" in PRIMARY state, + # we leave "appliedThrough" as null so that the invariants below hold, rather + # than substituting the latest oplog entry for the "appliedThrough" value. + applied_through_ts = checkpoint_timestamp if minvalid_ts == null_ts: # The server treats the "ts" field in the minValid document as missing when its @@ -306,11 +346,11 @@ class PeriodicKillSecondaries(interface.CustomBehavior): except pymongo.errors.OperationFailure as err: self.hook_test_case.logger.exception( "Failed to read the minValid document, the oplogTruncateAfterPoint document," - " or the latest oplog entry from the mongod on" + " the checkpointTimestamp document, or the latest oplog entry from the mongod on" " port %d", secondary.port) raise errors.ServerFailure( "Failed to read the minValid document, the oplogTruncateAfterPoint document," - " or the latest oplog entry from the mongod on" + " the checkpointTimestamp document, or the latest oplog entry from the mongod on" " port {}: {}".format(secondary.port, err.args[0])) finally: # Set the secondary's options back to their original values. |