summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJudah Schvimer <judah@mongodb.com>2017-12-21 11:03:42 -0500
committerJudah Schvimer <judah@mongodb.com>2017-12-21 11:03:42 -0500
commit1038fbaffdbfc4390e3ae0a56b3cbb0dc6e2870e (patch)
treec72ea5a5df30e02d18e17ff77ae400dbf89e2ce7
parent5610da9e3e13ec1bc7a092945acf15b0bb8b01b8 (diff)
downloadmongo-1038fbaffdbfc4390e3ae0a56b3cbb0dc6e2870e.tar.gz
SERVER-30538 check that checkpointTimestamp is correct in
PeriodicKillSecondaries hook
-rw-r--r--buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py56
1 files changed, 48 insertions, 8 deletions
diff --git a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py
index 2283ef2c39a..aa60a0a94f6 100644
--- a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py
+++ b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py
@@ -210,6 +210,7 @@ class PeriodicKillSecondaries(interface.CustomBehavior):
client = secondary.mongo_client()
minvalid_doc = client.local["replset.minvalid"].find_one()
oplog_truncate_after_doc = client.local["replset.oplogTruncateAfterPoint"].find_one()
+ checkpoint_timestamp_doc = client.local["replset.checkpointTimestamp"].find_one()
latest_oplog_doc = client.local["oplog.rs"].find_one(
sort=[("$natural", pymongo.DESCENDING)])
@@ -231,17 +232,56 @@ class PeriodicKillSecondaries(interface.CustomBehavior):
oplog_truncate_after_ts = oplog_truncate_after_doc.get(
"oplogTruncateAfterPoint", null_ts)
+ # The "checkpointTimestamp" document may not exist at startup. If so, we default
+ # it to null.
+ checkpoint_timestamp = null_ts
+ if checkpoint_timestamp_doc is not None:
+ checkpoint_timestamp = checkpoint_timestamp_doc.get("checkpointTimestamp")
+ if checkpoint_timestamp is None:
+ raise errors.ServerFailure(
+ "Checkpoint timestamp document had no 'checkpointTimestamp'"
+ "field: {}".format(checkpoint_timestamp_doc))
+
+ # checkpointTimestamp <= top of oplog
+ # If the oplog is empty, the checkpoint timestamp should also be null.
+ if not checkpoint_timestamp <= latest_oplog_entry_ts:
+ raise errors.ServerFailure(
+ "The condition checkpointTimestamp <= top of oplog ({} <= {}) doesn't hold:"
+ " checkpointTimestamp document={}, latest oplog entry={}".format(
+ checkpoint_timestamp, latest_oplog_entry_ts, checkpoint_timestamp_doc,
+ latest_oplog_doc))
+
if minvalid_doc is not None:
applied_through_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
minvalid_ts = minvalid_doc.get("ts", null_ts)
- # This hook never runs upgrades, so it should never have an "oplogDeleteFromPoint".
- if minvalid_doc.get("oplogDeleteFromPoint") is not None:
+ # When both are non-null, "appliedThrough" should equal "checkpointTimestamp".
+ # The writes to "appliedThrough" are given the timestamp of the end of the batch,
+ # and batch boundaries are the only valid timestamps at which we could take
+ # checkpoints, so a non-null "appliedThrough" seen in a stable checkpoint
+ # must be at the same timestamp as the checkpoint.
+ if (checkpoint_timestamp != null_ts
+ and applied_through_ts != null_ts
+ and (not checkpoint_timestamp == applied_through_ts)):
raise errors.ServerFailure(
- "The condition oplogDeleteFromPoint != null doesn't hold:"
- " minValid document={}, oplogTruncateAfterPoint document={},"
- " last oplog entry={}".format(
- minvalid_doc, oplog_truncate_after_doc, latest_oplog_doc))
+ "The condition checkpointTimestamp ({}) == appliedThrough ({})"
+ " doesn't hold: minValid document={},"
+ " checkpointTimestamp document={}, last oplog entry={}".format(
+ checkpoint_timestamp, applied_through_ts, minvalid_doc,
+ checkpoint_timestamp_doc, latest_oplog_doc))
+
+ if applied_through_ts == null_ts:
+ # We clear "appliedThrough" to represent having applied through the top of the
+ # oplog in PRIMARY state or immediately after "rollback via refetch".
+ # If we are using a storage engine that supports "recover to a checkpoint,"
+ # then we will have a "checkpointTimestamp" and we should use that as our
+ # "appliedThrough" (similarly to why we assert their equality above).
+ # If both are null, then we are in PRIMARY state on a storage engine that does
+ # not support "recover to a checkpoint" or in RECOVERING immediately after
+ # "rollback via refetch". Since we do not update "minValid" in PRIMARY state,
+ # we leave "appliedThrough" as null so that the invariants below hold, rather
+ # than substituting the latest oplog entry for the "appliedThrough" value.
+ applied_through_ts = checkpoint_timestamp
if minvalid_ts == null_ts:
# The server treats the "ts" field in the minValid document as missing when its
@@ -306,11 +346,11 @@ class PeriodicKillSecondaries(interface.CustomBehavior):
except pymongo.errors.OperationFailure as err:
self.hook_test_case.logger.exception(
"Failed to read the minValid document, the oplogTruncateAfterPoint document,"
- " or the latest oplog entry from the mongod on"
+ " the checkpointTimestamp document, or the latest oplog entry from the mongod on"
" port %d", secondary.port)
raise errors.ServerFailure(
"Failed to read the minValid document, the oplogTruncateAfterPoint document,"
- " or the latest oplog entry from the mongod on"
+ " the checkpointTimestamp document, or the latest oplog entry from the mongod on"
" port {}: {}".format(secondary.port, err.args[0]))
finally:
# Set the secondary's options back to their original values.