diff options
280 files changed, 4509 insertions, 2766 deletions
diff --git a/.gitignore b/.gitignore index 4c35f527d92..7cb6d71c0d0 100644 --- a/.gitignore +++ b/.gitignore @@ -176,6 +176,9 @@ src/mongo/embedded/java/aar/build/ src/mongo/embedded/java/jar/build/ local.properties +# clangd language server +.clangd/ + compile_commands.json generated_resmoke_config selected_tests_config diff --git a/README.third_party.md b/README.third_party.md index 11ade4b1b24..b91064f9d7a 100644 --- a/README.third_party.md +++ b/README.third_party.md @@ -19,40 +19,40 @@ not authored by MongoDB, and has a license which requires reproduction, a notice will be included in `THIRD-PARTY-NOTICES`. -| Name | License | Upstream Version | Vendored Version | Emits persisted data | Distributed in Release Binaries | -| ---------------------------| ----------------- | ---------------- | ------------------| :------------------: | :-----------------------------: | -| [abseil-cpp] | Apache-2.0 | | 070f6e47b3 | | ✗ | -| Aladdin MD5 | Zlib | | Unknown | ✗ | ✗ | -| [ASIO] | BSL-1.0 | 1.16.1 | b0926b61b0 | | ✗ | -| [benchmark] | Apache-2.0 | 1.5.1 | 1.5.0 | | | -| [Boost] | BSL-1.0 | 1.73.0 | 1.70.0 | | ✗ | -| [fmt] | BSD-2-Clause | 6.2.1 | 6.1.1 | | ✗ | -| [GPerfTools] | BSD-3-Clause | 2.8 | 2.8 | | ✗ | -| [ICU4] | ICU | 67.1 | 57.1 | ✗ | ✗ | -| [Intel Decimal FP Library] | BSD-3-Clause | 2.0 Update 2 | 2.0 Update 1 | | ✗ | -| [JSON-Schema-Test-Suite] | MIT | | 728066f9c5 | | | -| [kms-message] | | | 75e391a037 | | ✗ | -| [libstemmer] | BSD-3-Clause | | Unknown | ✗ | ✗ | -| [linenoise] | BSD-3-Clause | | Unknown + changes | | ✗ | -| [MozJS] | MPL-2.0 | ESR 68.9 | ESR 60.3.0 | | ✗ | -| [MurmurHash3] | Public Domain | | Unknown + changes | ✗ | ✗ | -| [ocspbuilder] | MIT | 0.10.2 | 0.10.2 | | | -| [ocspresponder] | Apache-2.0 | 0.5.0 | 0.5.0 | | | -| [peglib] | MIT | 0.1.12 | 0.1.12 | | ✗ | -| [Pcre] | BSD-3-Clause | 8.44 | 8.42 | | ✗ | -| [S2] | Apache-2.0 | | Unknown | ✗ | ✗ | -| [SafeInt] | MIT | 3.24 | 3.23 | | | -| [scons] | MIT | 3.1.2 | 3.1.2 | | | -| 
[Snappy] | BSD-3-Clause | 1.1.8 | 1.1.7 | ✗ | ✗ | -| [timelib] | MIT | 2018.03 | 2018.01 | | ✗ | -| [TomCrypt] | Public Domain | 1.18.2 | 1.18.2 | ✗ | ✗ | -| [Unicode] | Unicode-DFS-2015 | 13.0.0 | 8.0.0 | ✗ | ✗ | -| [Valgrind] | BSD-3-Clause<sup>\[<a href="#note_vg" id="ref_vg">1</a>]</sup> | 3.16.1 | 3.11.0 | | ✗ | -| [variant] | BSL-1.0 | 1.4.0 | 1.4.0 | | ✗ | -| [wiredtiger] | | | <sup>\[<a href="#note_wt" id="ref_wt">2</a>]</sup> | ✗ | ✗ | -| [yaml-cpp] | MIT | 0.6.3 | 0.6.2 | | ✗ | -| [Zlib] | Zlib | 1.2.11 | 1.2.11 | ✗ | ✗ | -| [Zstandard] | BSD-3-Clause | 1.4.5 | 1.4.4 | ✗ | ✗ | +| Name | License | Vendored Version | Emits persisted data | Distributed in Release Binaries | +| ---------------------------| ----------------- | ------------------| :------------------: | :-----------------------------: | +| [abseil-cpp] | Apache-2.0 | 070f6e47b3 | | ✗ | +| Aladdin MD5 | Zlib | Unknown | ✗ | ✗ | +| [ASIO] | BSL-1.0 | b0926b61b0 | | ✗ | +| [benchmark] | Apache-2.0 | 1.5.0 | | | +| [Boost] | BSL-1.0 | 1.70.0 | | ✗ | +| [fmt] | BSD-2-Clause | 6.1.1 | | ✗ | +| [GPerfTools] | BSD-3-Clause | 2.8 | | ✗ | +| [ICU4] | ICU | 57.1 | ✗ | ✗ | +| [Intel Decimal FP Library] | BSD-3-Clause | 2.0 Update 1 | | ✗ | +| [JSON-Schema-Test-Suite] | MIT | 728066f9c5 | | | +| [kms-message] | | 75e391a037 | | ✗ | +| [libstemmer] | BSD-3-Clause | Unknown | ✗ | ✗ | +| [linenoise] | BSD-3-Clause | Unknown + changes | | ✗ | +| [MozJS] | MPL-2.0 | ESR 60.3.0 | | ✗ | +| [MurmurHash3] | Public Domain | Unknown + changes | ✗ | ✗ | +| [ocspbuilder] | MIT | 0.10.2 | | | +| [ocspresponder] | Apache-2.0 | 0.5.0 | | | +| [peglib] | MIT | 0.1.12 | | ✗ | +| [Pcre] | BSD-3-Clause | 8.42 | | ✗ | +| [S2] | Apache-2.0 | Unknown | ✗ | ✗ | +| [SafeInt] | MIT | 3.23 | | | +| [scons] | MIT | 3.1.2 | | | +| [Snappy] | BSD-3-Clause | 1.1.7 | ✗ | ✗ | +| [timelib] | MIT | 2018.01 | | ✗ | +| [TomCrypt] | Public Domain | 1.18.2 | ✗ | ✗ | +| [Unicode] | Unicode-DFS-2015 | 8.0.0 | ✗ | ✗ | +| [Valgrind] | 
BSD-3-Clause<sup>\[<a href="#note_vg" id="ref_vg">1</a>]</sup> | 3.11.0 | | ✗ | +| [variant] | BSL-1.0 | 1.4.0 | | ✗ | +| [wiredtiger] | | <sup>\[<a href="#note_wt" id="ref_wt">2</a>]</sup> | ✗ | ✗ | +| [yaml-cpp] | MIT | 0.6.2 | | ✗ | +| [Zlib] | Zlib | 1.2.11 | ✗ | ✗ | +| [Zstandard] | BSD-3-Clause | 1.4.4 | ✗ | ✗ | [abseil-cpp]: https://github.com/abseil/abseil-cpp [ASIO]: https://github.com/chriskohlhoff/asio diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py index 2c458e4320f..b3d19bf599a 100644 --- a/buildscripts/resmokelib/core/process.py +++ b/buildscripts/resmokelib/core/process.py @@ -128,7 +128,7 @@ class Process(object): logger=self.logger.name.replace('/', '-'), process=os.path.basename(self.args[0]), pid=self.pid, t=now_str) recorder_args = [ - _config.UNDO_RECORDER_PATH, "--thread-fuzzing", "-p", + _config.UNDO_RECORDER_PATH, "-p", str(self.pid), "-o", recorder_output_file ] self._recorder = subprocess.Popen(recorder_args, bufsize=buffer_size, env=self.env, diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 322f0dad785..ae045bcedf4 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -66,6 +66,16 @@ all: test_file: jstests/replsets/secondaryOk_slaveOk_aliases.js - ticket: SERVER-43902 test_file: jstests/sharding/scaled_collection_stats.js + - ticket: SERVER-50416 + test_file: jstests/replsets/disconnect_on_legacy_write_to_secondary.js + - ticket: SERVER-50417 + test_file: jstests/replsets/no_disconnect_on_stepdown.js + - ticket: SERVER-50417 + test_file: jstests/replsets/not_master_unacknowledged_write.js + - ticket: SERVER-50417 + test_file: jstests/replsets/read_operations_during_step_down.js + - ticket: SERVER-50417 + test_file: jstests/replsets/read_operations_during_step_up.js # Tests that should only be excluded from particular suites should be listed under that 
suite. suites: diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 3935a97758d..de9f17fc7f7 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -12751,6 +12751,7 @@ buildvariants: num_scons_link_jobs_available: 0.99 record_with: --recordWith /opt/undodb5/bin/live-record exec_timeout_secs: 14400 # 4 hours + test_flags: --excludeWithAnyTags=requires_fast_memory tasks: - name: compile_without_package_TG - name: .jscore .common @@ -12764,6 +12765,24 @@ buildvariants: - name: sharded_jscore_txns_sharded_collections - name: sharding_jscore_passthrough - name: sharding_jscore_op_query_passthrough + - name: aggregation + - name: aggregation_auth + - name: aggregation_disabled_optimization + - name: aggregation_ese + - name: aggregation_ese_gcm + - name: aggregation_facet_unwind_passthrough + - name: aggregation_slot_based_execution + - name: .auth .gle + - name: .jscore .encrypt + - name: noPassthroughWithMongod_gen + - name: parallel_compatibility + - name: serial_run + - name: session_jscore_passthrough + - name: .aggfuzzer + - name: query_fuzzer_standalone_gen + - name: update_fuzzer_gen + - name: jstestfuzz_gen + - name: jstestfuzz_interrupt_gen - <<: *enterprise-rhel-62-64-bit-dynamic-required-template name: rhel-62-64-bit-dynamic-visibility-test diff --git a/jstests/auth/repl.js b/jstests/auth/repl.js index 06aa38a38a7..6f5b7ed0dcb 100644 --- a/jstests/auth/repl.js +++ b/jstests/auth/repl.js @@ -1,4 +1,4 @@ -// Test that authorization information gets propogated correctly to secondaries and slaves. +// Test that authorization information gets propogated correctly to secondaries. 
var baseName = "jstests_auth_repl"; var rsName = baseName + "_rs"; @@ -26,7 +26,7 @@ var AuthReplTest = function(spec) { assert(adminPri.auth("super", "super"), "could not authenticate as superuser"); if (secondaryConn != null) { - secondaryConn.setSlaveOk(true); + secondaryConn.setSecondaryOk(); adminSec = secondaryConn.getDB("admin"); } @@ -38,7 +38,7 @@ var AuthReplTest = function(spec) { /** * Use the rolesInfo command to check that the test - * role is as expected on the secondary/slave + * role is as expected on the secondary */ var confirmRolesInfo = function(actionType) { var role = adminSec.getRole(testRole, {showPrivileges: true}); @@ -48,7 +48,7 @@ var AuthReplTest = function(spec) { /** * Use the usersInfo command to check that the test - * user is as expected on the secondary/slave + * user is as expected on the secondary */ var confirmUsersInfo = function(roleName) { var user = adminSec.getUser(testUser); @@ -58,7 +58,7 @@ var AuthReplTest = function(spec) { /** * Ensure that the test user has the proper privileges - * on the secondary/slave + * on the secondary */ var confirmPrivilegeBeforeUpdate = function() { // can run hostInfo @@ -87,7 +87,7 @@ var AuthReplTest = function(spec) { /** * Ensure that the auth changes have taken effect - * properly on the secondary/slave + * properly on the secondary */ var confirmPrivilegeAfterUpdate = function() { // cannot run hostInfo @@ -117,7 +117,7 @@ var AuthReplTest = function(spec) { */ that.setSecondary = function(secondary) { secondaryConn = secondary; - secondaryConn.setSlaveOk(true); + secondaryConn.setSecondaryOk(); adminSec = secondaryConn.getDB("admin"); }; @@ -149,7 +149,7 @@ var AuthReplTest = function(spec) { /** * Top-level test for updating users and roles and ensuring that the update - * has the correct effect on the secondary/slave + * has the correct effect on the secondary */ that.testAll = function() { authOnSecondary(); diff --git 
a/jstests/concurrency/fsm_workloads/auth_privilege_consistency.js b/jstests/concurrency/fsm_workloads/auth_privilege_consistency.js index 054f0c5ca15..f1c6ad28b9b 100644 --- a/jstests/concurrency/fsm_workloads/auth_privilege_consistency.js +++ b/jstests/concurrency/fsm_workloads/auth_privilege_consistency.js @@ -72,7 +72,7 @@ var $config = (function() { // Create a new connection to any node which isn't "me". const conn = new Mongo(node); assert(conn); - conn.setSlaveOk(); + conn.setSecondaryOk(); RSnodes.push(conn); }); diff --git a/jstests/core/resume_query_from_non_existent_record.js b/jstests/core/resume_query_from_non_existent_record.js new file mode 100644 index 00000000000..954325a5763 --- /dev/null +++ b/jstests/core/resume_query_from_non_existent_record.js @@ -0,0 +1,66 @@ +/** + * Test that an error is raised when we try to resume a query from a record which doesn't exist. + * + * @tags: [ + * assumes_against_mongod_not_mongos, + * requires_find_command, + * multiversion_incompatible, + * ] + */ + +(function() { +"use strict"; + +const collName = "resume_query_from_non_existent_record"; +const coll = db[collName]; + +coll.drop(); + +const testData = [{_id: 0, a: 1}, {_id: 1, a: 2}, {_id: 2, a: 3}]; +assert.commandWorked(coll.insert(testData)); + +// Run the initial query and request to return a resume token. We're interested only in a single +// document, so 'batchSize' is set to 1. +let res = assert.commandWorked( + db.runCommand({find: collName, hint: {$natural: 1}, batchSize: 1, $_requestResumeToken: true})); +assert.eq(1, res.cursor.firstBatch.length); +assert.contains(res.cursor.firstBatch[0], testData); +const savedData = res.cursor.firstBatch; + +// Make sure the query returned a resume token which will be used to resume the query from. +assert.hasFields(res.cursor, ["postBatchResumeToken"]); +const resumeToken = res.cursor.postBatchResumeToken; + +// Kill the cursor before attempting to resume. 
+assert.commandWorked(db.runCommand({killCursors: collName, cursors: [res.cursor.id]})); + +// Try to resume the query from the saved resume token. +res = assert.commandWorked(db.runCommand({ + find: collName, + hint: {$natural: 1}, + batchSize: 1, + $_requestResumeToken: true, + $_resumeAfter: resumeToken +})); +assert.eq(1, res.cursor.firstBatch.length); +assert.contains(res.cursor.firstBatch[0], testData); +assert.neq(savedData[0], res.cursor.firstBatch[0]); + +// Kill the cursor before attempting to resume. +assert.commandWorked(db.runCommand({killCursors: collName, cursors: [res.cursor.id]})); + +// Delete a document which corresponds to the saved resume token, so that we can guarantee it does +// not exist. +assert.commandWorked(coll.remove({_id: savedData[0]._id}, {justOne: true})); + +// Try to resume the query from the same token and check that it fails to position the cursor to +// the record specified in the resume token. +assert.commandFailedWithCode(db.runCommand({ + find: collName, + hint: {$natural: 1}, + batchSize: 1, + $_requestResumeToken: true, + $_resumeAfter: resumeToken +}), + ErrorCodes.KeyNotFound); +})(); diff --git a/jstests/core/shell1.js b/jstests/core/shell1.js index 7ea23f8d3a5..4fc4c3a1c15 100644 --- a/jstests/core/shell1.js +++ b/jstests/core/shell1.js @@ -4,11 +4,15 @@ shellHelper("show", "tables;"); shellHelper("show", "tables"); shellHelper("show", "tables ;"); -// test slaveOk levels -assert(!db.getSlaveOk() && !db.test.getSlaveOk() && !db.getMongo().getSlaveOk(), "slaveOk 1"); -db.getMongo().setSlaveOk(); -assert(db.getSlaveOk() && db.test.getSlaveOk() && db.getMongo().getSlaveOk(), "slaveOk 2"); -db.setSlaveOk(false); -assert(!db.getSlaveOk() && !db.test.getSlaveOk() && db.getMongo().getSlaveOk(), "slaveOk 3"); -db.test.setSlaveOk(true); -assert(!db.getSlaveOk() && db.test.getSlaveOk() && db.getMongo().getSlaveOk(), "slaveOk 4"); +// test secondaryOk levels +assert(!db.getSecondaryOk() && !db.test.getSecondaryOk() && 
!db.getMongo().getSecondaryOk(), + "secondaryOk 1"); +db.getMongo().setSecondaryOk(); +assert(db.getSecondaryOk() && db.test.getSecondaryOk() && db.getMongo().getSecondaryOk(), + "secondaryOk 2"); +db.setSecondaryOk(false); +assert(!db.getSecondaryOk() && !db.test.getSecondaryOk() && db.getMongo().getSecondaryOk(), + "secondaryOk 3"); +db.test.setSecondaryOk(); +assert(!db.getSecondaryOk() && db.test.getSecondaryOk() && db.getMongo().getSecondaryOk(), + "secondaryOk 4"); diff --git a/jstests/core/views/views_all_commands.js b/jstests/core/views/views_all_commands.js index 2f9d0cc1c60..04c4a34eabd 100644 --- a/jstests/core/views/views_all_commands.js +++ b/jstests/core/views/views_all_commands.js @@ -338,6 +338,7 @@ let viewsCommandTests = { hello: {skip: isUnrelated}, hostInfo: {skip: isUnrelated}, httpClientRequest: {skip: isAnInternalCommand}, + importCollection: {skip: isUnrelated}, insert: {command: {insert: "view", documents: [{x: 1}]}, expectFailure: true}, internalRenameIfOptionsAndIndexesMatch: {skip: isAnInternalCommand}, invalidateUserCache: {skip: isUnrelated}, diff --git a/jstests/hooks/validate_collections.js b/jstests/hooks/validate_collections.js index cf7f1be9707..856191ca51c 100644 --- a/jstests/hooks/validate_collections.js +++ b/jstests/hooks/validate_collections.js @@ -88,7 +88,7 @@ function CollectionValidator() { try { print('Running validate() on ' + host); const conn = new Mongo(host); - conn.setSlaveOk(); + conn.setSecondaryOk(); jsTest.authenticate(conn); // Skip validating collections for arbiters. diff --git a/jstests/libs/kill_sessions.js b/jstests/libs/kill_sessions.js index b4643ec8ed5..1dabe9c9cea 100644 --- a/jstests/libs/kill_sessions.js +++ b/jstests/libs/kill_sessions.js @@ -120,7 +120,7 @@ var _kill_sessions_api_module = (function() { // hosts. We identify particular ops by secs sleeping. 
this.visit(function(client) { let admin = client.getDB("admin"); - admin.getMongo().setSlaveOk(); + admin.getMongo().setSecondaryOk(); assert.soon(function() { let inProgressOps = admin.aggregate([{$currentOp: {'allUsers': true}}]); @@ -183,7 +183,7 @@ var _kill_sessions_api_module = (function() { Fixture.prototype.assertNoSessionsInCursors = function() { this.visit(function(client) { var db = client.getDB("admin"); - db.setSlaveOk(); + db.setSecondaryOk(); assert.soon(() => { let cursors = db.aggregate([ {"$currentOp": {"idleCursors": true, "allUsers": true}} @@ -205,7 +205,7 @@ var _kill_sessions_api_module = (function() { }); var db = client.getDB("admin"); - db.setSlaveOk(); + db.setSecondaryOk(); var cursors = db.aggregate([ {"$currentOp": {"idleCursors": true, "allUsers": true}}, {"$match": {type: "idleCursor"}} diff --git a/jstests/libs/override_methods/validate_collections_on_shutdown.js b/jstests/libs/override_methods/validate_collections_on_shutdown.js index a1e56fd1ca8..a378d6e390a 100644 --- a/jstests/libs/override_methods/validate_collections_on_shutdown.js +++ b/jstests/libs/override_methods/validate_collections_on_shutdown.js @@ -29,8 +29,8 @@ MongoRunner.validateCollectionsCallback = function(port) { return; } - // Set slaveOk=true so that we can run commands against any secondaries. - conn.setSlaveOk(); + // Set secondaryOk=true so that we can run commands against any secondaries. 
+ conn.setSecondaryOk(); let dbNames; let result = diff --git a/jstests/noPassthrough/apply_ops_DDL_operation_does_not_take_global_X.js b/jstests/noPassthrough/apply_ops_DDL_operation_does_not_take_global_X.js index 3e855455985..e6191f97449 100644 --- a/jstests/noPassthrough/apply_ops_DDL_operation_does_not_take_global_X.js +++ b/jstests/noPassthrough/apply_ops_DDL_operation_does_not_take_global_X.js @@ -29,7 +29,7 @@ assert.commandWorked(secondary.getDB("admin").runCommand( {configureFailPoint: "waitInFindBeforeMakingBatch", mode: "alwaysOn"})); const findWait = startParallelShell(function() { - db.getMongo().setSlaveOk(); + db.getMongo().setSecondaryOk(); assert.eq( db.getSiblingDB('read').getCollection('readColl').find().comment('read hangs').itcount(), 1); diff --git a/jstests/noPassthrough/change_stream_error_label.js b/jstests/noPassthrough/change_stream_error_label.js index 899207b3bd1..2b326a22cd7 100644 --- a/jstests/noPassthrough/change_stream_error_label.js +++ b/jstests/noPassthrough/change_stream_error_label.js @@ -12,9 +12,9 @@ rst.startSet(); rst.initiate(); rst.awaitSecondaryNodes(); -// Disable "slaveOk" on the connection so that we are not allowed to run on the Secondary. +// Disable "secondaryOk" on the connection so that we are not allowed to run on the Secondary. const testDB = rst.getSecondary().getDB(jsTestName()); -testDB.getMongo().setSlaveOk(false); +testDB.getMongo().setSecondaryOk(false); const coll = testDB.test; // Issue a change stream. We should fail with a NotPrimaryNoSecondaryOk error. @@ -28,8 +28,8 @@ assert.contains("ResumableChangeStreamError", err.errorLabels, err); // Now verify that the 'failGetMoreAfterCursorCheckout' failpoint can effectively exercise the // error label generation logic for change stream getMores. function testFailGetMoreAfterCursorCheckoutFailpoint({errorCode, expectedLabel}) { - // Re-enable "slaveOk" on the test connection. 
- testDB.getMongo().setSlaveOk(true); + // Re-enable "secondaryOk" on the test connection. + testDB.getMongo().setSecondaryOk(); // Activate the failpoint and set the exception that it will throw. assert.commandWorked(testDB.adminCommand({ diff --git a/jstests/noPassthrough/out_merge_on_secondary_killop.js b/jstests/noPassthrough/out_merge_on_secondary_killop.js index 7cdc25d8eae..d5863374f96 100644 --- a/jstests/noPassthrough/out_merge_on_secondary_killop.js +++ b/jstests/noPassthrough/out_merge_on_secondary_killop.js @@ -57,7 +57,7 @@ function testKillOp(pipeline, comment, failpointName) { // Run the aggregate and ensure that it is interrupted. const runAggregate = ` const testDB = db.getSiblingDB("${kDBName}"); - testDB.setSlaveOk(true); + testDB.setSecondaryOk(); const res = testDB.runCommand({ aggregate: "inputColl", pipeline: ${tojson(pipeline)}, diff --git a/jstests/noPassthrough/server_transaction_metrics_secondary.js b/jstests/noPassthrough/server_transaction_metrics_secondary.js index 9282b19bea0..3a337e17fde 100644 --- a/jstests/noPassthrough/server_transaction_metrics_secondary.js +++ b/jstests/noPassthrough/server_transaction_metrics_secondary.js @@ -19,8 +19,8 @@ replTest.initiate(config); const primary = replTest.getPrimary(); const secondary = replTest.getSecondary(); -// Set slaveOk=true so that normal read commands would be allowed on the secondary. -secondary.setSlaveOk(true); +// Set secondaryOk=true so that normal read commands would be allowed on the secondary. +secondary.setSecondaryOk(); // Create a test collection that we can run commands against. 
assert.commandWorked(primary.getDB(dbName)[collName].insert({_id: 0})); diff --git a/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js index 3a43603e935..a00cabc89a9 100644 --- a/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js +++ b/jstests/noPassthrough/startup_recovery_truncates_oplog_holes_after_primary_crash.js @@ -87,7 +87,7 @@ assert.soonNoExcept(function() { }); // Confirm that the write with the oplog hold behind it is now gone (truncated) as expected. -primary.setSlaveOk(); +primary.setSecondaryOk(); const find = primary.getDB(dbName).getCollection(collName).findOne({_id: "writeAfterHole"}); assert.eq(find, null); diff --git a/jstests/noPassthrough/stepdown_query.js b/jstests/noPassthrough/stepdown_query.js index 4809e471f7a..239497725ff 100644 --- a/jstests/noPassthrough/stepdown_query.js +++ b/jstests/noPassthrough/stepdown_query.js @@ -25,8 +25,8 @@ var collName = jsTest.name(); function runTest(host, rst, waitForPrimary) { // We create a new connection to 'host' here instead of passing in the original connection. - // This to work around the fact that connections created by ReplSetTest already have slaveOk - // set on them, but we need a connection with slaveOk not set for this test. + // This to work around the fact that connections created by ReplSetTest already have secondaryOk + // set on them, but we need a connection with secondaryOk not set for this test. var conn = new Mongo(host); var coll = conn.getDB(dbName).getCollection(collName); assert(!coll.exists()); @@ -51,7 +51,7 @@ function runTest(host, rst, waitForPrimary) { } catch (e) { } - // Even though our connection doesn't have slaveOk set, we should still be able to iterate + // Even though our connection doesn't have secondaryOk set, we should still be able to iterate // our cursor and kill our cursor. 
assert(cursor.hasNext()); assert.doesNotThrow(function() { diff --git a/jstests/noPassthrough/timestamp_index_builds.js b/jstests/noPassthrough/timestamp_index_builds.js index 8e13ff0d21c..f7995108f34 100644 --- a/jstests/noPassthrough/timestamp_index_builds.js +++ b/jstests/noPassthrough/timestamp_index_builds.js @@ -87,7 +87,7 @@ for (let nodeIdx = 0; nodeIdx < 2; ++nodeIdx) { jsTestLog("Starting as a replica set. Both indexes should exist. Node: " + nodeIdentity); let conn = rst.start(nodeIdx, {startClean: false}, true); rst.waitForState(conn, ReplSetTest.State.SECONDARY); - conn.setSlaveOk(); + conn.setSecondaryOk(); IndexBuildTest.assertIndexes(getColl(conn), 2, ['_id_', 'foo_1']); rst.stop(nodeIdx); } diff --git a/jstests/noPassthroughWithMongod/geo_polygon.js b/jstests/noPassthroughWithMongod/geo_polygon.js index ce7f9ebf67c..d0085fa1f92 100644 --- a/jstests/noPassthroughWithMongod/geo_polygon.js +++ b/jstests/noPassthroughWithMongod/geo_polygon.js @@ -16,7 +16,9 @@ for (x = -180; x < 180; x += .5) { assert.commandWorked(bulk.execute()); var numTests = 31; -for (var n = 0; n < numTests; n++) { +// Reduce the amount of repetitions on live-record buildvariant +var start = (TestData.undoRecorderPath ? 20 : 0); +for (var n = start; n < numTests; n++) { t.dropIndexes(); t.ensureIndex({loc: "2d"}, {bits: 2 + n}); diff --git a/jstests/noPassthroughWithMongod/indexbg_interrupts.js b/jstests/noPassthroughWithMongod/indexbg_interrupts.js index a1bf783f032..420fb2b6d96 100644 --- a/jstests/noPassthroughWithMongod/indexbg_interrupts.js +++ b/jstests/noPassthroughWithMongod/indexbg_interrupts.js @@ -32,7 +32,8 @@ var checkOp = function(checkDB) { var dbname = 'bgIndexSec'; var collection = 'jstests_feh'; -var size = 100000; +// Reduce the amount of data on live-record buildvariant +var size = (TestData.undoRecorderPath ? 
10000 : 100000); // Set up replica set var replTest = new ReplSetTest({name: 'bgIndex', nodes: 3}); diff --git a/jstests/noPassthroughWithMongod/no_balance_collection.js b/jstests/noPassthroughWithMongod/no_balance_collection.js index 38182f1c481..2ffaf7aecfc 100644 --- a/jstests/noPassthroughWithMongod/no_balance_collection.js +++ b/jstests/noPassthroughWithMongod/no_balance_collection.js @@ -78,7 +78,9 @@ st.waitForBalancer(true, 60000); var lastMigration = sh._lastMigration(collB); var bulk = collB.initializeUnorderedBulkOp(); -for (var i = 0; i < 1000000; i++) { +// Reduce the amount of data on live-record buildvariant +var n = (TestData.undoRecorderPath ? 100000 : 1000000); +for (var i = 0; i < n; i++) { bulk.insert({_id: i, hello: "world"}); } assert.commandWorked(bulk.execute()); diff --git a/jstests/noPassthroughWithMongod/replReads.js b/jstests/noPassthroughWithMongod/replReads.js index 5c40dbd900c..fde1143911c 100644 --- a/jstests/noPassthroughWithMongod/replReads.js +++ b/jstests/noPassthroughWithMongod/replReads.js @@ -1,4 +1,4 @@ -// Test that doing slaveOk reads from secondaries hits all the secondaries evenly +// Test that doing secondaryOk reads from secondaries hits all the secondaries evenly // @tags: [requires_sharding] function testReadLoadBalancing(numReplicas) { @@ -52,7 +52,7 @@ function testReadLoadBalancing(numReplicas) { for (var i = 0; i < secondaries.length * 10; i++) { conn = new Mongo(s._mongos[0].host); - conn.setSlaveOk(); + conn.setSecondaryOk(); conn.getDB('test').foo.findOne(); connections.push(conn); } @@ -103,7 +103,7 @@ function testReadLoadBalancing(numReplicas) { for (var i = 0; i < secondaries.length * 10; i++) { conn = new Mongo(s._mongos[0].host); - conn.setSlaveOk(); + conn.setSecondaryOk(); conn.getDB('test').foo.findOne(); connections.push(conn); } diff --git a/jstests/replsets/auth1.js b/jstests/replsets/auth1.js index cfc9c405dd1..a91137391a8 100644 --- a/jstests/replsets/auth1.js +++ b/jstests/replsets/auth1.js @@ 
-88,7 +88,7 @@ print("try some legal and illegal reads"); var r = primary.getDB("test").foo.findOne(); assert.eq(r.x, 1); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); function doQueryOn(p) { var error = assert.throws(function() { @@ -200,7 +200,7 @@ wait(function() { print("make sure it has the config, too"); assert.soon(function() { for (var i in rs.nodes) { - rs.nodes[i].setSlaveOk(); + rs.nodes[i].setSecondaryOk(); rs.nodes[i].getDB("admin").auth("foo", "bar"); config = rs.nodes[i].getDB("local").system.replset.findOne(); // We expect the config version to be 3 due to the initial config and then the diff --git a/jstests/replsets/auth_no_pri.js b/jstests/replsets/auth_no_pri.js index 16a94763b04..bc606dc913a 100644 --- a/jstests/replsets/auth_no_pri.js +++ b/jstests/replsets/auth_no_pri.js @@ -23,7 +23,7 @@ rs.waitForState(nodes[2], ReplSetTest.State.SECONDARY); // Make sure you can still authenticate a replset connection with no primary var conn2 = new Mongo(rs.getURL()); -conn2.setSlaveOk(true); +conn2.setSecondaryOk(); assert(conn2.getDB('admin').auth({user: 'admin', pwd: 'pwd', mechanism: "SCRAM-SHA-1"})); assert.eq(1, conn2.getDB('admin').foo.findOne().a); diff --git a/jstests/replsets/awaitable_ismaster_fcv_change.js b/jstests/replsets/awaitable_ismaster_fcv_change.js index 41ed644e5e6..9e22fde507d 100644 --- a/jstests/replsets/awaitable_ismaster_fcv_change.js +++ b/jstests/replsets/awaitable_ismaster_fcv_change.js @@ -21,7 +21,7 @@ const secondaryAdminDB = secondary.getDB("admin"); function runAwaitableIsMasterBeforeFCVChange( topologyVersionField, targetFCV, isPrimary, prevMinWireVersion, serverMaxWireVersion) { - db.getMongo().setSlaveOk(); + db.getMongo().setSecondaryOk(); let response = assert.commandWorked(db.runCommand({ isMaster: 1, topologyVersion: topologyVersionField, diff --git a/jstests/replsets/awaitdata_getmore_new_last_committed_optime.js b/jstests/replsets/awaitdata_getmore_new_last_committed_optime.js index 
f98e2fb4326..4d17389c209 100644 --- a/jstests/replsets/awaitdata_getmore_new_last_committed_optime.js +++ b/jstests/replsets/awaitdata_getmore_new_last_committed_optime.js @@ -56,7 +56,7 @@ let waitForGetMoreToFinish = startParallelShell(() => { load('jstests/replsets/rslib.js'); const secondary = db.getMongo(); - secondary.setSlaveOk(); + secondary.setSecondaryOk(); const dbName = 'test'; const collName = 'coll'; diff --git a/jstests/replsets/buildindexes.js b/jstests/replsets/buildindexes.js index e00a9e94ef7..3be0ba68896 100644 --- a/jstests/replsets/buildindexes.js +++ b/jstests/replsets/buildindexes.js @@ -21,7 +21,7 @@ var primary = replTest.getPrimary().getDB(name); var secondaryConns = replTest.getSecondaries(); var secondaries = []; for (var i in secondaryConns) { - secondaryConns[i].setSlaveOk(); + secondaryConns[i].setSecondaryOk(); secondaries.push(secondaryConns[i].getDB(name)); } replTest.awaitReplication(); diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js index a8284ad1772..7ab31e4d76c 100644 --- a/jstests/replsets/catchup.js +++ b/jstests/replsets/catchup.js @@ -37,7 +37,7 @@ rst.nodes.forEach(function(node) { }); function checkOpInOplog(node, op, count) { - node.getDB("admin").getMongo().setSlaveOk(); + node.getDB("admin").getMongo().setSecondaryOk(); var oplog = node.getDB("local")['oplog.rs']; var oplogArray = oplog.find().toArray(); assert.eq(oplog.count(op), count, "op: " + tojson(op) + ", oplog: " + tojson(oplogArray)); diff --git a/jstests/replsets/db_reads_while_recovering_all_commands.js b/jstests/replsets/db_reads_while_recovering_all_commands.js index e742e1b43af..a5b3f9d3ab0 100644 --- a/jstests/replsets/db_reads_while_recovering_all_commands.js +++ b/jstests/replsets/db_reads_while_recovering_all_commands.js @@ -192,6 +192,7 @@ const allCommands = { hello: {skip: isNotAUserDataRead}, hostInfo: {skip: isNotAUserDataRead}, httpClientRequest: {skip: isNotAUserDataRead}, + importCollection: {skip: isNotAUserDataRead}, 
insert: {skip: isPrimaryOnly}, internalRenameIfOptionsAndIndexesMatch: {skip: isAnInternalCommand}, invalidateUserCache: {skip: isNotAUserDataRead}, diff --git a/jstests/replsets/disconnect_on_legacy_write_to_secondary.js b/jstests/replsets/disconnect_on_legacy_write_to_secondary.js index edf5950a62b..9678fc1a98e 100644 --- a/jstests/replsets/disconnect_on_legacy_write_to_secondary.js +++ b/jstests/replsets/disconnect_on_legacy_write_to_secondary.js @@ -52,16 +52,16 @@ const primaryDb = primaryDataConn.getDB("test"); const primaryColl = primaryDb[collname]; primaryDataConn.forceWriteMode('legacy'); -function getNotMasterLegacyUnackWritesCounter() { +function getNotPrimaryLegacyUnackWritesCounter() { return assert.commandWorked(primaryAdmin.adminCommand({serverStatus: 1})) - .metrics.repl.network.notMasterLegacyUnacknowledgedWrites; + .metrics.repl.network.notPrimaryLegacyUnacknowledgedWrites; } function runStepDownTest({description, failpoint, operation}) { jsTestLog("Enabling failpoint to block " + description + "s"); let failPoint = configureFailPoint(primaryAdmin, failpoint); - let failedLegacyUnackWritesBefore = getNotMasterLegacyUnackWritesCounter(); + let failedLegacyUnackWritesBefore = getNotPrimaryLegacyUnackWritesCounter(); jsTestLog("Trying legacy " + description + " on stepping-down primary"); operation(); @@ -77,7 +77,7 @@ function runStepDownTest({description, failpoint, operation}) { // Validate the number of legacy unacknowledged writes failed due to step down resulted // in network disconnection. - let failedLegacyUnackWritesAfter = getNotMasterLegacyUnackWritesCounter(); + let failedLegacyUnackWritesAfter = getNotPrimaryLegacyUnackWritesCounter(); assert.eq(failedLegacyUnackWritesAfter, failedLegacyUnackWritesBefore + 1); // Allow the primary to be re-elected, and wait for it. 
diff --git a/jstests/replsets/explain_slaveok.js b/jstests/replsets/explain_slaveok.js index 68eda89bce7..f3215af9ab6 100644 --- a/jstests/replsets/explain_slaveok.js +++ b/jstests/replsets/explain_slaveok.js @@ -1,12 +1,12 @@ // Test the explain command on the primary and on secondaries: // -// 1) Explain of read operations should work on the secondaries iff slaveOk is set. +// 1) Explain of read operations should work on the secondaries iff secondaryOk is set. // // 2) Explain of write operations should -// --fail on secondaries, even if slaveOk is set, +// --fail on secondaries, even if secondaryOk is set, // --succeed on primary without applying any writes. -var name = "explain_slaveok"; +var name = "explain_secondaryok"; print("Start replica set with two nodes"); var replTest = new ReplSetTest({name: name, nodes: 2}); @@ -16,22 +16,22 @@ var primary = replTest.getPrimary(); // Insert a document and let it sync to the secondary. print("Initial sync"); -primary.getDB("test").explain_slaveok.insert({a: 1}); +primary.getDB("test").explain_secondaryok.insert({a: 1}); replTest.awaitReplication(); // Check that the document is present on the primary. -assert.eq(1, primary.getDB("test").explain_slaveok.findOne({a: 1})["a"]); +assert.eq(1, primary.getDB("test").explain_secondaryok.findOne({a: 1})["a"]); -// We shouldn't be able to read from the secondary with slaveOk off. +// We shouldn't be able to read from the secondary with secondaryOk off. var secondary = replTest.getSecondary(); -secondary.getDB("test").getMongo().setSlaveOk(false); +secondary.getDB("test").getMongo().setSecondaryOk(false); assert.throws(function() { - secondary.getDB("test").explain_slaveok.findOne({a: 1}); + secondary.getDB("test").explain_secondaryok.findOne({a: 1}); }); -// With slaveOk on, we should be able to read from the secondary. 
-secondary.getDB("test").getMongo().setSlaveOk(true); -assert.eq(1, secondary.getDB("test").explain_slaveok.findOne({a: 1})["a"]); +// With secondaryOk on, we should be able to read from the secondary. +secondary.getDB("test").getMongo().setSecondaryOk(); +assert.eq(1, secondary.getDB("test").explain_secondaryok.findOne({a: 1})["a"]); // // Test explains on primary. @@ -39,12 +39,12 @@ assert.eq(1, secondary.getDB("test").explain_slaveok.findOne({a: 1})["a"]); // Explain a count on the primary. var explainOut = primary.getDB("test").runCommand( - {explain: {count: "explain_slaveok", query: {a: 1}}, verbosity: "executionStats"}); + {explain: {count: "explain_secondaryok", query: {a: 1}}, verbosity: "executionStats"}); assert.commandWorked(explainOut, "explain read op on primary"); // Explain an update on the primary. explainOut = primary.getDB("test").runCommand({ - explain: {update: "explain_slaveok", updates: [{q: {a: 1}, u: {$set: {a: 5}}}]}, + explain: {update: "explain_secondaryok", updates: [{q: {a: 1}, u: {$set: {a: 5}}}]}, verbosity: "executionStats" }); assert.commandWorked(explainOut, "explain write op on primary"); @@ -57,52 +57,52 @@ assert.eq(1, stages.nWouldModify); // Confirm that the document did not actually get modified on the primary // or on the secondary. -assert.eq(1, primary.getDB("test").explain_slaveok.findOne({a: 1})["a"]); -secondary.getDB("test").getMongo().setSlaveOk(true); -assert.eq(1, secondary.getDB("test").explain_slaveok.findOne({a: 1})["a"]); +assert.eq(1, primary.getDB("test").explain_secondaryok.findOne({a: 1})["a"]); +secondary.getDB("test").getMongo().setSecondaryOk(); +assert.eq(1, secondary.getDB("test").explain_secondaryok.findOne({a: 1})["a"]); // // Test explains on secondary. // -// Explain a count on the secondary with slaveOk off. Should fail because -// slaveOk is required for explains on a secondary. -secondary.getDB("test").getMongo().setSlaveOk(false); +// Explain a count on the secondary with secondaryOk off. 
Should fail because +// secondaryOk is required for explains on a secondary. +secondary.getDB("test").getMongo().setSecondaryOk(false); explainOut = secondary.getDB("test").runCommand( - {explain: {count: "explain_slaveok", query: {a: 1}}, verbosity: "executionStats"}); -assert.commandFailed(explainOut, "explain read op on secondary, slaveOk false"); + {explain: {count: "explain_secondaryok", query: {a: 1}}, verbosity: "executionStats"}); +assert.commandFailed(explainOut, "explain read op on secondary, secondaryOk false"); -// Explain of count should succeed once slaveOk is true. -secondary.getDB("test").getMongo().setSlaveOk(true); +// Explain of count should succeed once secondaryOk is true. +secondary.getDB("test").getMongo().setSecondaryOk(); explainOut = secondary.getDB("test").runCommand( - {explain: {count: "explain_slaveok", query: {a: 1}}, verbosity: "executionStats"}); -assert.commandWorked(explainOut, "explain read op on secondary, slaveOk true"); + {explain: {count: "explain_secondaryok", query: {a: 1}}, verbosity: "executionStats"}); +assert.commandWorked(explainOut, "explain read op on secondary, secondaryOk true"); -// Explain .find() on a secondary, setting slaveOk directly on the query. -secondary.getDB("test").getMongo().setSlaveOk(false); +// Explain .find() on a secondary, setting secondaryOk directly on the query. 
+secondary.getDB("test").getMongo().setSecondaryOk(false); assert.throws(function() { - secondary.getDB("test").explain_slaveok.explain("executionStats").find({a: 1}).finish(); + secondary.getDB("test").explain_secondaryok.explain("executionStats").find({a: 1}).finish(); }); -secondary.getDB("test").getMongo().setSlaveOk(false); +secondary.getDB("test").getMongo().setSecondaryOk(false); explainOut = secondary.getDB("test") - .explain_slaveok.explain("executionStats") + .explain_secondaryok.explain("executionStats") .find({a: 1}) .addOption(DBQuery.Option.slaveOk) .finish(); -assert.commandWorked(explainOut, "explain read op on secondary, slaveOk set to true on query"); +assert.commandWorked(explainOut, "explain read op on secondary, slaveOk bit set to true on query"); -secondary.getDB("test").getMongo().setSlaveOk(true); +secondary.getDB("test").getMongo().setSecondaryOk(); explainOut = - secondary.getDB("test").explain_slaveok.explain("executionStats").find({a: 1}).finish(); -assert.commandWorked(explainOut, "explain .find() on secondary, slaveOk set to true"); + secondary.getDB("test").explain_secondaryok.explain("executionStats").find({a: 1}).finish(); +assert.commandWorked(explainOut, "explain .find() on secondary, secondaryOk set to true"); -// Explain .find() on a secondary, setting slaveOk to false with various read preferences. +// Explain .find() on a secondary, setting secondaryOk to false with various read preferences. 
var readPrefModes = ["secondary", "secondaryPreferred", "primaryPreferred", "nearest"]; readPrefModes.forEach(function(prefString) { - secondary.getDB("test").getMongo().setSlaveOk(false); + secondary.getDB("test").getMongo().setSecondaryOk(false); explainOut = secondary.getDB("test") - .explain_slaveok.explain("executionStats") + .explain_secondaryok.explain("executionStats") .find({a: 1}) .readPref(prefString) .finish(); @@ -112,7 +112,7 @@ readPrefModes.forEach(function(prefString) { // Similarly should succeed if a read preference is set on the connection. secondary.setReadPref(prefString); explainOut = - secondary.getDB("test").explain_slaveok.explain("executionStats").find({a: 1}).finish(); + secondary.getDB("test").explain_secondaryok.explain("executionStats").find({a: 1}).finish(); assert.commandWorked( explainOut, "explain .find() on secondary, '" + prefString + "' read preference on connection"); @@ -120,35 +120,36 @@ readPrefModes.forEach(function(prefString) { secondary.setReadPref(); }); -// Fail explain find() on a secondary, setting slaveOk to false with read preference set to primary. +// Fail explain find() on a secondary, setting secondaryOk to false with read preference set to +// primary. var prefStringPrimary = "primary"; -secondary.getDB("test").getMongo().setSlaveOk(false); +secondary.getDB("test").getMongo().setSecondaryOk(false); explainOut = secondary.getDB("test").runCommand( - {explain: {find: "explain_slaveok", query: {a: 1}}, verbosity: "executionStats"}); -assert.commandFailed(explainOut, "not master and slaveOk=false"); + {explain: {find: "explain_secondaryok", query: {a: 1}}, verbosity: "executionStats"}); +assert.commandFailed(explainOut, "not primary and secondaryOk=false"); // Similarly should fail if a read preference is set on the connection. 
secondary.setReadPref(prefStringPrimary); explainOut = secondary.getDB("test").runCommand( - {explain: {find: "explain_slaveok", query: {a: 1}}, verbosity: "executionStats"}); -assert.commandFailed(explainOut, "not master and slaveOk=false"); + {explain: {find: "explain_secondaryok", query: {a: 1}}, verbosity: "executionStats"}); +assert.commandFailed(explainOut, "not primary and secondaryOk=false"); // Unset read pref on the connection. secondary.setReadPref(); -// Explain an update on the secondary with slaveOk off. Should fail because -// slaveOk is required for explains on a secondary. -secondary.getDB("test").getMongo().setSlaveOk(false); +// Explain an update on the secondary with secondaryOk off. Should fail because +// secondaryOk is required for explains on a secondary. +secondary.getDB("test").getMongo().setSecondaryOk(false); explainOut = secondary.getDB("test").runCommand({ - explain: {update: "explain_slaveok", updates: [{q: {a: 1}, u: {$set: {a: 5}}}]}, + explain: {update: "explain_secondaryok", updates: [{q: {a: 1}, u: {$set: {a: 5}}}]}, verbosity: "executionStats" }); -assert.commandFailed(explainOut, "explain write op on secondary, slaveOk false"); +assert.commandFailed(explainOut, "explain write op on secondary, secondaryOk false"); -// Explain of the update should also fail with slaveOk on. -secondary.getDB("test").getMongo().setSlaveOk(true); +// Explain of the update should also fail with secondaryOk on. 
+secondary.getDB("test").getMongo().setSecondaryOk(); explainOut = secondary.getDB("test").runCommand({ - explain: {update: "explain_slaveok", updates: [{q: {a: 1}, u: {$set: {a: 5}}}]}, + explain: {update: "explain_secondaryok", updates: [{q: {a: 1}, u: {$set: {a: 5}}}]}, verbosity: "executionStats" }); -assert.commandFailed(explainOut, "explain write op on secondary, slaveOk true"); +assert.commandFailed(explainOut, "explain write op on secondary, secondaryOk true"); replTest.stopSet(); diff --git a/jstests/replsets/fsync_lock_read_secondaries.js b/jstests/replsets/fsync_lock_read_secondaries.js index e73ceab58ba..daed9de7ad6 100644 --- a/jstests/replsets/fsync_lock_read_secondaries.js +++ b/jstests/replsets/fsync_lock_read_secondaries.js @@ -50,7 +50,7 @@ replTest.awaitReplication(); // Calling getPrimary also populates '_secondaries'. var secondaries = replTest.getSecondaries(); -secondaries[0].setSlaveOk(); +secondaries[0].setSecondaryOk(); assert.commandWorked(secondaries[0].getDB("admin").runCommand({fsync: 1, lock: 1})); var docNum = 1000; diff --git a/jstests/replsets/groupAndMapReduce.js b/jstests/replsets/groupAndMapReduce.js index 270436bf62c..2723f800a07 100644 --- a/jstests/replsets/groupAndMapReduce.js +++ b/jstests/replsets/groupAndMapReduce.js @@ -36,7 +36,7 @@ doTest = function(signal) { assert(secondaries.length == 2, "Expected 2 secondaries but length was " + secondaries.length); secondaries.forEach(function(secondary) { // try to read from secondary - secondary.slaveOk = true; + secondary.setSecondaryOk(); var count = secondary.getDB("foo").foo.find().itcount(); printjson(count); assert.eq(len, count, "secondary count wrong: " + secondary); @@ -46,7 +46,7 @@ doTest = function(signal) { printjson(one); print("Calling inline mr() with slaveOk=true, must succeed"); - secondary.slaveOk = true; + secondary.setSecondaryOk(); map = function() { emit(this.a, 1); }; diff --git a/jstests/replsets/initial_sync4.js b/jstests/replsets/initial_sync4.js index 
80103839bfb..35dbd632715 100644 --- a/jstests/replsets/initial_sync4.js +++ b/jstests/replsets/initial_sync4.js @@ -45,7 +45,7 @@ jsTestLog("5. Wait for new node to start cloning"); - s.setSlaveOk(); + s.setSecondaryOk(); var sc = s.getDB("d")["c"]; wait(function() { diff --git a/jstests/replsets/initial_sync_ambiguous_index.js b/jstests/replsets/initial_sync_ambiguous_index.js index 7e415fade43..c50324db935 100644 --- a/jstests/replsets/initial_sync_ambiguous_index.js +++ b/jstests/replsets/initial_sync_ambiguous_index.js @@ -44,7 +44,7 @@ const secondary = rst.add({ rsConfig: {votes: 0, priority: 0}, setParameter: {"numInitialSyncAttempts": 1, 'collectionClonerBatchSize': clonerBatchSize} }); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryColl = secondary.getDB(dbName).getCollection(collectionName); // We set the collectionClonerBatchSize low above, so we will definitely hit diff --git a/jstests/replsets/initial_sync_applier_error.js b/jstests/replsets/initial_sync_applier_error.js index 7ef7058aea4..e880c739ef1 100644 --- a/jstests/replsets/initial_sync_applier_error.js +++ b/jstests/replsets/initial_sync_applier_error.js @@ -31,7 +31,7 @@ assert.commandWorked(coll.insert({_id: 0, content: "hi"})); // but before copying databases. 
var secondary = replSet.add({setParameter: "numInitialSyncAttempts=2", rsConfig: {votes: 0, priority: 0}}); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); let failPoint = configureFailPoint(secondary, 'initialSyncHangBeforeCopyingDatabases'); replSet.reInitiate(); diff --git a/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp.js b/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp.js index 023184c5dfb..292b0318ecd 100644 --- a/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp.js +++ b/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp.js @@ -139,7 +139,7 @@ replTest.awaitReplication(); jsTestLog("Initial sync completed"); // Make sure the secondary fetched enough transaction oplog entries. -secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryOplog = secondary.getDB("local").getCollection("oplog.rs"); assert.eq(secondaryOplog.find({"ts": beginFetchingTs}).itcount(), 1); diff --git a/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp_no_oplog_application.js b/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp_no_oplog_application.js index a4420ff9940..9b1839b4c43 100644 --- a/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp_no_oplog_application.js +++ b/jstests/replsets/initial_sync_fetch_from_oldest_active_transaction_timestamp_no_oplog_application.js @@ -98,7 +98,7 @@ replTest.waitForState(secondary, ReplSetTest.State.SECONDARY); jsTestLog("Initial sync completed"); // Make sure the secondary fetched enough transaction oplog entries. 
-secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryOplog = secondary.getDB("local").getCollection("oplog.rs"); assert.eq(secondaryOplog.find({"ts": beginFetchingTs}).itcount(), 1); diff --git a/jstests/replsets/initial_sync_invalid_views.js b/jstests/replsets/initial_sync_invalid_views.js index fb5a1975323..9faf5207608 100644 --- a/jstests/replsets/initial_sync_invalid_views.js +++ b/jstests/replsets/initial_sync_invalid_views.js @@ -18,7 +18,7 @@ assert.commandWorked(coll.insert({a: 1})); // Add a secondary node but make it hang before copying databases. let secondary = replSet.add({rsConfig: {votes: 0, priority: 0}}); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); assert.commandWorked(secondary.getDB('admin').runCommand( {configureFailPoint: 'initialSyncHangBeforeCopyingDatabases', mode: 'alwaysOn'})); diff --git a/jstests/replsets/initial_sync_move_forward.js b/jstests/replsets/initial_sync_move_forward.js index d5142d06b98..c99b529a700 100644 --- a/jstests/replsets/initial_sync_move_forward.js +++ b/jstests/replsets/initial_sync_move_forward.js @@ -41,7 +41,7 @@ assert.commandWorked(masterColl.ensureIndex({x: 1}, {unique: true})); // Add a secondary. var secondary = rst.add({setParameter: "numInitialSyncAttempts=1", rsConfig: {votes: 0, priority: 0}}); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); var secondaryColl = secondary.getDB("test").coll; // Pause initial sync when the secondary has copied {_id: 0, x: 0} and {_id: 1, x: 1}. diff --git a/jstests/replsets/initial_sync_oplog_rollover.js b/jstests/replsets/initial_sync_oplog_rollover.js index 268ec261d39..b9c1eda8f2f 100644 --- a/jstests/replsets/initial_sync_oplog_rollover.js +++ b/jstests/replsets/initial_sync_oplog_rollover.js @@ -38,7 +38,7 @@ var firstOplogEntry = getFirstOplogEntry(primary); // Add a secondary node but make it hang before copying databases. 
var secondary = replSet.add(); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); var failPoint = configureFailPoint(secondary, 'initialSyncHangBeforeCopyingDatabases'); replSet.reInitiate(); diff --git a/jstests/replsets/initial_sync_replSetGetStatus.js b/jstests/replsets/initial_sync_replSetGetStatus.js index d315421577a..3d999cc9553 100644 --- a/jstests/replsets/initial_sync_replSetGetStatus.js +++ b/jstests/replsets/initial_sync_replSetGetStatus.js @@ -24,7 +24,7 @@ assert.commandWorked(coll.insert({a: 2})); // Add a secondary node but make it hang before copying databases. var secondary = replSet.add({rsConfig: {votes: 0, priority: 0}}); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); var failPointBeforeCopying = configureFailPoint(secondary, 'initialSyncHangBeforeCopyingDatabases'); var failPointBeforeFinish = configureFailPoint(secondary, 'initialSyncHangBeforeFinish'); diff --git a/jstests/replsets/initial_sync_replicates_prepare_received_during_another_initial_sync.js b/jstests/replsets/initial_sync_replicates_prepare_received_during_another_initial_sync.js index 80012cab607..9ea82c52bb3 100644 --- a/jstests/replsets/initial_sync_replicates_prepare_received_during_another_initial_sync.js +++ b/jstests/replsets/initial_sync_replicates_prepare_received_during_another_initial_sync.js @@ -47,7 +47,7 @@ function restartSecondaryAndForceSyncSource(replSet, secondary, syncSource, dbNa // Wait for the secondary to complete initial sync. waitForState(secondary, ReplSetTest.State.SECONDARY); // Allow for secondary reads. - secondary.setSlaveOk(); + secondary.setSecondaryOk(); const secondaryDB = secondary.getDB(dbName); // Confirm that we have a prepared transaction in progress on the secondary. 
diff --git a/jstests/replsets/initial_sync_test_fixture_test.js b/jstests/replsets/initial_sync_test_fixture_test.js index 625620584a4..755df7a2109 100644 --- a/jstests/replsets/initial_sync_test_fixture_test.js +++ b/jstests/replsets/initial_sync_test_fixture_test.js @@ -108,7 +108,7 @@ let prepareTimestamp = PrepareHelpers.prepareTransaction(session); assert(!initialSyncTest.step()); secondary = initialSyncTest.getSecondary(); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); // Make sure that we cannot read from this node yet. assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), diff --git a/jstests/replsets/initial_sync_uuid_not_found.js b/jstests/replsets/initial_sync_uuid_not_found.js index 90e0024b76c..2e2911ee6dd 100644 --- a/jstests/replsets/initial_sync_uuid_not_found.js +++ b/jstests/replsets/initial_sync_uuid_not_found.js @@ -39,7 +39,7 @@ function ResyncWithFailpoint(failpointName, failpointData) { assert.eq(primary, rst.getPrimary(), 'Primary changed after reconfig'); jsTestLog('Wait for new node to start cloning'); - secondary.setSlaveOk(); + secondary.setSecondaryOk(); const secondaryDB = secondary.getDB(primaryDB.getName()); const secondaryColl = secondaryDB[primaryColl.getName()]; diff --git a/jstests/replsets/initial_sync_with_write_load.js b/jstests/replsets/initial_sync_with_write_load.js index fc1164c6c43..0474c1f9c10 100644 --- a/jstests/replsets/initial_sync_with_write_load.js +++ b/jstests/replsets/initial_sync_with_write_load.js @@ -24,8 +24,8 @@ replTest.waitForState(replTest.nodes[0], ReplSetTest.State.PRIMARY); var master = replTest.getPrimary(); var a_conn = conns[0]; var b_conn = conns[1]; -a_conn.setSlaveOk(); -b_conn.setSlaveOk(); +a_conn.setSecondaryOk(); +b_conn.setSecondaryOk(); var A = a_conn.getDB("test"); var B = b_conn.getDB("test"); var AID = replTest.getNodeId(a_conn); diff --git a/jstests/replsets/kill_reads_with_prepare_conflicts_during_step_up.js 
b/jstests/replsets/kill_reads_with_prepare_conflicts_during_step_up.js index 1c71eb94b66..da8c8fd544d 100644 --- a/jstests/replsets/kill_reads_with_prepare_conflicts_during_step_up.js +++ b/jstests/replsets/kill_reads_with_prepare_conflicts_during_step_up.js @@ -74,7 +74,7 @@ TestData.clusterTime = clusterTimeAfterPrepare; const waitForSecondaryReadBlockedOnPrepareConflictThread = startParallelShell(() => { // Allow for secondary reads. - db.getMongo().setSlaveOk(); + db.getMongo().setSecondaryOk(); const parallelTestDB = db.getSiblingDB(TestData.dbName); const parallelTestCollName = TestData.collName; diff --git a/jstests/replsets/libs/initial_sync_update_missing_doc.js b/jstests/replsets/libs/initial_sync_update_missing_doc.js index f45d9a4e107..68bb14ac668 100644 --- a/jstests/replsets/libs/initial_sync_update_missing_doc.js +++ b/jstests/replsets/libs/initial_sync_update_missing_doc.js @@ -17,7 +17,7 @@ load("jstests/libs/fail_point_util.js"); // must be called after reInitiateSetWithSecondary. var reInitiateSetWithSecondary = function(replSet, secondaryConfig) { const secondary = replSet.add(secondaryConfig); - secondary.setSlaveOk(); + secondary.setSecondaryOk(); // Make the secondary hang after retrieving the last op on the sync source but before // copying databases. diff --git a/jstests/replsets/libs/rollback_resumable_index_build.js b/jstests/replsets/libs/rollback_resumable_index_build.js index 0ab2148e783..e5c63d91ada 100644 --- a/jstests/replsets/libs/rollback_resumable_index_build.js +++ b/jstests/replsets/libs/rollback_resumable_index_build.js @@ -7,8 +7,10 @@ const RollbackResumableIndexBuildTest = class { * rollback starts is specified by rollbackStartFailPointName. The phase that the index build * will resume from after rollback completes is specified by rollbackEndFailPointName. If * either of these points is in the drain writes phase, documents to insert into the side - * writes table must be specified by sideWrites. 
Documents specified by insertsToBeRolledBack - * are inserted after transitioning to rollback operations and will be rolled back. + * writes table must be specified by sideWrites. locksYieldedFailPointName specifies a point + * during the index build between rollbackEndFailPointName and rollbackStartFailPointName at + * which its locks are yielded. Documents specified by insertsToBeRolledBack are inserted after + * transitioning to rollback operations and will be rolled back. */ static run(rollbackTest, dbName, @@ -18,6 +20,7 @@ const RollbackResumableIndexBuildTest = class { rollbackStartFailPointData, rollbackEndFailPointName, rollbackEndFailPointData, + locksYieldedFailPointName, insertsToBeRolledBack, sideWrites = []) { const originalPrimary = rollbackTest.getPrimary(); @@ -29,6 +32,14 @@ const RollbackResumableIndexBuildTest = class { rollbackTest.awaitLastOpCommitted(); + // Set internalQueryExecYieldIterations to 0 and maxIndexBuildDrainBatchSize to 1 so that + // the index build is guaranteed to yield its locks between the rollback end and start + // failpoints. + assert.commandWorked( + originalPrimary.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 0})); + assert.commandWorked( + originalPrimary.adminCommand({setParameter: 1, maxIndexBuildDrainBatchSize: 1})); + const coll = originalPrimary.getDB(dbName).getCollection(collName); const indexName = "rollback_resumable_index_build"; @@ -57,32 +68,23 @@ const RollbackResumableIndexBuildTest = class { assert.commandWorked(coll.insert(insertsToBeRolledBack)); - // Disable the failpoint in a parallel shell so that the primary can step down when the - // rollback test is transitioning to sync source operations before rollback. - const awaitDisableFailPointAfterContinuingInBackground = startParallelShell( - funWithArgs(function(failPointName, buildUUID) { - // Wait for the index build to be continue in the background. 
- checkLog.containsJson(db.getMongo(), 4760400, { - buildUUID: function(uuid) { - return uuid["uuid"]["$uuid"] === buildUUID; - } - }); - - // Disable the failpoint so that stepdown can proceed. - assert.commandWorked( - db.adminCommand({configureFailPoint: failPointName, mode: "off"})); - }, rollbackEndFp.failPointName, buildUUID), originalPrimary.port); + // Move the index build forward to a point at which its locks are yielded. This allows the + // primary to step down during the call to transitionToSyncSourceOperationsBeforeRollback() + // below. + const locksYieldedFp = configureFailPoint( + originalPrimary, locksYieldedFailPointName, {namespace: coll.getFullName()}); + rollbackEndFp.off(); + locksYieldedFp.wait(); rollbackTest.transitionToSyncSourceOperationsBeforeRollback(); - awaitDisableFailPointAfterContinuingInBackground(); - // The index creation will report as having failed due to InterruptedDueToReplStateChange, // but it is still building in the background. awaitCreateIndex(); // Wait until the index build reaches the desired starting point so that we can start the // rollback. 
+ locksYieldedFp.off(); rollbackStartFp.wait(); // We ignore the return value here because the node will go into rollback immediately upon diff --git a/jstests/replsets/libs/secondary_reads_test.js b/jstests/replsets/libs/secondary_reads_test.js index 1d712fce05a..4840708dba2 100644 --- a/jstests/replsets/libs/secondary_reads_test.js +++ b/jstests/replsets/libs/secondary_reads_test.js @@ -14,7 +14,7 @@ function SecondaryReadsTest(name = "secondary_reads_test") { let primaryDB = primary.getDB(dbName); let secondary = rst.getSecondary(); let secondaryDB = secondary.getDB(dbName); - secondaryDB.getMongo().setSlaveOk(); + secondaryDB.getMongo().setSecondaryOk(); let readers = []; let signalColl = "signalColl"; @@ -37,7 +37,7 @@ function SecondaryReadsTest(name = "secondary_reads_test") { this.startSecondaryReaders = function(nReaders, readFn) { let read = function() { - db.getMongo().setSlaveOk(); + db.getMongo().setSecondaryOk(); db = db.getSiblingDB(TestData.dbName); while (true) { readFn(); diff --git a/jstests/replsets/maintenance2.js b/jstests/replsets/maintenance2.js index 2b904346945..c62d6bf17b6 100644 --- a/jstests/replsets/maintenance2.js +++ b/jstests/replsets/maintenance2.js @@ -40,7 +40,7 @@ secondaries.forEach(function(secondary) { assert.eq(stats.myState, 3, "Secondary should be in recovering state."); print("count should fail in recovering state..."); - secondary.slaveOk = true; + secondary.setSecondaryOk(); assert.commandFailed(secondary.getDB("foo").runCommand({count: "foo"})); // unset maintenance mode when done diff --git a/jstests/replsets/no_disconnect_on_stepdown.js b/jstests/replsets/no_disconnect_on_stepdown.js index 68877c6fc64..77a5526c50b 100644 --- a/jstests/replsets/no_disconnect_on_stepdown.js +++ b/jstests/replsets/no_disconnect_on_stepdown.js @@ -73,7 +73,7 @@ function runStepDownTest({description, failpoint, operation, errorCode}) { assert.commandWorked(primaryAdmin.adminCommand({serverStatus: 1})).metrics.repl; 
assert.eq(replMetrics.stateTransition.lastStateTransition, "stepDown"); assert.eq(replMetrics.stateTransition.userOperationsKilled, 1); - assert.eq(replMetrics.network.notMasterUnacknowledgedWrites, 0); + assert.eq(replMetrics.network.notPrimaryUnacknowledgedWrites, 0); // Allow the primary to be re-elected, and wait for it. assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0})); diff --git a/jstests/replsets/not_master_unacknowledged_write.js b/jstests/replsets/not_master_unacknowledged_write.js index a1570de931a..1fc65ddb7ba 100644 --- a/jstests/replsets/not_master_unacknowledged_write.js +++ b/jstests/replsets/not_master_unacknowledged_write.js @@ -5,12 +5,12 @@ (function() { "use strict"; -function getNotMasterUnackWritesCounter() { +function getNotPrimaryUnackWritesCounter() { return assert.commandWorked(primaryDB.adminCommand({serverStatus: 1})) - .metrics.repl.network.notMasterUnacknowledgedWrites; + .metrics.repl.network.notPrimaryUnacknowledgedWrites; } -const collName = "not_master_unacknowledged_write"; +const collName = "not_primary_unacknowledged_write"; var rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0}}]}); rst.startSet(); @@ -22,8 +22,8 @@ var secondaryDB = secondary.getDB("test"); var primaryColl = primaryDB[collName]; var secondaryColl = secondaryDB[collName]; -// Verify that reading from secondaries does not impact `notMasterUnacknowledgedWrites`. -const preReadingCounter = getNotMasterUnackWritesCounter(); +// Verify that reading from secondaries does not impact `notPrimaryUnacknowledgedWrites`. 
+const preReadingCounter = getNotPrimaryUnackWritesCounter(); jsTestLog("Reading from secondary ..."); [{name: "findOne", fn: () => secondaryColl.findOne()}, {name: "distinct", fn: () => secondaryColl.distinct("item")}, @@ -32,7 +32,7 @@ jsTestLog("Reading from secondary ..."); assert.doesNotThrow(fn); assert.eq(assert.commandWorked(secondary.getDB("admin").isMaster()).ismaster, false); }); -const postReadingCounter = getNotMasterUnackWritesCounter(); +const postReadingCounter = getNotPrimaryUnackWritesCounter(); assert.eq(preReadingCounter, postReadingCounter); jsTestLog("Primary on port " + primary.port + " hangs up on unacknowledged writes"); @@ -71,7 +71,7 @@ var command = var awaitShell = startParallelShell(command, primary.port); -let failedUnackWritesBefore = getNotMasterUnackWritesCounter(); +let failedUnackWritesBefore = getNotPrimaryUnackWritesCounter(); jsTestLog("Beginning unacknowledged insert"); primaryColl.insertOne({}, {writeConcern: {w: 0}}); @@ -87,7 +87,7 @@ assert.includes(result.toString(), "network error while attempting to run comman // Validate the number of unacknowledged writes failed due to step down resulted in network // disconnection. -let failedUnackWritesAfter = getNotMasterUnackWritesCounter(); +let failedUnackWritesAfter = getNotPrimaryUnackWritesCounter(); assert.eq(failedUnackWritesAfter, failedUnackWritesBefore + 1); rst.stopSet(); diff --git a/jstests/replsets/plan_cache_slaveok.js b/jstests/replsets/plan_cache_slaveok.js index 4ef60d93795..c20decf9eb1 100644 --- a/jstests/replsets/plan_cache_slaveok.js +++ b/jstests/replsets/plan_cache_slaveok.js @@ -1,7 +1,7 @@ // Verify that the plan cache and index filter commands can be run on secondaries, but only -// if slave ok is explicitly set. +// if secondaryOk is explicitly set. 
-var name = "plan_cache_slaveok"; +var name = "plan_cache_secondaryok"; function assertPlanCacheCommandsSucceed(db) { assert.commandWorked(db.runCommand({planCacheClear: name, query: {a: 1}})); @@ -50,13 +50,13 @@ assert.eq(1, primary.getDB("test")[name].findOne({a: 1})["a"]); // Make sure the plan cache commands succeed on the primary. assertPlanCacheCommandsSucceed(primary.getDB("test")); -// With slave ok false, the commands should fail on the secondary. +// With secondaryOk false, the commands should fail on the secondary. var secondary = replTest.getSecondary(); -secondary.getDB("test").getMongo().setSlaveOk(false); +secondary.getDB("test").getMongo().setSecondaryOk(false); assertPlanCacheCommandsFail(secondary.getDB("test")); -// With slave ok true, the commands should succeed on the secondary. -secondary.getDB("test").getMongo().setSlaveOk(true); +// With secondaryOk true, the commands should succeed on the secondary. +secondary.getDB("test").getMongo().setSecondaryOk(); assertPlanCacheCommandsSucceed(secondary.getDB("test")); replTest.stopSet(); diff --git a/jstests/replsets/prepare_transaction_read_at_cluster_time.js b/jstests/replsets/prepare_transaction_read_at_cluster_time.js index 24894823b1a..1e6ae30b5d6 100644 --- a/jstests/replsets/prepare_transaction_read_at_cluster_time.js +++ b/jstests/replsets/prepare_transaction_read_at_cluster_time.js @@ -16,7 +16,7 @@ const runDBHashFn = (host, dbName, clusterTime, useSnapshot) => { const conn = new Mongo(host); const db = conn.getDB(dbName); - conn.setSlaveOk(); + conn.setSecondaryOk(); let cmd; if (useSnapshot) { cmd = {dbHash: 1, readConcern: {level: "snapshot", atClusterTime: eval(clusterTime)}}; diff --git a/jstests/replsets/print_secondary_replication_info_unreachable_secondary.js b/jstests/replsets/print_secondary_replication_info_unreachable_secondary.js new file mode 100644 index 00000000000..4948ac85801 --- /dev/null +++ b/jstests/replsets/print_secondary_replication_info_unreachable_secondary.js @@ 
-0,0 +1,25 @@ +// Tests the output of db.printSecondaryReplicationInfo() for unreachable secondaries. + +(function() { +"use strict"; +const name = "printSecondaryReplicationInfo"; +const replSet = new ReplSetTest({name: name, nodes: 2}); +replSet.startSet(); +replSet.initiateWithHighElectionTimeout(); + +const primary = replSet.getPrimary(); +primary.getDB('test').foo.insert({a: 1}); +replSet.awaitReplication(); + +const secondary = replSet.getSecondary(); +replSet.stop(replSet.getNodeId(secondary)); +replSet.waitForState(secondary, ReplSetTest.State.DOWN); + +const joinShell = + startParallelShell("db.getSiblingDB('admin').printSecondaryReplicationInfo();", primary.port); +joinShell(); +assert( + rawMongoProgramOutput().match("no replication info, yet. State: \\(not reachable/healthy\\)")); + +replSet.stopSet(); +})(); diff --git a/jstests/replsets/quiesce_mode.js b/jstests/replsets/quiesce_mode.js index ae47952cbf8..52c39f3edbd 100644 --- a/jstests/replsets/quiesce_mode.js +++ b/jstests/replsets/quiesce_mode.js @@ -48,7 +48,7 @@ function runAwaitableIsMaster(topologyVersionField) { } function runFind() { - db.getMongo().setSlaveOk(); + db.getMongo().setSecondaryOk(); assert.eq(4, db.getSiblingDB("test").coll.find().itcount()); } diff --git a/jstests/replsets/read_committed_after_rollback.js b/jstests/replsets/read_committed_after_rollback.js index 41bd1d29268..a7e46e15e86 100644 --- a/jstests/replsets/read_committed_after_rollback.js +++ b/jstests/replsets/read_committed_after_rollback.js @@ -75,7 +75,7 @@ assert.eq(doDirtyRead(oldPrimaryColl), 'INVALID'); assert.eq(doCommittedRead(oldPrimaryColl), 'old'); // Change the partitioning so that oldPrimary is isolated, and newPrimary can be elected. 
-oldPrimary.setSlaveOk(); +oldPrimary.setSecondaryOk(); oldPrimary.disconnect(arbiters); newPrimary.reconnect(arbiters); assert.soon(() => newPrimary.adminCommand('isMaster').ismaster, '', 60 * 1000); diff --git a/jstests/replsets/read_committed_no_snapshots.js b/jstests/replsets/read_committed_no_snapshots.js index a0fe52cd565..280b0de7d49 100644 --- a/jstests/replsets/read_committed_no_snapshots.js +++ b/jstests/replsets/read_committed_no_snapshots.js @@ -38,9 +38,9 @@ replTest.initiateWithAnyNodeAsPrimary( var primary = replTest.getPrimary(); var secondaries = replTest.getSecondaries(); var healthySecondary = secondaries[0]; -healthySecondary.setSlaveOk(); +healthySecondary.setSecondaryOk(); var noSnapshotSecondary = secondaries[1]; -noSnapshotSecondary.setSlaveOk(); +noSnapshotSecondary.setSecondaryOk(); // Do a write, wait for it to replicate, and ensure it is visible. var res = primary.getDB(name).runCommandWithMetadata( // diff --git a/jstests/replsets/read_operations_during_rollback.js b/jstests/replsets/read_operations_during_rollback.js index d743c7b8303..f91ba15d31b 100644 --- a/jstests/replsets/read_operations_during_rollback.js +++ b/jstests/replsets/read_operations_during_rollback.js @@ -24,7 +24,7 @@ setFailPoint(rollbackNode, "rollbackHangAfterTransitionToRollback"); setFailPoint(rollbackNode, "GetMoreHangBeforeReadLock"); const joinGetMoreThread = startParallelShell(() => { - db.getMongo().setSlaveOk(); + db.getMongo().setSecondaryOk(); const cursorID = assert.commandWorked(db.runCommand({"find": "coll", batchSize: 0})).cursor.id; // Make sure an outstanding read operation gets killed during rollback even though the read // was started before rollback. 
Outstanding read operations are killed during rollback and diff --git a/jstests/replsets/read_operations_during_step_down.js b/jstests/replsets/read_operations_during_step_down.js index 96fa2651237..4909c7a9177 100644 --- a/jstests/replsets/read_operations_during_step_down.js +++ b/jstests/replsets/read_operations_during_step_down.js @@ -113,7 +113,7 @@ assert.eq(replMetrics.stateTransition.lastStateTransition, "stepDown"); assert.eq(replMetrics.stateTransition.userOperationsKilled, 0); // Should account for find and getmore commands issued before step down. assert.gte(replMetrics.stateTransition.userOperationsRunning, 2); -assert.eq(replMetrics.network.notMasterUnacknowledgedWrites, 0); +assert.eq(replMetrics.network.notPrimaryUnacknowledgedWrites, 0); rst.stopSet(); })(); diff --git a/jstests/replsets/read_operations_during_step_up.js b/jstests/replsets/read_operations_during_step_up.js index 91d202659a4..86ea4a3c0d6 100644 --- a/jstests/replsets/read_operations_during_step_up.js +++ b/jstests/replsets/read_operations_during_step_up.js @@ -36,10 +36,10 @@ assert.commandWorked(        primaryColl.insert({_id: 0}, {"writeConcern": {"w": "majority"}})); rst.awaitReplication(); -// It's possible for notMasterUnacknowledgedWrites to be non-zero because of mirrored reads during +// It's possible for notPrimaryUnacknowledgedWrites to be non-zero because of mirrored reads during // initial sync. let replMetrics = assert.commandWorked(secondaryAdmin.adminCommand({serverStatus: 1})).metrics.repl; -const startingNumNotMasterErrors = replMetrics.network.notMasterUnacknowledgedWrites; +const startingNumNotMasterErrors = replMetrics.network.notPrimaryUnacknowledgedWrites; // Open a cursor on secondary. const cursorIdToBeReadAfterStepUp = @@ -49,7 +49,7 @@ jsTestLog("2. Start blocking getMore cmd before step up"); const joinGetMoreThread = startParallelShell(() => { // Open another cursor on secondary before step up. 
secondaryDB = db.getSiblingDB(TestData.dbName); - secondaryDB.getMongo().setSlaveOk(true); + secondaryDB.getMongo().setSecondaryOk(); const cursorIdToBeReadDuringStepUp = assert.commandWorked(secondaryDB.runCommand({"find": TestData.collName, batchSize: 0})) @@ -71,7 +71,7 @@ waitForCurOpByFailPoint( jsTestLog("2. Start blocking find cmd before step up"); const joinFindThread = startParallelShell(() => { secondaryDB = db.getSiblingDB(TestData.dbName); - secondaryDB.getMongo().setSlaveOk(true); + secondaryDB.getMongo().setSecondaryOk(); // Enable the fail point for find cmd. assert.commandWorked( @@ -127,7 +127,7 @@ assert.eq(replMetrics.stateTransition.lastStateTransition, "stepUp"); assert.eq(replMetrics.stateTransition.userOperationsKilled, 0); // Should account for find and getmore commands issued before step up. assert.gte(replMetrics.stateTransition.userOperationsRunning, 2); -assert.eq(replMetrics.network.notMasterUnacknowledgedWrites, startingNumNotMasterErrors); +assert.eq(replMetrics.network.notPrimaryUnacknowledgedWrites, startingNumNotMasterErrors); rst.stopSet(); })(); diff --git a/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js b/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js index 6257f066eab..b72dd806295 100644 --- a/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js +++ b/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js @@ -129,7 +129,7 @@ replTest.awaitSecondaryNodes(); jsTestLog("Initial sync completed"); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryColl = secondary.getDB(dbName).getCollection(collName); // Make sure that while reading from the node that went through initial sync, we can't read diff --git a/jstests/replsets/reconstruct_prepared_transactions_initial_sync_index_build.js b/jstests/replsets/reconstruct_prepared_transactions_initial_sync_index_build.js index 3574010f636..38f7f431ca2 100644 --- 
a/jstests/replsets/reconstruct_prepared_transactions_initial_sync_index_build.js +++ b/jstests/replsets/reconstruct_prepared_transactions_initial_sync_index_build.js @@ -106,7 +106,7 @@ replTest.awaitSecondaryNodes(); jsTestLog("Initial sync completed"); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryColl = secondary.getDB(dbName).getCollection(collName); // Make sure that while reading from the node that went through initial sync, we can't read diff --git a/jstests/replsets/reconstruct_prepared_transactions_initial_sync_no_oplog_application.js b/jstests/replsets/reconstruct_prepared_transactions_initial_sync_no_oplog_application.js index dbc2c05dfff..d5b0eb39898 100644 --- a/jstests/replsets/reconstruct_prepared_transactions_initial_sync_no_oplog_application.js +++ b/jstests/replsets/reconstruct_prepared_transactions_initial_sync_no_oplog_application.js @@ -80,7 +80,7 @@ replTest.awaitSecondaryNodes(); jsTestLog("Initial sync completed"); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryColl = secondary.getDB(dbName).getCollection(collName); // Make sure that while reading from the node that went through initial sync, we can't read diff --git a/jstests/replsets/reconstruct_prepared_transactions_initial_sync_on_oplog_seed.js b/jstests/replsets/reconstruct_prepared_transactions_initial_sync_on_oplog_seed.js index 85e4b4e9874..f5100c39e35 100644 --- a/jstests/replsets/reconstruct_prepared_transactions_initial_sync_on_oplog_seed.js +++ b/jstests/replsets/reconstruct_prepared_transactions_initial_sync_on_oplog_seed.js @@ -107,7 +107,7 @@ PrepareHelpers.awaitMajorityCommitted(replTest, prepareTimestamp); jsTestLog("Initial sync completed"); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); const secondaryColl = secondary.getDB(dbName).getCollection(collName); jsTestLog("Checking that the transaction is properly prepared"); diff --git a/jstests/replsets/recover_prepared_transactions_startup_secondary_application.js 
b/jstests/replsets/recover_prepared_transactions_startup_secondary_application.js index 31845da9629..56f40a8793f 100644 --- a/jstests/replsets/recover_prepared_transactions_startup_secondary_application.js +++ b/jstests/replsets/recover_prepared_transactions_startup_secondary_application.js @@ -82,7 +82,7 @@ PrepareHelpers.awaitMajorityCommitted(replTest, prepareTimestamp2); // Wait for the node to complete recovery before trying to read from it. replTest.awaitSecondaryNodes(); -secondary.setSlaveOk(); +secondary.setSecondaryOk(); jsTestLog("Checking that the first transaction is properly prepared"); diff --git a/jstests/replsets/rename_collection_temp.js b/jstests/replsets/rename_collection_temp.js index dc6ffd6f84a..1bf716784d6 100644 --- a/jstests/replsets/rename_collection_temp.js +++ b/jstests/replsets/rename_collection_temp.js @@ -54,7 +54,7 @@ replTest.awaitReplication(); var secondary = replTest.getSecondary(); var secondaryFoo = secondary.getDB("foo"); -secondaryFoo.permanentColl.setSlaveOk(true); +secondaryFoo.permanentColl.setSecondaryOk(); // Get the information on the secondary to ensure it was replicated correctly. 
checkCollectionTemp(secondaryFoo, "permanentColl", false); diff --git a/jstests/replsets/replset1.js b/jstests/replsets/replset1.js index 8225422338d..35ceb70121f 100644 --- a/jstests/replsets/replset1.js +++ b/jstests/replsets/replset1.js @@ -104,7 +104,7 @@ var doTest = function(signal) { var secondaries = replTest.getSecondaries(); assert(secondaries.length == 2, "Expected 2 secondaries but length was " + secondaries.length); secondaries.forEach(function(secondary) { - secondary.setSlaveOk(); + secondary.setSecondaryOk(); var count = secondary.getDB("bar").runCommand({count: "bar"}); printjson(count); assert.eq(1000, count.n, "secondary count wrong: " + secondary); @@ -118,7 +118,7 @@ var doTest = function(signal) { var t = db.foo; var ts = secondaries.map(function(z) { - z.setSlaveOk(); + z.setSecondaryOk(); return z.getDB("foo").foo; }); diff --git a/jstests/replsets/replset2.js b/jstests/replsets/replset2.js index 3c9b9613eed..38134794dfb 100644 --- a/jstests/replsets/replset2.js +++ b/jstests/replsets/replset2.js @@ -29,7 +29,7 @@ doTest = function(signal) { var secondaries = replTest.getSecondaries(); secondaries.forEach(function(secondary) { - secondary.setSlaveOk(); + secondary.setSecondaryOk(); }); // Test write concern with multiple inserts. 
diff --git a/jstests/replsets/replset5.js b/jstests/replsets/replset5.js index 5488d8a9cd0..9ea1424a426 100644 --- a/jstests/replsets/replset5.js +++ b/jstests/replsets/replset5.js @@ -55,8 +55,8 @@ if (wcError != null) { } var secondaries = replTest.getSecondaries(); -secondaries[0].setSlaveOk(); -secondaries[1].setSlaveOk(); +secondaries[0].setSecondaryOk(); +secondaries[1].setSecondaryOk(); var secondary0Count = secondaries[0].getDB(testDB).foo.find().itcount(); assert(secondary0Count == docNum, diff --git a/jstests/replsets/replset6.js b/jstests/replsets/replset6.js index 40998d7f4f2..fd33175d823 100644 --- a/jstests/replsets/replset6.js +++ b/jstests/replsets/replset6.js @@ -10,7 +10,7 @@ var p = rt.getPrimary(); rt.awaitSecondaryNodes(); var secondaries = rt.getSecondaries(); s = secondaries[0]; -s.setSlaveOk(); +s.setSecondaryOk(); admin = p.getDB("admin"); debug = function(foo) {}; // print( foo ); } diff --git a/jstests/replsets/resync_majority_member.js b/jstests/replsets/resync_majority_member.js index 4ab48f043ea..df0d233b4e3 100644 --- a/jstests/replsets/resync_majority_member.js +++ b/jstests/replsets/resync_majority_member.js @@ -99,8 +99,8 @@ assert.soon(() => { }); // Observe that the old write does not exist anywhere in the set. 
-syncSource.setSlaveOk(); -resyncNode.setSlaveOk(); +syncSource.setSecondaryOk(); +resyncNode.setSecondaryOk(); assert.eq(0, syncSource.getDB(dbName)[collName].find(disappearingDoc).itcount()); assert.eq(0, resyncNode.getDB(dbName)[collName].find(disappearingDoc).itcount()); diff --git a/jstests/replsets/rollback_auth.js b/jstests/replsets/rollback_auth.js index 0aa7995bdc2..372f84ce645 100644 --- a/jstests/replsets/rollback_auth.js +++ b/jstests/replsets/rollback_auth.js @@ -47,8 +47,8 @@ replTest.waitForState(replTest.nodes[0], ReplSetTest.State.PRIMARY); var master = replTest.getPrimary(); var a_conn = conns[0]; var b_conn = conns[1]; -a_conn.setSlaveOk(); -b_conn.setSlaveOk(); +a_conn.setSecondaryOk(); +b_conn.setSecondaryOk(); var A = a_conn.getDB("admin"); var B = b_conn.getDB("admin"); var a = a_conn.getDB("test"); diff --git a/jstests/replsets/rollback_creates_rollback_directory.js b/jstests/replsets/rollback_creates_rollback_directory.js index 3cb47eb65a2..db795769bdf 100644 --- a/jstests/replsets/rollback_creates_rollback_directory.js +++ b/jstests/replsets/rollback_creates_rollback_directory.js @@ -31,8 +31,8 @@ function runRollbackDirectoryTest(shouldCreateRollbackFiles) { var master = replTest.getPrimary(); var a_conn = conns[0]; var b_conn = conns[1]; - a_conn.setSlaveOk(); - b_conn.setSlaveOk(); + a_conn.setSecondaryOk(); + b_conn.setSecondaryOk(); var A = a_conn.getDB("test"); var B = b_conn.getDB("test"); var Apath = replTest.getDbPath(a_conn) + '/'; diff --git a/jstests/replsets/rollback_crud_op_sequences.js b/jstests/replsets/rollback_crud_op_sequences.js index a2e89332141..cd42c303a96 100644 --- a/jstests/replsets/rollback_crud_op_sequences.js +++ b/jstests/replsets/rollback_crud_op_sequences.js @@ -45,10 +45,10 @@ replTest.initiate({ replTest.waitForState(replTest.nodes[0], ReplSetTest.State.PRIMARY); var master = replTest.getPrimary(); var a_conn = conns[0]; -a_conn.setSlaveOk(); +a_conn.setSecondaryOk(); var A = a_conn.getDB("admin"); var 
b_conn = conns[1]; -b_conn.setSlaveOk(); +b_conn.setSecondaryOk(); var B = b_conn.getDB("admin"); assert.eq(master, conns[0], "conns[0] assumed to be master"); assert.eq(a_conn, master); diff --git a/jstests/replsets/rollback_ddl_op_sequences.js b/jstests/replsets/rollback_ddl_op_sequences.js index aff9cedaa39..62b2fb9cae2 100644 --- a/jstests/replsets/rollback_ddl_op_sequences.js +++ b/jstests/replsets/rollback_ddl_op_sequences.js @@ -54,10 +54,10 @@ replTest.initiate({ replTest.waitForState(replTest.nodes[0], ReplSetTest.State.PRIMARY); var master = replTest.getPrimary(); var a_conn = conns[0]; -a_conn.setSlaveOk(); +a_conn.setSecondaryOk(); var A = a_conn.getDB("admin"); var b_conn = conns[1]; -b_conn.setSlaveOk(); +b_conn.setSecondaryOk(); var B = b_conn.getDB("admin"); assert.eq(master, conns[0], "conns[0] assumed to be master"); assert.eq(a_conn, master); diff --git a/jstests/replsets/rollback_resumable_index_build_bulk_load_phase.js b/jstests/replsets/rollback_resumable_index_build_bulk_load_phase.js index 81631163f06..5b75f1b7b2c 100644 --- a/jstests/replsets/rollback_resumable_index_build_bulk_load_phase.js +++ b/jstests/replsets/rollback_resumable_index_build_bulk_load_phase.js @@ -13,12 +13,6 @@ load('jstests/replsets/libs/rollback_resumable_index_build.js'); -// TODO(SERVER-50775): Re-enable when stepdown issues are fixed in resumable index rollback tests. -if (true) { - jsTestLog('Skipping test.'); - return; -} - const dbName = "test"; const rollbackStartFailPointName = "hangIndexBuildDuringBulkLoadPhase"; const insertsToBeRolledBack = [{a: 4}, {a: 5}]; @@ -35,8 +29,9 @@ RollbackResumableIndexBuildTest.run(rollbackTest, {a: 1}, rollbackStartFailPointName, {iteration: 1}, - "hangAfterSettingUpIndexBuildUnlocked", + "hangAfterSettingUpIndexBuild", {}, + "setYieldAllLocksHang", insertsToBeRolledBack); // Rollback to the collection scan phase. 
@@ -47,7 +42,8 @@ RollbackResumableIndexBuildTest.run(rollbackTest, rollbackStartFailPointName, {iteration: 1}, "hangIndexBuildDuringCollectionScanPhaseBeforeInsertion", - {fieldsToMatch: {a: 2}}, + {iteration: 1}, + "setYieldAllLocksHang", insertsToBeRolledBack); rollbackTest.stop(); diff --git a/jstests/replsets/rollback_resumable_index_build_collection_scan_phase.js b/jstests/replsets/rollback_resumable_index_build_collection_scan_phase.js index 23807d85383..33abffdc2e3 100644 --- a/jstests/replsets/rollback_resumable_index_build_collection_scan_phase.js +++ b/jstests/replsets/rollback_resumable_index_build_collection_scan_phase.js @@ -13,12 +13,6 @@ load('jstests/replsets/libs/rollback_resumable_index_build.js'); -// TODO(SERVER-50775): Re-enable when stepdown issues are fixed in resumable index rollback tests. -if (true) { - jsTestLog('Skipping test.'); - return; -} - const dbName = "test"; const rollbackStartFailPointName = "hangIndexBuildDuringCollectionScanPhaseBeforeInsertion"; const insertsToBeRolledBack = [{a: 6}, {a: 7}]; @@ -34,9 +28,10 @@ RollbackResumableIndexBuildTest.run(rollbackTest, coll.getName(), {a: 1}, rollbackStartFailPointName, - {fieldsToMatch: {a: 2}}, - "hangAfterSettingUpIndexBuildUnlocked", + {iteration: 3}, + "hangAfterSettingUpIndexBuild", {}, + "setYieldAllLocksHang", insertsToBeRolledBack); // Rollback to earlier in the collection scan phase. 
@@ -45,9 +40,10 @@ RollbackResumableIndexBuildTest.run(rollbackTest, coll.getName(), {a: 1}, rollbackStartFailPointName, - {iteration: 4}, + {iteration: 3}, "hangIndexBuildDuringCollectionScanPhaseAfterInsertion", - {iteration: 2}, + {iteration: 1}, + "setYieldAllLocksHang", insertsToBeRolledBack); rollbackTest.stop(); diff --git a/jstests/replsets/rollback_resumable_index_build_complete.js b/jstests/replsets/rollback_resumable_index_build_complete.js index 51f5988f4c9..cb74fc96345 100644 --- a/jstests/replsets/rollback_resumable_index_build_complete.js +++ b/jstests/replsets/rollback_resumable_index_build_complete.js @@ -13,12 +13,6 @@ load('jstests/replsets/libs/rollback_resumable_index_build.js'); -// TODO(SERVER-50775): Re-enable when stepdown issues are fixed in resumable index rollback tests. -if (true) { - jsTestLog('Skipping test.'); - return; -} - const dbName = "test"; const insertsToBeRolledBack = [{a: 7}, {a: 8}]; @@ -32,7 +26,7 @@ RollbackResumableIndexBuildTest.runIndexBuildComplete(rollbackTest, dbName, coll.getName(), {a: 1}, - "hangAfterSettingUpIndexBuildUnlocked", + "hangAfterSettingUpIndexBuild", {}, insertsToBeRolledBack); diff --git a/jstests/replsets/rollback_resumable_index_build_drain_writes_phase.js b/jstests/replsets/rollback_resumable_index_build_drain_writes_phase.js index 4e025596884..922ee451e01 100644 --- a/jstests/replsets/rollback_resumable_index_build_drain_writes_phase.js +++ b/jstests/replsets/rollback_resumable_index_build_drain_writes_phase.js @@ -13,15 +13,9 @@ load('jstests/replsets/libs/rollback_resumable_index_build.js'); -// TODO(SERVER-50775): Re-enable when stepdown issues are fixed in resumable index rollback tests. 
-if (true) { - jsTestLog('Skipping test.'); - return; -} - const dbName = "test"; const rollbackStartFailPointName = "hangIndexBuildDuringDrainWritesPhase"; -const insertsToBeRolledBack = [{a: 13}, {a: 14}]; +const insertsToBeRolledBack = [{a: 18}, {a: 19}]; const rollbackTest = new RollbackTest(jsTestName()); const coll = rollbackTest.getPrimary().getDB(dbName).getCollection(jsTestName()); @@ -34,11 +28,12 @@ RollbackResumableIndexBuildTest.run(rollbackTest, coll.getName(), {a: 1}, rollbackStartFailPointName, - {iteration: 0}, - "hangAfterSettingUpIndexBuildUnlocked", + {iteration: 1}, + "hangAfterSettingUpIndexBuild", {}, + "hangDuringIndexBuildDrainYield", insertsToBeRolledBack, - [{a: 4}, {a: 5}]); + [{a: 4}, {a: 5}, {a: 6}]); // Rollback to the collection scan phase. RollbackResumableIndexBuildTest.run(rollbackTest, @@ -46,11 +41,12 @@ RollbackResumableIndexBuildTest.run(rollbackTest, coll.getName(), {a: 1}, rollbackStartFailPointName, - {iteration: 0}, + {iteration: 1}, "hangIndexBuildDuringCollectionScanPhaseBeforeInsertion", - {fieldsToMatch: {a: 2}}, + {iteration: 1}, + "hangDuringIndexBuildDrainYield", insertsToBeRolledBack, - [{a: 6}, {a: 7}]); + [{a: 7}, {a: 8}, {a: 9}]); // Rollback to the bulk load phase. RollbackResumableIndexBuildTest.run(rollbackTest, @@ -58,26 +54,25 @@ RollbackResumableIndexBuildTest.run(rollbackTest, coll.getName(), {a: 1}, rollbackStartFailPointName, - {iteration: 0}, + {iteration: 1}, "hangIndexBuildDuringBulkLoadPhase", {iteration: 1}, + "hangDuringIndexBuildDrainYield", insertsToBeRolledBack, - [{a: 8}, {a: 9}]); + [{a: 10}, {a: 11}, {a: 12}]); -// Rollback to earlier in the drain writes phase. We set maxIndexBuildDrainBatchSize to 1 so that -// the primary can step down between iterations. -assert.commandWorked( - rollbackTest.getPrimary().adminCommand({setParameter: 1, maxIndexBuildDrainBatchSize: 1})); +// Rollback to earlier in the drain writes phase. 
RollbackResumableIndexBuildTest.run(rollbackTest, dbName, coll.getName(), {a: 1}, rollbackStartFailPointName, - {iteration: 2}, + {iteration: 3}, "hangIndexBuildDuringDrainWritesPhaseSecond", - {iteration: 0}, + {iteration: 1}, + "hangDuringIndexBuildDrainYield", insertsToBeRolledBack, - [{a: 10}, {a: 11}, {a: 12}]); + [{a: 13}, {a: 14}, {a: 15}, {a: 16}, {a: 17}]); rollbackTest.stop(); })(); diff --git a/jstests/replsets/rslib.js b/jstests/replsets/rslib.js index 65567450c96..b53a5030d42 100644 --- a/jstests/replsets/rslib.js +++ b/jstests/replsets/rslib.js @@ -147,7 +147,7 @@ reconnect = function(conn) { }; getLatestOp = function(server) { - server.getDB("admin").getMongo().setSlaveOk(); + server.getDB("admin").getMongo().setSecondaryOk(); var log = server.getDB("local")['oplog.rs']; var cursor = log.find({}).sort({'$natural': -1}).limit(1); if (cursor.hasNext()) { @@ -157,7 +157,7 @@ getLatestOp = function(server) { }; getLeastRecentOp = function({server, readConcern}) { - server.getDB("admin").getMongo().setSlaveOk(); + server.getDB("admin").getMongo().setSecondaryOk(); const oplog = server.getDB("local").oplog.rs; const cursor = oplog.find().sort({$natural: 1}).limit(1).readConcern(readConcern); if (cursor.hasNext()) { diff --git a/jstests/replsets/server8070.js b/jstests/replsets/server8070.js index 876a768fd7a..5bc4fd8f60d 100644 --- a/jstests/replsets/server8070.js +++ b/jstests/replsets/server8070.js @@ -36,8 +36,8 @@ replSet.initiate({ // set up common points of access var master = replSet.getPrimary(); var primary = master.getDB("foo"); -replSet.nodes[1].setSlaveOk(); -replSet.nodes[2].setSlaveOk(); +replSet.nodes[1].setSecondaryOk(); +replSet.nodes[2].setSecondaryOk(); var member2 = replSet.nodes[1].getDB("admin"); var member3 = replSet.nodes[2].getDB("admin"); diff --git a/jstests/replsets/slavedelay3.js b/jstests/replsets/slavedelay3.js index 9d09fa4486c..1d12d22912b 100644 --- a/jstests/replsets/slavedelay3.js +++ b/jstests/replsets/slavedelay3.js @@ 
-17,7 +17,7 @@ var secondaryConns = replTest.getSecondaries(); var secondaries = []; for (var i in secondaryConns) { var d = secondaryConns[i].getDB(name); - d.getMongo().setSlaveOk(); + d.getMongo().setSecondaryOk(); secondaries.push(d); } diff --git a/jstests/replsets/slaveok_read_pref.js b/jstests/replsets/slaveok_read_pref.js index 9fc11600e8d..e35f36e9c97 100644 --- a/jstests/replsets/slaveok_read_pref.js +++ b/jstests/replsets/slaveok_read_pref.js @@ -1,5 +1,5 @@ -// Test that slaveOk is implicitly allowed for queries on a secondary with a read preference other -// than 'primary', and that queries which do have 'primary' read preference fail. +// Test that secondaryOk is implicitly allowed for queries on a secondary with a read preference +// other than 'primary', and that queries which do have 'primary' read preference fail. (function() { "use strict"; @@ -28,18 +28,18 @@ const secDB = rst.getSecondary().getDB(jsTestName()); for (let readMode of ["commands", "legacy"]) { for (let readPref of readPrefs) { - for (let slaveOk of [true, false]) { - const testType = {readMode: readMode, readPref: readPref, slaveOk: slaveOk}; + for (let secondaryOk of [true, false]) { + const testType = {readMode: readMode, readPref: readPref, secondaryOk: secondaryOk}; secDB.getMongo().forceReadMode(readMode); - secDB.getMongo().setSlaveOk(slaveOk); + secDB.getMongo().setSecondaryOk(secondaryOk); const cursor = (readPref ? secDB.test.find().readPref(readPref) : secDB.test.find()); - if (readPref === "primary" || (!readPref && !slaveOk)) { + if (readPref === "primary" || (!readPref && !secondaryOk)) { // Attempting to run the query throws an error of type NotPrimaryNoSecondaryOk. 
- const slaveOkErr = assert.throws(() => cursor.itcount(), [], tojson(testType)); - assert.commandFailedWithCode(slaveOkErr, ErrorCodes.NotPrimaryNoSecondaryOk); + const secondaryOkErr = assert.throws(() => cursor.itcount(), [], tojson(testType)); + assert.commandFailedWithCode(secondaryOkErr, ErrorCodes.NotPrimaryNoSecondaryOk); } else { // Succeeds for all non-primary readPrefs, and for no readPref iff slaveOk. const docCount = assert.doesNotThrow(() => cursor.itcount(), [], tojson(testType)); @@ -51,7 +51,7 @@ for (let readMode of ["commands", "legacy"]) { function assertNotPrimaryNoSecondaryOk(func) { secDB.getMongo().forceReadMode("commands"); - secDB.getMongo().setSlaveOk(false); + secDB.getMongo().setSecondaryOk(false); secDB.getMongo().setReadPref("primary"); const res = assert.throws(func); assert.commandFailedWithCode(res, ErrorCodes.NotPrimaryNoSecondaryOk); @@ -59,7 +59,7 @@ function assertNotPrimaryNoSecondaryOk(func) { // Test that agg with $out/$merge and non-inline mapReduce fail with 'NotPrimaryNoSecondaryOk' when // directed at a secondary with "primary" read preference. -const secondaryColl = secDB.slaveok_read_pref; +const secondaryColl = secDB.secondaryok_read_pref; assertNotPrimaryNoSecondaryOk(() => secondaryColl.aggregate([{$out: "target"}]).itcount()); assertNotPrimaryNoSecondaryOk( () => diff --git a/jstests/replsets/startup_without_fcv_document_succeeds_if_initial_sync_flag_set.js b/jstests/replsets/startup_without_fcv_document_succeeds_if_initial_sync_flag_set.js index 690151796b3..ec537f873fd 100644 --- a/jstests/replsets/startup_without_fcv_document_succeeds_if_initial_sync_flag_set.js +++ b/jstests/replsets/startup_without_fcv_document_succeeds_if_initial_sync_flag_set.js @@ -35,7 +35,7 @@ rst.awaitSecondaryNodes(); // Get the new secondary connection. 
secondary = rst.getSecondary(); -secondary.setSlaveOk(true); +secondary.setSecondaryOk(); const secondaryAdminDb = secondary.getDB("admin"); // Assert that the FCV document was cloned through initial sync on the secondary. diff --git a/jstests/replsets/step_down_on_secondary.js b/jstests/replsets/step_down_on_secondary.js index 64fcf73c3a2..5e8933d9017 100644 --- a/jstests/replsets/step_down_on_secondary.js +++ b/jstests/replsets/step_down_on_secondary.js @@ -93,7 +93,7 @@ jsTestLog("Do a read that hits a prepare conflict on the old primary"); const wTPrintPrepareConflictLogFailPoint = configureFailPoint(primary, "WTPrintPrepareConflictLog"); const joinReadThread = startParallelShell(() => { - db.getMongo().setSlaveOk(true); + db.getMongo().setSecondaryOk(); oldPrimaryDB = db.getSiblingDB(TestData.dbName); assert.commandFailedWithCode(oldPrimaryDB.runCommand({ diff --git a/jstests/replsets/tenant_migration_donor_state_machine.js b/jstests/replsets/tenant_migration_donor_state_machine.js index 7f21efef1c6..f626ce5d5b9 100644 --- a/jstests/replsets/tenant_migration_donor_state_machine.js +++ b/jstests/replsets/tenant_migration_donor_state_machine.js @@ -154,9 +154,6 @@ configDonorsColl.createIndex({expireAt: 1}, {expireAfterSeconds: 0}); jsTest.log("Test the case where the migration aborts"); const migrationId = UUID(); - let configDonorsColl = donorPrimary.getCollection(kConfigDonorsNS); - configDonorsColl.createIndex({expireAt: 1}, {expireAfterSeconds: 0}); - let abortFp = configureFailPoint(donorPrimary, "abortTenantMigrationAfterBlockingStarts"); assert.commandFailedWithCode(donorPrimary.adminCommand({ donorStartMigration: 1, @@ -187,6 +184,29 @@ configDonorsColl.createIndex({expireAt: 1}, {expireAfterSeconds: 0}); testDonorForgetMigration(donorRst, recipientRst, migrationId, kDBPrefix); })(); +// Drop the TTL index to make sure that the migration state is still available when the +// donorForgetMigration command is retried. 
+configDonorsColl.dropIndex({expireAt: 1}); + +(() => { + jsTest.log("Test that donorForgetMigration can be run multiple times"); + const migrationId = UUID(); + + assert.commandWorked(donorPrimary.adminCommand({ + donorStartMigration: 1, + migrationId: migrationId, + recipientConnectionString: kRecipientConnString, + databasePrefix: kDBPrefix, + readPreference: {mode: "primary"} + })); + + assert.commandWorked( + donorPrimary.adminCommand({donorForgetMigration: 1, migrationId: migrationId})); + + assert.commandWorked( + donorPrimary.adminCommand({donorForgetMigration: 1, migrationId: migrationId})); +})(); + donorRst.stopSet(); recipientRst.stopSet(); })(); diff --git a/jstests/replsets/transactions_only_allowed_on_primaries.js b/jstests/replsets/transactions_only_allowed_on_primaries.js index 7b71cf3eb67..e987eda0205 100644 --- a/jstests/replsets/transactions_only_allowed_on_primaries.js +++ b/jstests/replsets/transactions_only_allowed_on_primaries.js @@ -27,8 +27,8 @@ replTest.initiate(config); const primary = replTest.getPrimary(); const secondary = replTest.getSecondary(); -// Set slaveOk=true so that normal read commands would be allowed on the secondary. -secondary.setSlaveOk(true); +// Set secondaryOk=true so that normal read commands would be allowed on the secondary. +secondary.setSecondaryOk(); // Create a test collection that we can run commands against. const primaryDB = primary.getDB(dbName); diff --git a/jstests/sharding/agg_mongos_slaveok.js b/jstests/sharding/agg_mongos_slaveok.js index 287902092bc..01fb4286429 100644 --- a/jstests/sharding/agg_mongos_slaveok.js +++ b/jstests/sharding/agg_mongos_slaveok.js @@ -1,5 +1,5 @@ /** - * Tests aggregate command against mongos with slaveOk. For more tests on read preference, + * Tests aggregate command against mongos with secondaryOk. For more tests on read preference, * please refer to jstests/sharding/read_pref_cmd.js. 
* @tags: [ * requires_replication, @@ -21,12 +21,12 @@ var doTest = function(st, doSharded) { } testDB.user.insert({x: 10}, {writeConcern: {w: NODES}}); - testDB.setSlaveOk(true); + testDB.setSecondaryOk(); var secNode = st.rs0.getSecondary(); secNode.getDB('test').setProfilingLevel(2); - // wait for mongos to recognize that the slave is up + // wait for mongos to recognize that the secondary is up awaitRSClientHosts(st.s, secNode, {ok: true}); var res = testDB.runCommand({aggregate: 'user', pipeline: [{$project: {x: 1}}], cursor: {}}); diff --git a/jstests/sharding/all_shard_and_config_hosts_brought_down_one_by_one.js b/jstests/sharding/all_shard_and_config_hosts_brought_down_one_by_one.js index b0bd0f59e8c..a11f8dbc694 100644 --- a/jstests/sharding/all_shard_and_config_hosts_brought_down_one_by_one.js +++ b/jstests/sharding/all_shard_and_config_hosts_brought_down_one_by_one.js @@ -39,7 +39,7 @@ jsTest.log('Config nodes up: 1 of 3, shard nodes up: 1 of 2: ' + 'Only queries will work (no shard primary)'); st.rs0.stop(0); st.restartMongos(0); -st.s0.setSlaveOk(true); +st.s0.setSecondaryOk(); assert.eq([{_id: 0, count: 3}], st.s0.getDB('TestDB').TestColl.find().toArray()); jsTest.log('Config nodes up: 1 of 3, shard nodes up: 0 of 2: ' + diff --git a/jstests/sharding/auth_repl.js b/jstests/sharding/auth_repl.js index cd89c91f136..b806090fc3a 100644 --- a/jstests/sharding/auth_repl.js +++ b/jstests/sharding/auth_repl.js @@ -19,7 +19,7 @@ var testColl = testDB.user; // before setting up authentication assert.commandWorked(adminDB.runCommand({replSetGetStatus: 1})); -conn.setSlaveOk(); +conn.setSecondaryOk(); assert.commandWorked(adminDB.runCommand({replSetGetStatus: 1})); // Add admin user using direct connection to primary to simulate connection from remote host @@ -38,19 +38,19 @@ assert.eq(1, testDB.auth('a', 'a')); jsTest.log('Sending an authorized query that should be ok'); assert.commandWorked(testColl.insert({x: 1}, {writeConcern: {w: nodeCount}})); 
-conn.setSlaveOk(true); +conn.setSecondaryOk(); doc = testColl.findOne(); assert(doc != null); doc = testColl.find().readPref('secondary').next(); assert(doc != null); -conn.setSlaveOk(false); +conn.setSecondaryOk(false); doc = testColl.findOne(); assert(doc != null); var queryToPriShouldFail = function() { - conn.setSlaveOk(false); + conn.setSecondaryOk(false); assert.throws(function() { testColl.findOne(); @@ -63,7 +63,7 @@ var queryToPriShouldFail = function() { }; var queryToSecShouldFail = function() { - conn.setSlaveOk(true); + conn.setSecondaryOk(); assert.throws(function() { testColl.findOne(); @@ -104,7 +104,7 @@ queryToPriShouldFail(); assert.eq(1, testDB.auth('a', 'a')); // Find out the current cached secondary in the repl connection -conn.setSlaveOk(true); +conn.setSecondaryOk(); var serverInfo = testColl.find().readPref('secondary').explain().serverInfo; var secNodeIdx = -1; var secPortStr = serverInfo.port.toString(); diff --git a/jstests/sharding/auth_slaveok_routing.js b/jstests/sharding/auth_slaveok_routing.js index 8eff7833c9b..1e573fc7c9e 100644 --- a/jstests/sharding/auth_slaveok_routing.js +++ b/jstests/sharding/auth_slaveok_routing.js @@ -1,5 +1,5 @@ /** - * This tests whether slaveOk reads are properly routed through mongos in + * This tests whether secondaryOk reads are properly routed through mongos in * an authenticated environment. This test also includes restarting the * entire set, then querying afterwards. * @@ -59,11 +59,11 @@ priAdminDB.createUser({user: 'user', pwd: 'password', roles: jsTest.adminUserRol {w: 3, wtimeout: 30000}); coll.drop(); -coll.setSlaveOk(true); +coll.setSecondaryOk(); /* Secondaries should be up here, but they can still be in RECOVERY * state, which will make the ReplicaSetMonitor mark them as - * ok = false and not eligible for slaveOk queries. + * ok = false and not eligible for secondaryOk queries. 
*/ awaitRSClientHosts(mongos, replTest.getSecondaries(), {ok: true, secondary: true}); @@ -90,7 +90,7 @@ for (var n = 0; n < nodeCount; n++) { replTest.awaitSecondaryNodes(); -coll.setSlaveOk(true); +coll.setSecondaryOk(); /* replSetMonitor does not refresh the nodes information when getting secondaries. * A node that is previously labeled as secondary can now be a primary, so we diff --git a/jstests/sharding/autodiscover_config_rs_from_secondary.js b/jstests/sharding/autodiscover_config_rs_from_secondary.js index 9d9bd4adbd5..cc6ca3c11ae 100644 --- a/jstests/sharding/autodiscover_config_rs_from_secondary.js +++ b/jstests/sharding/autodiscover_config_rs_from_secondary.js @@ -53,7 +53,7 @@ var mongos = MongoRunner.runMongos({configdb: seedList}); rst.stop(1); var admin = mongos.getDB('admin'); -mongos.setSlaveOk(true); +mongos.setSecondaryOk(); assert.eq(1, admin.foo.findOne().a); MongoRunner.stopMongos(mongos); rst.stopSet(); diff --git a/jstests/sharding/balance_repl.js b/jstests/sharding/balance_repl.js index 83c92ff37b1..fb501c979cb 100644 --- a/jstests/sharding/balance_repl.js +++ b/jstests/sharding/balance_repl.js @@ -44,7 +44,7 @@ var collPrimary = (new Mongo(s.s0.host)).getDB('TestDB').TestColl; assert.eq(2100, collPrimary.find().itcount()); var collSlaveOk = (new Mongo(s.s0.host)).getDB('TestDB').TestColl; -collSlaveOk.setSlaveOk(); +collSlaveOk.setSecondaryOk(); assert.eq(2100, collSlaveOk.find().itcount()); assert.commandWorked(s.s0.adminCommand({ diff --git a/jstests/sharding/chunk_history_window.js b/jstests/sharding/chunk_history_window.js index 1be21395483..adc2ca7247a 100644 --- a/jstests/sharding/chunk_history_window.js +++ b/jstests/sharding/chunk_history_window.js @@ -21,17 +21,29 @@ load("jstests/sharding/libs/sharded_transactions_helpers.js"); -const configHistoryWindowSecs = 10; +// The snapshot window is the max of minSnapshotHistoryWindowInSeconds and +// transactionLifetimeLimitSeconds. 
+const transactionLifetimeLimitSecs = 15; +const minSnapshotHistoryWindowSecs = transactionLifetimeLimitSecs; +const snapshotHistoryWindowSecs = + Math.max(minSnapshotHistoryWindowSecs, transactionLifetimeLimitSecs); + const st = new ShardingTest({ shards: {rs0: {nodes: 2}, rs1: {nodes: 2}}, other: { configOptions: { setParameter: { - minSnapshotHistoryWindowInSeconds: configHistoryWindowSecs, + minSnapshotHistoryWindowInSeconds: minSnapshotHistoryWindowSecs, + transactionLifetimeLimitSeconds: transactionLifetimeLimitSecs, logComponentVerbosity: tojson({sharding: {verbosity: 2}}) } }, - rsOptions: {setParameter: {minSnapshotHistoryWindowInSeconds: 600}} + rsOptions: { + setParameter: { + minSnapshotHistoryWindowInSeconds: minSnapshotHistoryWindowSecs, + transactionLifetimeLimitSeconds: transactionLifetimeLimitSecs, + } + } } }); @@ -40,14 +52,14 @@ assert.eq(assert .commandWorked( primaryAdmin.runCommand({getParameter: 1, minSnapshotHistoryWindowInSeconds: 1})) .minSnapshotHistoryWindowInSeconds, - 600); + minSnapshotHistoryWindowSecs); const configAdmin = st.configRS.getPrimary().getDB("admin"); assert.eq(assert .commandWorked( configAdmin.runCommand({getParameter: 1, minSnapshotHistoryWindowInSeconds: 1})) .minSnapshotHistoryWindowInSeconds, - 10); + minSnapshotHistoryWindowSecs); const mongosDB = st.s.getDB(jsTestName()); const mongosColl = mongosDB.test; @@ -81,9 +93,9 @@ assert.eq(2, chunk.history.length, tojson(chunk)); // Test history window with 1s margin. const testMarginMS = 1000; -// Test that reading from a snapshot at insertTS is valid for up to configHistoryWindowSecs +// Test that reading from a snapshot at insertTS is valid for up to snapshotHistoryWindowSecs // minus the testMarginMS (as a buffer). 
-const testWindowMS = configHistoryWindowSecs * 1000 - testMarginMS; +const testWindowMS = snapshotHistoryWindowSecs * 1000 - testMarginMS; while (Date.now() - 1000 * insertTS.getTime() < testWindowMS) { // Test that reading from a snapshot at insertTS is still valid. assert.commandWorked(mongosDB.runCommand( @@ -95,7 +107,7 @@ while (Date.now() - 1000 * insertTS.getTime() < testWindowMS) { } // Sleep until our most recent chunk move is before the oldest history in our window. -const chunkExpirationTime = postMoveChunkTime + configHistoryWindowSecs * 1000; +const chunkExpirationTime = postMoveChunkTime + snapshotHistoryWindowSecs * 1000; sleep(chunkExpirationTime + testMarginMS - Date.now()); jsTestLog("Move chunk back to shard 0 to trigger history cleanup"); diff --git a/jstests/sharding/chunk_operations_invalidate_single_shard.js b/jstests/sharding/chunk_operations_invalidate_single_shard.js index e660cec2305..30a736fcdea 100644 --- a/jstests/sharding/chunk_operations_invalidate_single_shard.js +++ b/jstests/sharding/chunk_operations_invalidate_single_shard.js @@ -52,6 +52,7 @@ let testSplit = () => { const mongosCollectionVersion = getMongosCollVersion(ns); assert.commandWorked(st.s.adminCommand({split: ns, middle: {x: -500}})); + assert.eq(mongosCollectionVersion, getMongosCollVersion(ns)); testColl.findOne({x: 0}); testColl.findOne({x: 1000}); diff --git a/jstests/sharding/cluster_create_indexes_always_routes_through_primary.js b/jstests/sharding/cluster_create_indexes_always_routes_through_primary.js index 6c661e0abac..6b61bd12a68 100644 --- a/jstests/sharding/cluster_create_indexes_always_routes_through_primary.js +++ b/jstests/sharding/cluster_create_indexes_always_routes_through_primary.js @@ -1,5 +1,5 @@ // Ensure that a call to createIndexes in a sharded cluster will route to the primary, even when -// setSlaveOk() is set to true. +// setSecondaryOk() is set to true. 
(function() { 'use strict'; @@ -12,7 +12,7 @@ assert.commandWorked(testDB.adminCommand({enableSharding: testDBName})); assert.commandWorked( testDB.adminCommand({shardCollection: testDB[collName].getFullName(), key: {x: 1}})); -st.s.setSlaveOk(true); +st.s.setSecondaryOk(); assert.commandWorked( testDB.runCommand({createIndexes: collName, indexes: [{key: {a: 1}, name: "index"}]})); diff --git a/jstests/sharding/config_rs_no_primary.js b/jstests/sharding/config_rs_no_primary.js index 91ce74de45d..8bcf7e54cd4 100644 --- a/jstests/sharding/config_rs_no_primary.js +++ b/jstests/sharding/config_rs_no_primary.js @@ -43,9 +43,9 @@ var testOps = function(mongos) { assert.throws(function() { mongos.getDB('config').shards.findOne(); }); - mongos.setSlaveOk(true); + mongos.setSecondaryOk(); var shardDoc = mongos.getDB('config').shards.findOne(); - mongos.setSlaveOk(false); + mongos.setSecondaryOk(false); assert.neq(null, shardDoc); jsTestLog("Doing ops that require metadata writes and thus should fail against: " + mongos); diff --git a/jstests/sharding/count_config_servers.js b/jstests/sharding/count_config_servers.js index ded75607cd0..0904a873e52 100644 --- a/jstests/sharding/count_config_servers.js +++ b/jstests/sharding/count_config_servers.js @@ -13,7 +13,7 @@ TestData.skipCheckOrphans = true; "use strict"; var st = new ShardingTest({name: 'sync_conn_cmd', shards: 0}); -st.s.setSlaveOk(true); +st.s.setSecondaryOk(); var configDB = st.config; var coll = configDB.test; diff --git a/jstests/sharding/count_slaveok.js b/jstests/sharding/count_slaveok.js index e527128a7cd..23612d96220 100644 --- a/jstests/sharding/count_slaveok.js +++ b/jstests/sharding/count_slaveok.js @@ -1,5 +1,5 @@ /** - * Tests count and distinct using slaveOk. Also tests a scenario querying a set where only one + * Tests count and distinct using secondaryOk. Also tests a scenario querying a set where only one * secondary is up. 
*/ @@ -20,7 +20,7 @@ var rst = st.rs0; // Insert data into replica set var conn = new Mongo(st.s.host); -var coll = conn.getCollection('test.countSlaveOk'); +var coll = conn.getCollection('test.countSecondaryOk'); coll.drop(); var bulk = coll.initializeUnorderedBulkOp(); @@ -51,9 +51,9 @@ awaitRSClientHosts(conn, sec, {ok: true, secondary: true}); // Make sure that mongos realizes that primary is already down awaitRSClientHosts(conn, primary, {ok: false}); -// Need to check slaveOk=true first, since slaveOk=false will destroy conn in pool when +// Need to check secondaryOk=true first, since secondaryOk=false will destroy conn in pool when // master is down -conn.setSlaveOk(); +conn.setSecondaryOk(); // count using the command path assert.eq(30, coll.find({i: 0}).count()); @@ -62,14 +62,14 @@ assert.eq(30, coll.find({i: 0}).itcount()); assert.eq(10, coll.distinct("i").length); try { - conn.setSlaveOk(false); - // Should throw exception, since not slaveOk'd + conn.setSecondaryOk(false); + // Should throw exception, since not secondaryOk'd coll.find({i: 0}).count(); print("Should not reach here!"); assert(false); } catch (e) { - print("Non-slaveOk'd connection failed."); + print("Non-secondaryOk'd connection failed."); } st.stop(); diff --git a/jstests/sharding/error_propagation.js b/jstests/sharding/error_propagation.js index 6f47075f753..6fa9b7da74c 100644 --- a/jstests/sharding/error_propagation.js +++ b/jstests/sharding/error_propagation.js @@ -8,7 +8,7 @@ var st = new ShardingTest({mongos: 1, shards: 1, rs: {nodes: 3}}); var db = st.getDB('test'); -db.setSlaveOk(true); +db.setSecondaryOk(); assert.commandWorked(db.foo.insert({a: 1}, {writeConcern: {w: 3}})); assert.commandWorked(db.runCommand( diff --git a/jstests/sharding/mongos_forwards_api_parameters_to_shards.js b/jstests/sharding/mongos_forwards_api_parameters_to_shards.js new file mode 100644 index 00000000000..e611f716992 --- /dev/null +++ b/jstests/sharding/mongos_forwards_api_parameters_to_shards.js @@ 
-0,0 +1,213 @@ +/** + * When a client calls a mongos command with API parameters, mongos must forward them to shards. + * + * @tags: [multiversion_incompatible] + */ + +(function() { +'use strict'; + +load('jstests/sharding/libs/sharded_transactions_helpers.js'); + +let st = new ShardingTest({ + mongos: 1, + shards: 2, + rs: {nodes: 1, setParameter: {logComponentVerbosity: tojson({command: {verbosity: 2}})}} +}); + +class APIParameterTest { + constructor( + command, + {dbName = "db", inAPIVersion1 = true, permittedInTxn = true, shardCommandName} = {}) { + this.command = command; + this.dbName = dbName; + this.inAPIVersion1 = inAPIVersion1; + this.permittedInTxn = permittedInTxn; + if (shardCommandName === undefined) { + this.commandName = Object.keys(command)[0]; + } else { + // mongos executes a different command on the shards, e.g. mapReduce becomes aggregate. + this.commandName = shardCommandName; + } + } +} + +const tests = [ + // Write commands. Note, these rely on _id 1 residing on shard 0. + new APIParameterTest({insert: "collection", documents: [{_id: 1}]}), + new APIParameterTest({update: "collection", updates: [{q: {_id: 1}, u: {$set: {x: 1}}}]}), + new APIParameterTest({delete: "collection", deletes: [{q: {_id: 1}, limit: 1}]}), + + // Read commands. 
+ new APIParameterTest({aggregate: "collection", pipeline: [], cursor: {}}), + new APIParameterTest({aggregate: "collection", pipeline: [], cursor: {}, explain: true}, + {shardCommandName: "explain", permittedInTxn: false}), + new APIParameterTest({find: "collection"}), + new APIParameterTest({count: "collection"}, {permittedInTxn: false}), + new APIParameterTest({count: "collection", query: {_id: {$lt: 0}}}, + {inAPIVersion1: false, permittedInTxn: false}), + new APIParameterTest({distinct: "collection", key: "_id"}, + {inAPIVersion1: false, permittedInTxn: false}), + new APIParameterTest( + { + mapReduce: "collection", + map: function() { + emit(1, 1); + }, + reduce: function(key, values) { + return {count: values.length}; + }, + out: {inline: 1} + }, + {inAPIVersion1: false, permittedInTxn: false, shardCommandName: "aggregate"}), + + // FindAndModify. + new APIParameterTest({findAndModify: "collection", query: {_id: 1}, remove: true}), + + // DDL. Order matters: we must create, modify, then drop an index on collection2. + new APIParameterTest({createIndexes: "collection2", indexes: [{key: {x: 1}, name: "x_1"}]}), + new APIParameterTest({collMod: "collection2", index: {keyPattern: {x: 1}, hidden: true}}, + {permittedInTxn: false}), + new APIParameterTest({dropIndexes: "collection2", index: "x_1"}, {permittedInTxn: false}), + // We can create indexes on a non-existent collection in a sharded transaction. 
+ new APIParameterTest({create: "newCollection"}), + new APIParameterTest({renameCollection: "db.newCollection", to: "db.newerCollection"}, + {inAPIVersion1: false, permittedInTxn: false, dbName: "admin"}), + new APIParameterTest({drop: "collection"}, {permittedInTxn: false}), + new APIParameterTest({dropDatabase: 1}, {permittedInTxn: false}), +]; + +function checkPrimaryLog(conn, commandName, apiVersion, apiStrict, apiDeprecationErrors, message) { + const logs = checkLog.getGlobalLog(conn); + let lastCommandInvocation; + + for (let logMsg of logs) { + const obj = JSON.parse(logMsg); + // Search for "About to run the command" logs. + if (obj.id !== 21965) { + continue; + } + + const args = obj.attr.commandArgs; + if (commandName !== Object.keys(args)[0]) { + continue; + } + + lastCommandInvocation = args; + if (args.apiVersion !== apiVersion || args.apiStrict !== apiStrict || + args.apiDeprecationErrors !== apiDeprecationErrors) { + continue; + } + + // Found a match. + return; + } + + if (lastCommandInvocation === undefined) { + doassert(`Primary didn't log ${commandName}`); + return; + } + + doassert(`Primary didn't log ${message}, last invocation of ${commandName} was` + + ` ${tojson(lastCommandInvocation)}`); +} + +for (const sharded of [false, true]) { + for (const [apiVersion, apiStrict, apiDeprecationErrors] of [[undefined, undefined, undefined], + ["1", undefined, undefined], + ["1", undefined, false], + ["1", undefined, true], + ["1", false, undefined], + ["1", false, false], + ["1", false, true], + ["1", true, undefined], + ["1", true, false], + ["1", true, true], + ]) { + for (let inTransaction of [false, true]) { + if (sharded) { + jsTestLog("Sharded setup"); + assert.commandWorked(st.s.getDB("db")["collection"].insert( + {_id: 0}, {writeConcern: {w: "majority"}})); + assert.commandWorked(st.s.getDB("db")["collection"].insert( + {_id: 20}, {writeConcern: {w: "majority"}})); + + assert.commandWorked(st.s.adminCommand({enableSharding: "db"})); + 
st.ensurePrimaryShard("db", st.shard0.shardName); + assert.commandWorked( + st.s.adminCommand({shardCollection: "db.collection", key: {_id: 1}})); + + // The chunk with _id 1 is on shard 0. + assert.commandWorked( + st.s.adminCommand({split: "db.collection", middle: {_id: 10}})); + assert.commandWorked(st.s.adminCommand( + {moveChunk: "db.collection", find: {_id: 20}, to: st.shard1.shardName})); + } else { + jsTestLog("Unsharded setup"); + assert.commandWorked(st.s.getDB("db")["collection"].insert( + {_id: 0}, {writeConcern: {w: "majority"}})); + st.ensurePrimaryShard("db", st.shard0.shardName); + } + + // Shard 0's primary. + const primary = st.rs0.getPrimary(); + + for (const test of tests) { + if (inTransaction && !test.permittedInTxn) { + continue; + } + + if (apiStrict && !test.inAPIVersion1) { + continue; + } + + // Make a copy of the test's command body, and set its API parameters. + const commandWithAPIParams = Object.assign({}, test.command); + if (apiVersion !== undefined) { + commandWithAPIParams.apiVersion = apiVersion; + } + + if (apiStrict !== undefined) { + commandWithAPIParams.apiStrict = apiStrict; + } + + if (apiDeprecationErrors !== undefined) { + commandWithAPIParams.apiDeprecationErrors = apiDeprecationErrors; + } + + assert.commandWorked(primary.adminCommand({clearLog: "global"})); + const message = `command ${tojson(commandWithAPIParams)}` + + ` ${sharded ? "sharded" : "unsharded"},` + + ` ${inTransaction ? 
"in" : "outside"} transaction`; + + flushRoutersAndRefreshShardMetadata(st, {ns: "db.collection"}); + + jsTestLog(`Running ${message}`); + + if (inTransaction) { + const session = st.s0.startSession(); + const sessionDb = session.getDatabase(test.dbName); + session.startTransaction(); + assert.commandWorked(sessionDb.runCommand(commandWithAPIParams)); + assert.commandWorked(session.commitTransaction_forTesting()); + } else { + const db = st.s0.getDB(test.dbName); + assert.commandWorked(db.runCommand(commandWithAPIParams)); + } + + checkPrimaryLog(primary, + test.commandName, + apiVersion, + apiStrict, + apiDeprecationErrors, + message); + } + + jsTestLog("JS test cleanup: Drop database 'db'"); + st.s0.getDB("db").runCommand({dropDatabase: 1}); + } + } +} + +st.stop(); +})(); diff --git a/jstests/sharding/mongos_rs_auth_shard_failure_tolerance.js b/jstests/sharding/mongos_rs_auth_shard_failure_tolerance.js index 5cb277197b1..466c4314d45 100644 --- a/jstests/sharding/mongos_rs_auth_shard_failure_tolerance.js +++ b/jstests/sharding/mongos_rs_auth_shard_failure_tolerance.js @@ -160,9 +160,9 @@ gc(); // Clean up new connections jsTest.log("Stopping primary of second shard..."); -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); mongosConnIdle = authDBUsers(new Mongo(mongos.host)); -mongosConnIdle.setSlaveOk(); +mongosConnIdle.setSecondaryOk(); // Need to save this node for later var rs1Secondary = st.rs1.getSecondary(); @@ -192,13 +192,13 @@ assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne( jsTest.log("Testing new connections with second primary down..."); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, 
mongosConnNew.getCollection(collSharded.toString()).findOne({_id: 1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); @@ -212,9 +212,9 @@ gc(); // Clean up new connections jsTest.log("Stopping primary of first shard..."); -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); mongosConnIdle = authDBUsers(new Mongo(mongos.host)); -mongosConnIdle.setSlaveOk(); +mongosConnIdle.setSecondaryOk(); st.rs0.stop(st.rs0.getPrimary()); @@ -241,13 +241,13 @@ assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne( jsTest.log("Testing new connections with first primary down..."); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: 1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); @@ -261,9 +261,9 @@ gc(); // Clean up new connections jsTest.log("Stopping second shard..."); -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); mongosConnIdle = authDBUsers(new Mongo(mongos.host)); -mongosConnIdle.setSlaveOk(); +mongosConnIdle.setSecondaryOk(); st.rs1.stop(rs1Secondary); @@ -288,10 +288,10 @@ assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne( jsTest.log("Testing new connections with second shard down..."); mongosConnNew = authDBUsers(new 
Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); mongosConnNew = authDBUsers(new Mongo(mongos.host)); diff --git a/jstests/sharding/mongos_rs_shard_failure_tolerance.js b/jstests/sharding/mongos_rs_shard_failure_tolerance.js index 34d68c45f6e..89dc4c07986 100644 --- a/jstests/sharding/mongos_rs_shard_failure_tolerance.js +++ b/jstests/sharding/mongos_rs_shard_failure_tolerance.js @@ -131,11 +131,11 @@ st.rs1.stop(st.rs1.getPrimary()); jsTest.log("Testing active connection with second primary down..."); // Reads with read prefs -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: -1})); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: 1})); assert.neq(null, mongosConnActive.getCollection(collUnsharded.toString()).findOne({_id: 1})); -mongosConnActive.setSlaveOk(false); +mongosConnActive.setSecondaryOk(false); mongosConnActive.setReadPref("primary"); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: -1})); @@ -145,14 +145,14 @@ assert.throws(function() { assert.neq(null, mongosConnActive.getCollection(collUnsharded.toString()).findOne({_id: 1})); // Ensure read prefs override slaveOK -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); mongosConnActive.setReadPref("primary"); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: -1})); assert.throws(function() { mongosConnActive.getCollection(collSharded.toString()).findOne({_id: 1}); }); assert.neq(null, mongosConnActive.getCollection(collUnsharded.toString()).findOne({_id: 
1})); -mongosConnActive.setSlaveOk(false); +mongosConnActive.setSecondaryOk(false); mongosConnActive.setReadPref("secondary"); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: -1})); @@ -187,11 +187,11 @@ assert.writeError(mongosConnIdle.getCollection(collSharded.toString()).insert({_ assert.commandWorked(mongosConnIdle.getCollection(collUnsharded.toString()).insert({_id: 6}, wc)); // Reads with read prefs -mongosConnIdle.setSlaveOk(); +mongosConnIdle.setSecondaryOk(); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: -1})); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: 1})); assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne({_id: 1})); -mongosConnIdle.setSlaveOk(false); +mongosConnIdle.setSecondaryOk(false); mongosConnIdle.setReadPref("primary"); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: -1})); @@ -201,14 +201,14 @@ assert.throws(function() { assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne({_id: 1})); // Ensure read prefs override slaveOK -mongosConnIdle.setSlaveOk(); +mongosConnIdle.setSecondaryOk(); mongosConnIdle.setReadPref("primary"); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: -1})); assert.throws(function() { mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: 1}); }); assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne({_id: 1})); -mongosConnIdle.setSlaveOk(false); +mongosConnIdle.setSecondaryOk(false); mongosConnIdle.setReadPref("secondary"); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: -1})); @@ -234,13 +234,13 @@ jsTest.log("Testing new connections with second primary down..."); // Reads with read prefs mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, 
mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: 1})); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); gc(); // Clean up new connections incrementally to compensate for slow win32 machine. @@ -261,17 +261,17 @@ gc(); // Clean up new connections incrementally to compensate for slow win32 ma // Ensure read prefs override slaveok mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); mongosConnNew.setReadPref("primary"); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); mongosConnNew.setReadPref("primary"); assert.throws(function() { mongosConnNew.getCollection(collSharded.toString()).findOne({_id: 1}); }); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); mongosConnNew.setReadPref("primary"); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); @@ -343,7 +343,7 @@ st.rs0.stop(st.rs0.getPrimary()); jsTest.log("Testing active connection with first primary down..."); -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: -1})); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: 1})); assert.neq(null, mongosConnActive.getCollection(collUnsharded.toString()).findOne({_id: 1})); @@ -358,7 +358,7 @@ assert.writeError(mongosConnIdle.getCollection(collSharded.toString()).insert({_ 
assert.writeError(mongosConnIdle.getCollection(collSharded.toString()).insert({_id: 9})); assert.writeError(mongosConnIdle.getCollection(collUnsharded.toString()).insert({_id: 9})); -mongosConnIdle.setSlaveOk(); +mongosConnIdle.setSecondaryOk(); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: -1})); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: 1})); assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne({_id: 1})); @@ -366,13 +366,13 @@ assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne( jsTest.log("Testing new connections with first primary down..."); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: 1})); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); mongosConnNew = new Mongo(mongos.host); @@ -392,7 +392,7 @@ st.rs1.stop(rs1Secondary); jsTest.log("Testing active connection with second shard down..."); -mongosConnActive.setSlaveOk(); +mongosConnActive.setSecondaryOk(); assert.neq(null, mongosConnActive.getCollection(collSharded.toString()).findOne({_id: -1})); assert.neq(null, mongosConnActive.getCollection(collUnsharded.toString()).findOne({_id: 1})); @@ -406,17 +406,17 @@ assert.writeError(mongosConnIdle.getCollection(collSharded.toString()).insert({_ assert.writeError(mongosConnIdle.getCollection(collSharded.toString()).insert({_id: 12})); assert.writeError(mongosConnIdle.getCollection(collUnsharded.toString()).insert({_id: 12})); -mongosConnIdle.setSlaveOk(); 
+mongosConnIdle.setSecondaryOk(); assert.neq(null, mongosConnIdle.getCollection(collSharded.toString()).findOne({_id: -1})); assert.neq(null, mongosConnIdle.getCollection(collUnsharded.toString()).findOne({_id: 1})); jsTest.log("Testing new connections with second shard down..."); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collSharded.toString()).findOne({_id: -1})); mongosConnNew = new Mongo(mongos.host); -mongosConnNew.setSlaveOk(); +mongosConnNew.setSecondaryOk(); assert.neq(null, mongosConnNew.getCollection(collUnsharded.toString()).findOne({_id: 1})); mongosConnNew = new Mongo(mongos.host); diff --git a/jstests/sharding/query/explain_read_pref.js b/jstests/sharding/query/explain_read_pref.js index ce5e2cf47af..c3c51d85756 100644 --- a/jstests/sharding/query/explain_read_pref.js +++ b/jstests/sharding/query/explain_read_pref.js @@ -58,7 +58,7 @@ var testAllModes = function(conn, isMongos) { var mode = args[0], tagSets = args[1], secExpected = args[2]; var testDB = conn.getDB('TestDB'); - conn.setSlaveOk(false); // purely rely on readPref + conn.setSecondaryOk(false); // purely rely on readPref jsTest.log('Testing mode: ' + mode + ', tag sets: ' + tojson(tagSets)); // .explain().find() diff --git a/jstests/sharding/read_pref.js b/jstests/sharding/read_pref.js index 95c0e9697c3..9267cb18430 100644 --- a/jstests/sharding/read_pref.js +++ b/jstests/sharding/read_pref.js @@ -134,7 +134,7 @@ var doTest = function(useDollarQuerySyntax) { var explainServer = getExplainServer(explain); assert.neq(primaryNode.name, explainServer); - conn.setSlaveOk(); + conn.setSecondaryOk(); // It should also work with slaveOk explain = getExplain("secondary"); diff --git a/jstests/sharding/read_pref_cmd.js b/jstests/sharding/read_pref_cmd.js index 2c2a7f3332b..f94dd924f45 100644 --- a/jstests/sharding/read_pref_cmd.js +++ b/jstests/sharding/read_pref_cmd.js @@ -165,7 +165,7 @@ let 
testConnReadPreference = function(conn, isMongos, rsNodes, {readPref, expect let testDB = conn.getDB(kDbName); let shardedColl = conn.getCollection(kShardedNs); - conn.setSlaveOk(false); // purely rely on readPref + conn.setSecondaryOk(false); // purely rely on readPref conn.setReadPref(readPref.mode, readPref.tagSets, readPref.hedge); /** @@ -387,7 +387,7 @@ let testCursorReadPreference = function(conn, isMongos, rsNodes, {readPref, expe tojson(readPref.tagSets)}, hedge ${tojson(readPref.hedge)}`); let testColl = conn.getCollection(kShardedNs); - conn.setSlaveOk(false); // purely rely on readPref + conn.setSecondaryOk(false); // purely rely on readPref let bulk = testColl.initializeUnorderedBulkOp(); for (let i = 0; i < kNumDocs; ++i) { diff --git a/jstests/sharding/read_write_concern_defaults_application.js b/jstests/sharding/read_write_concern_defaults_application.js index 5db16a6e27f..1fd7146d32f 100644 --- a/jstests/sharding/read_write_concern_defaults_application.js +++ b/jstests/sharding/read_write_concern_defaults_application.js @@ -459,6 +459,7 @@ let testCases = { hello: {skip: "does not accept read or write concern"}, hostInfo: {skip: "does not accept read or write concern"}, httpClientRequest: {skip: "does not accept read or write concern"}, + importCollection: {skip: "internal command"}, insert: { setUp: function(conn) { assert.commandWorked(conn.getDB(db).runCommand({create: coll, writeConcern: {w: 1}})); diff --git a/jstests/sharding/recovering_slaveok.js b/jstests/sharding/recovering_slaveok.js index 512719b08b6..d9bcd44da87 100644 --- a/jstests/sharding/recovering_slaveok.js +++ b/jstests/sharding/recovering_slaveok.js @@ -1,6 +1,6 @@ /** - * This tests that slaveOk'd queries in sharded setups get correctly routed when a slave goes into - * RECOVERING state, and don't break + * This tests that secondaryOk'd queries in sharded setups get correctly routed when a slave goes + * into RECOVERING state, and don't break */ // Shard secondaries are 
restarted, which may cause that shard's primary to stepdown while it does @@ -12,11 +12,11 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; load("jstests/replsets/rslib.js"); var shardTest = - new ShardingTest({name: "recovering_slaveok", shards: 2, mongos: 2, other: {rs: true}}); + new ShardingTest({name: "recovering_secondaryok", shards: 2, mongos: 2, other: {rs: true}}); var mongos = shardTest.s0; var mongosSOK = shardTest.s1; -mongosSOK.setSlaveOk(); +mongosSOK.setSecondaryOk(); var admin = mongos.getDB("admin"); var config = mongos.getDB("config"); @@ -50,7 +50,7 @@ shardTest.shardColl(coll, /* dbname */ null, /* waitForDelete */ true); -print("3: test normal and slaveOk queries"); +print("3: test normal and secondaryOk queries"); // Make shardA and rsA the same var shardA = shardTest.getShard(coll, {_id: -1}); @@ -87,7 +87,7 @@ print("6: stop non-RECOVERING secondary"); rsA.stop(goodSec); -print("7: check our regular and slaveOk query"); +print("7: check our regular and secondaryOk query"); assert.eq(2, coll.find().itcount()); assert.eq(2, collSOk.find().itcount()); @@ -100,7 +100,7 @@ print("9: wait for recovery"); rsA.waitForState(rsA.getSecondaries(), ReplSetTest.State.SECONDARY, 5 * 60 * 1000); -print("10: check our regular and slaveOk query"); +print("10: check our regular and secondaryOk query"); // We need to make sure our nodes are considered accessible from mongos - otherwise we fail // See SERVER-7274 @@ -112,7 +112,7 @@ awaitRSClientHosts(coll.getMongo(), rsB.nodes, {ok: true}); awaitRSClientHosts(collSOk.getMongo(), [rsA.getSecondaries()[0]], {secondary: true, ok: true}); awaitRSClientHosts(collSOk.getMongo(), [rsB.getSecondaries()[0]], {secondary: true, ok: true}); -print("SlaveOK Query..."); +print("SecondaryOk Query..."); var sOKCount = collSOk.find().itcount(); var collCount = null; diff --git a/jstests/sharding/session_info_in_oplog.js b/jstests/sharding/session_info_in_oplog.js index 617d5759207..a7644fca599 100644 --- 
a/jstests/sharding/session_info_in_oplog.js +++ b/jstests/sharding/session_info_in_oplog.js @@ -329,7 +329,7 @@ replTest.initiate(); var priConn = replTest.getPrimary(); var secConn = replTest.getSecondary(); -secConn.setSlaveOk(true); +secConn.setSecondaryOk(); runTests(priConn, priConn, secConn); @@ -338,7 +338,7 @@ replTest.stopSet(); var st = new ShardingTest({shards: {rs0: {nodes: kNodes}}}); secConn = st.rs0.getSecondary(); -secConn.setSlaveOk(true); +secConn.setSecondaryOk(); runTests(st.s, st.rs0.getPrimary(), secConn); st.stop(); diff --git a/jstests/sharding/shard_aware_init_secondaries.js b/jstests/sharding/shard_aware_init_secondaries.js index 59a8542f44b..f852c6e58a1 100644 --- a/jstests/sharding/shard_aware_init_secondaries.js +++ b/jstests/sharding/shard_aware_init_secondaries.js @@ -41,7 +41,7 @@ assert.commandWorked(priConn.getDB('admin').system.version.update( shardIdentityQuery, shardIdentityUpdate, {upsert: true, writeConcern: {w: 2}})); var secConn = replTest.getSecondary(); -secConn.setSlaveOk(true); +secConn.setSecondaryOk(); var res = secConn.getDB('admin').runCommand({shardingState: 1}); @@ -55,7 +55,7 @@ replTest.waitForPrimary(); replTest.awaitSecondaryNodes(); secConn = replTest.getSecondary(); -secConn.setSlaveOk(true); +secConn.setSecondaryOk(); res = secConn.getDB('admin').runCommand({shardingState: 1}); diff --git a/jstests/sharding/shard_identity_config_update.js b/jstests/sharding/shard_identity_config_update.js index 3e668c5903c..43c10bbbd22 100644 --- a/jstests/sharding/shard_identity_config_update.js +++ b/jstests/sharding/shard_identity_config_update.js @@ -63,7 +63,7 @@ assert.soon(function() { }); var secConn = st.rs0.getSecondary(); -secConn.setSlaveOk(true); +secConn.setSecondaryOk(); assert.soon(function() { return checkConfigStrUpdated(secConn, expectedConfigStr); }); @@ -96,7 +96,7 @@ assert.soon(function() { }); secConn = st.rs0.getSecondary(); -secConn.setSlaveOk(true); +secConn.setSecondaryOk(); assert.soon(function() 
{ return checkConfigStrUpdated(secConn, origConfigConnStr); }); diff --git a/jstests/sharding/shard_identity_rollback.js b/jstests/sharding/shard_identity_rollback.js index d6e47fa3137..25dbc2e19e4 100644 --- a/jstests/sharding/shard_identity_rollback.js +++ b/jstests/sharding/shard_identity_rollback.js @@ -52,7 +52,7 @@ assert.eq(shardIdentityDoc.clusterId, res.clusterId); // Ensure sharding state on the secondaries was *not* initialized secondaries.forEach(function(secondary) { - secondary.setSlaveOk(true); + secondary.setSecondaryOk(); res = secondary.getDB('admin').runCommand({shardingState: 1}); assert(!res.enabled, tojson(res)); }); @@ -105,7 +105,7 @@ try { // specified. We do want to wait to be able to connect to the node here however, so we need to pass // {waitForConnect: true}. priConn = replTest.start(priConn.nodeId, {shardsvr: '', waitForConnect: true}, true); -priConn.setSlaveOk(); +priConn.setSecondaryOk(); // Wait for the old primary to replicate the document that was written to the new primary while // it was shut down. 
diff --git a/jstests/sharding/shard_insert_getlasterror_w2.js b/jstests/sharding/shard_insert_getlasterror_w2.js index 7bde30b2dc5..a4a0f5c540f 100644 --- a/jstests/sharding/shard_insert_getlasterror_w2.js +++ b/jstests/sharding/shard_insert_getlasterror_w2.js @@ -70,7 +70,7 @@ replSet1.stop(secondary2); replSet1.waitForState(primary, ReplSetTest.State.SECONDARY); testDB.getMongo().adminCommand({setParameter: 1, logLevel: 1}); -testDB.getMongo().setSlaveOk(); +testDB.getMongo().setSecondaryOk(); print("trying some queries"); assert.soon(function() { try { diff --git a/jstests/slow1/replsets_priority1.js b/jstests/slow1/replsets_priority1.js index 3ff6c058cc7..4dea828c793 100644 --- a/jstests/slow1/replsets_priority1.js +++ b/jstests/slow1/replsets_priority1.js @@ -146,9 +146,9 @@ for (var i = 0; i < n; i++) { assert.soon(function() { var versions = [0, 0]; var secondaries = rs.getSecondaries(); - secondaries[0].setSlaveOk(); + secondaries[0].setSecondaryOk(); versions[0] = secondaries[0].getDB("local").system.replset.findOne().version; - secondaries[1].setSlaveOk(); + secondaries[1].setSecondaryOk(); versions[1] = secondaries[1].getDB("local").system.replset.findOne().version; return versions[0] == config.version && versions[1] == config.version; }); diff --git a/jstests/ssl/mongo_uri_secondaries.js b/jstests/ssl/mongo_uri_secondaries.js index a4ed1eae93c..73cca540c80 100644 --- a/jstests/ssl/mongo_uri_secondaries.js +++ b/jstests/ssl/mongo_uri_secondaries.js @@ -39,7 +39,7 @@ const subShellCommand = function(hosts) { for (var i = 0; i < 10; i++) { var db = Ms[i].getDB("test"); - db.setSlaveOk(true); + db.setSecondaryOk(); db.col.find().readPref("secondary").toArray(); } }; diff --git a/src/mongo/client/dbclient_base.cpp b/src/mongo/client/dbclient_base.cpp index 04748a525af..9a7c87eed66 100644 --- a/src/mongo/client/dbclient_base.cpp +++ b/src/mongo/client/dbclient_base.cpp @@ -49,8 +49,8 @@ #include "mongo/client/constants.h" #include 
"mongo/client/dbclient_cursor.h" #include "mongo/config.h" +#include "mongo/db/api_parameters_gen.h" #include "mongo/db/commands.h" -#include "mongo/db/initialize_api_parameters_gen.h" #include "mongo/db/json.h" #include "mongo/db/namespace_string.h" #include "mongo/db/query/kill_cursors_gen.h" diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index 2d90253f008..a0c8de84714 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -869,6 +869,7 @@ env.Library( '$BUILD_DIR/mongo/db/storage/storage_engine_lock_file', '$BUILD_DIR/mongo/db/storage/storage_engine_metadata', 'commands/server_status_core', + 'initialize_api_parameters', 'introspect', 'lasterror', 'query_exec', @@ -1460,17 +1461,39 @@ env.Library( env.Library( target='shared_request_handling', source=[ - 'initialize_api_parameters.cpp', 'transaction_validation.cpp', - env.Idlc('initialize_api_parameters.idl')[0], ], LIBDEPS=[ + 'api_parameters', 'error_labels', 'logical_session_cache_impl', ], ) env.Library( + target='api_parameters', + source=[ + 'api_parameters.cpp', + env.Idlc('api_parameters.idl')[0], + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/idl/idl_parser', + '$BUILD_DIR/mongo/idl/server_parameter', + ], +) + +env.Library( + target='initialize_api_parameters', + source=[ + 'initialize_api_parameters.cpp', + ], + LIBDEPS_PRIVATE=[ + 'api_parameters', + 'commands', + ], +) + +env.Library( target='logical_time', source=[ 'logical_time.cpp', diff --git a/src/mongo/db/api_parameters.cpp b/src/mongo/db/api_parameters.cpp new file mode 100644 index 00000000000..05ffe9c49cb --- /dev/null +++ b/src/mongo/db/api_parameters.cpp @@ -0,0 +1,79 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kCommand + +#include "mongo/platform/basic.h" + +#include "mongo/db/api_parameters.h" + +namespace mongo { + +const OperationContext::Decoration<APIParameters> APIParameters::get = + OperationContext::declareDecoration<APIParameters>(); + +APIParameters APIParameters::fromClient(const APIParametersFromClient& apiParamsFromClient) { + APIParameters apiParameters = APIParameters(); + auto apiVersion = apiParamsFromClient.getApiVersion(); + auto apiStrict = apiParamsFromClient.getApiStrict(); + auto apiDeprecationErrors = apiParamsFromClient.getApiDeprecationErrors(); + + if (apiVersion) { + apiParameters.setAPIVersion(apiVersion.value()); + } + + if (apiStrict) { + apiParameters.setAPIStrict(apiStrict.value()); + } + + if (apiDeprecationErrors) { + apiParameters.setAPIDeprecationErrors(apiDeprecationErrors.value()); + } + + return apiParameters; +} + +APIParameters APIParameters::fromBSON(const BSONObj& cmdObj) { + return APIParameters::fromClient( + APIParametersFromClient::parse("APIParametersFromClient"_sd, cmdObj)); +} + +void APIParameters::appendInfo(BSONObjBuilder* builder) const { + if (_apiVersion) { + builder->append(kAPIVersionFieldName, *_apiVersion); + } + if (_apiStrict) { + builder->append(kAPIStrictFieldName, *_apiStrict); + } + if (_apiDeprecationErrors) { + builder->append(kAPIDeprecationErrorsFieldName, *_apiDeprecationErrors); + } +} + +} // namespace mongo diff --git a/src/mongo/db/api_parameters.h b/src/mongo/db/api_parameters.h new file mode 100644 index 00000000000..7539dcb345e --- /dev/null +++ b/src/mongo/db/api_parameters.h @@ -0,0 +1,122 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/api_parameters_gen.h" +#include "mongo/db/operation_context.h" + +namespace mongo { + +/** + * Decorates operation context with methods to retrieve apiVersion, apiStrict, and + * apiDeprecationErrors. 
+ */ +class APIParameters { + +public: + static constexpr StringData kAPIVersionFieldName = "apiVersion"_sd; + static constexpr StringData kAPIStrictFieldName = "apiStrict"_sd; + static constexpr StringData kAPIDeprecationErrorsFieldName = "apiDeprecationErrors"_sd; + + static const OperationContext::Decoration<APIParameters> get; + static APIParameters fromClient(const APIParametersFromClient& apiParamsFromClient); + static APIParameters fromBSON(const BSONObj& cmdObj); + + void appendInfo(BSONObjBuilder* builder) const; + + const boost::optional<std::string>& getAPIVersion() const { + return _apiVersion; + } + + void setAPIVersion(StringData apiVersion) { + _apiVersion = apiVersion.toString(); + } + + const boost::optional<bool>& getAPIStrict() const { + return _apiStrict; + } + + void setAPIStrict(bool apiStrict) { + _apiStrict = apiStrict; + } + + const boost::optional<bool>& getAPIDeprecationErrors() const { + return _apiDeprecationErrors; + } + + void setAPIDeprecationErrors(bool apiDeprecationErrors) { + _apiDeprecationErrors = apiDeprecationErrors; + } + + const bool getParamsPassed() const { + return _apiVersion || _apiStrict || _apiDeprecationErrors; + } + +private: + boost::optional<std::string> _apiVersion; + boost::optional<bool> _apiStrict; + boost::optional<bool> _apiDeprecationErrors; +}; + + +/** + * Temporarily remove the user's API parameters from an OperationContext. 
+ */ +class IgnoreAPIParametersBlock { +public: + IgnoreAPIParametersBlock() = delete; + IgnoreAPIParametersBlock(const IgnoreAPIParametersBlock&) = delete; + IgnoreAPIParametersBlock& operator=(const IgnoreAPIParametersBlock&) = delete; + + explicit IgnoreAPIParametersBlock(OperationContext* opCtx) : _opCtx(opCtx) { + _apiParams = APIParameters::get(_opCtx); + APIParameters::get(_opCtx) = APIParameters(); + } + + void release() { + if (_released) { + return; + } + + APIParameters::get(_opCtx) = _apiParams; + _released = true; + } + + ~IgnoreAPIParametersBlock() { + release(); + } + +private: + OperationContext* _opCtx; + APIParameters _apiParams; + bool _released = false; +}; + +} // namespace mongo diff --git a/src/mongo/db/initialize_api_parameters.idl b/src/mongo/db/api_parameters.idl index cc3a3d13e6c..cc3a3d13e6c 100644 --- a/src/mongo/db/initialize_api_parameters.idl +++ b/src/mongo/db/api_parameters.idl diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp index 617145e3abc..63a304a1f30 100644 --- a/src/mongo/db/catalog/multi_index_block.cpp +++ b/src/mongo/db/catalog/multi_index_block.cpp @@ -872,7 +872,9 @@ boost::optional<ResumeIndexInfo> MultiIndexBlock::_abortWithoutCleanup(Operation void MultiIndexBlock::_writeStateToDisk(OperationContext* opCtx) const { auto obj = _constructStateObject(); - auto rs = opCtx->getServiceContext()->getStorageEngine()->makeTemporaryRecordStore(opCtx); + auto rs = opCtx->getServiceContext() + ->getStorageEngine() + ->makeTemporaryRecordStoreForResumableIndexBuild(opCtx); WriteUnitOfWork wuow(opCtx); diff --git a/src/mongo/db/catalog_raii.h b/src/mongo/db/catalog_raii.h index 47444538dd5..367b87e933b 100644 --- a/src/mongo/db/catalog_raii.h +++ b/src/mongo/db/catalog_raii.h @@ -291,7 +291,7 @@ private: class ReadSourceScope { public: ReadSourceScope(OperationContext* opCtx, - RecoveryUnit::ReadSource readSource = RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource 
readSource, boost::optional<Timestamp> provided = boost::none); ~ReadSourceScope(); diff --git a/src/mongo/db/catalog_raii_test.cpp b/src/mongo/db/catalog_raii_test.cpp index cc222301ca0..e767d1f30ca 100644 --- a/src/mongo/db/catalog_raii_test.cpp +++ b/src/mongo/db/catalog_raii_test.cpp @@ -230,7 +230,7 @@ public: } private: - ReadSource _source = ReadSource::kUnset; + ReadSource _source = ReadSource::kNoTimestamp; boost::optional<Timestamp> _timestamp; }; @@ -257,8 +257,8 @@ TEST_F(ReadSourceScopeTest, RestoreReadSource) { ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kProvided); ASSERT_EQ(opCtx()->recoveryUnit()->getPointInTimeReadTimestamp(), Timestamp(1, 2)); { - ReadSourceScope scope(opCtx()); - ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kUnset); + ReadSourceScope scope(opCtx(), ReadSource::kNoTimestamp); + ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kNoTimestamp); opCtx()->recoveryUnit()->setTimestampReadSource(ReadSource::kNoOverlap); ASSERT_EQ(opCtx()->recoveryUnit()->getTimestampReadSource(), ReadSource::kNoOverlap); diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h index ee2040764b6..f4d7960a759 100644 --- a/src/mongo/db/clientcursor.h +++ b/src/mongo/db/clientcursor.h @@ -32,10 +32,10 @@ #include <boost/optional.hpp> #include <functional> +#include "mongo/db/api_parameters.h" #include "mongo/db/auth/privilege.h" #include "mongo/db/auth/user_name.h" #include "mongo/db/cursor_id.h" -#include "mongo/db/initialize_api_parameters.h" #include "mongo/db/jsobj.h" #include "mongo/db/logical_session_id.h" #include "mongo/db/query/plan_executor.h" diff --git a/src/mongo/db/command_generic_argument.cpp b/src/mongo/db/command_generic_argument.cpp index 8434b65a3c3..e15c2498a97 100644 --- a/src/mongo/db/command_generic_argument.cpp +++ b/src/mongo/db/command_generic_argument.cpp @@ -56,9 +56,9 @@ static constexpr std::array<SpecialArgRecord, 34> specials{{ // 
/-isGeneric // | /-stripFromRequest // | | /-stripFromReply - {"apiVersion"_sd, 1, 0, 0}, - {"apiStrict"_sd, 1, 0, 0}, - {"apiDeprecationErrors"_sd, 1, 0, 0}, + {"apiVersion"_sd, 1, 1, 0}, + {"apiStrict"_sd, 1, 1, 0}, + {"apiDeprecationErrors"_sd, 1, 1, 0}, {"$audit"_sd, 1, 1, 0}, {"$client"_sd, 1, 1, 0}, {"$configServerState"_sd, 1, 1, 1}, diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp index f9ef6f72574..8d9a8de296b 100644 --- a/src/mongo/db/commands.cpp +++ b/src/mongo/db/commands.cpp @@ -867,6 +867,14 @@ Command::Command(StringData name, std::vector<StringData> aliases) globalCommandRegistry()->registerCommand(this, _name, _aliases); } +const std::set<std::string>& Command::apiVersions() const { + return kNoApiVersions; +} + +const std::set<std::string>& Command::deprecatedApiVersions() const { + return kNoApiVersions; +} + bool Command::hasAlias(const StringData& alias) const { return globalCommandRegistry()->findCommand(alias) == this; } diff --git a/src/mongo/db/commands.h b/src/mongo/db/commands.h index 1877556f356..06803dbad84 100644 --- a/src/mongo/db/commands.h +++ b/src/mongo/db/commands.h @@ -358,15 +358,18 @@ public: /* * Returns the list of API versions that include this command. */ - virtual const std::set<std::string>& apiVersions() const { - return kNoApiVersions; - } + virtual const std::set<std::string>& apiVersions() const; /* * Returns the list of API versions in which this command is deprecated. */ - virtual const std::set<std::string>& deprecatedApiVersions() const { - return kNoApiVersions; + virtual const std::set<std::string>& deprecatedApiVersions() const; + + /* + * Some commands permit any values for apiVersion, apiStrict, and apiDeprecationErrors. 
+ */ + virtual bool acceptsAnyApiVersionParameters() const { + return false; } /** diff --git a/src/mongo/db/commands/test_api_version_2_commands.cpp b/src/mongo/db/commands/test_api_version_2_commands.cpp index b2c79a7ef70..738e13b1366 100644 --- a/src/mongo/db/commands/test_api_version_2_commands.cpp +++ b/src/mongo/db/commands/test_api_version_2_commands.cpp @@ -27,8 +27,8 @@ * it in the license file. */ +#include "mongo/db/api_parameters.h" #include "mongo/db/commands.h" -#include "mongo/db/initialize_api_parameters.h" namespace mongo { diff --git a/src/mongo/db/commands/test_deprecation_command.cpp b/src/mongo/db/commands/test_deprecation_command.cpp index 44e61edb4a0..74d93942ddd 100644 --- a/src/mongo/db/commands/test_deprecation_command.cpp +++ b/src/mongo/db/commands/test_deprecation_command.cpp @@ -27,8 +27,8 @@ * it in the license file. */ +#include "mongo/db/api_parameters.h" #include "mongo/db/commands.h" -#include "mongo/db/initialize_api_parameters.h" namespace mongo { diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp index a8329f4641d..22a9181f157 100644 --- a/src/mongo/db/db_raii.cpp +++ b/src/mongo/db/db_raii.cpp @@ -90,6 +90,10 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, const NamespaceStringOrUUID& nsOrUUID, AutoGetCollectionViewMode viewMode, Date_t deadline) { + // The caller was expecting to conflict with batch application before entering this function. + // i.e. the caller does not currently have a ShouldNotConflict... block in scope. + bool callerWasConflicting = opCtx->lockState()->shouldConflictWithSecondaryBatchApplication(); + // Don't take the ParallelBatchWriterMode lock when the server parameter is set and our // storage engine supports snapshot reads. 
if (gAllowSecondaryReadsDuringBatchApplication.load() && @@ -100,11 +104,6 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, const auto collectionLockMode = getLockModeForQuery(opCtx, nsOrUUID.nss()); _autoColl.emplace(opCtx, nsOrUUID, collectionLockMode, viewMode, deadline); - // If the read source is explicitly set to kNoTimestamp, we read the most up to date data and do - // not consider changing our ReadSource (e.g. FTDC needs that). - if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kNoTimestamp) - return; - repl::ReplicationCoordinator* const replCoord = repl::ReplicationCoordinator::get(opCtx); const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel(); @@ -154,6 +153,32 @@ AutoGetCollectionForRead::AutoGetCollectionForRead(OperationContext* opCtx, << afterClusterTime->asTimestamp().toString()); } + // This assertion protects operations from reading inconsistent data on secondaries when + // using the default ReadSource of kNoTimestamp. + + // Reading at lastApplied on secondaries is the safest behavior and is enabled for all user + // and DBDirectClient reads using 'local' and 'available' readConcerns. If an internal + // operation wishes to read without a timestamp during a batch, a ShouldNotConflict can + // suppress this fatal assertion with the following considerations: + // * The operation is not reading replicated data in a replication state where batch + // application is active OR + // * Reading inconsistent, out-of-order data is either inconsequential or required by + // the operation. + + // If the caller entered this function expecting to conflict with batch application + // (i.e. no ShouldNotConflict block in scope), but they are reading without a timestamp and + // not holding the PBWM lock, then there is a possibility that this reader may + // unintentionally see inconsistent data during a batch. 
Certain namespaces are applied + // serially in oplog application, and therefore can be safely read without taking the PBWM + // lock or reading at a timestamp. + if (readSource == RecoveryUnit::ReadSource::kNoTimestamp && callerWasConflicting && + !nss.mustBeAppliedInOwnOplogBatch() && + SnapshotHelper::shouldReadAtLastApplied(opCtx, nss)) { + LOGV2_FATAL(4728700, + "Reading from replicated collection without read timestamp or PBWM lock", + "collection"_attr = nss); + } + auto minSnapshot = coll->getMinimumVisibleSnapshot(); if (!SnapshotHelper::collectionChangesConflictWithRead(minSnapshot, readTimestamp)) { return; diff --git a/src/mongo/db/db_raii_test.cpp b/src/mongo/db/db_raii_test.cpp index b101ce91961..eba322c5581 100644 --- a/src/mongo/db/db_raii_test.cpp +++ b/src/mongo/db/db_raii_test.cpp @@ -42,6 +42,7 @@ #include "mongo/db/query/internal_plans.h" #include "mongo/db/storage/snapshot_manager.h" #include "mongo/logv2/log.h" +#include "mongo/unittest/death_test.h" #include "mongo/unittest/unittest.h" #include "mongo/util/time_support.h" @@ -219,6 +220,8 @@ TEST_F(DBRAIITestFixture, Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX); ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX)); + // Simulate using a DBDirectClient to test this behavior for user reads. + client2.first->setInDirectClient(true); AutoGetCollectionForRead coll(client2.second.get(), nss); } @@ -239,6 +242,8 @@ TEST_F(DBRAIITestFixture, Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX); ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX)); + // Simulate using a DBDirectClient to test this behavior for user reads. 
+ client2.first->setInDirectClient(true); AutoGetCollectionForRead coll(client2.second.get(), nss); } @@ -266,10 +271,12 @@ TEST_F(DBRAIITestFixture, Lock::DBLock dbLock1(client1.second.get(), nss.db(), MODE_IX); ASSERT(client1.second->lockState()->isDbLockedForMode(nss.db(), MODE_IX)); + // Simulate using a DBDirectClient to test this behavior for user reads. + client2.first->setInDirectClient(true); AutoGetCollectionForRead coll(client2.second.get(), NamespaceString("local.system.js")); // Reading from an unreplicated collection does not change the ReadSource to kLastApplied. ASSERT_EQ(client2.second.get()->recoveryUnit()->getTimestampReadSource(), - RecoveryUnit::ReadSource::kUnset); + RecoveryUnit::ReadSource::kNoTimestamp); // Reading from a replicated collection will try to switch to kLastApplied. Because we are // already reading without a timestamp and we can't reacquire the PBWM lock to continue reading @@ -300,12 +307,15 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedConflict) { auto snapshotManager = client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager(); snapshotManager->setLastApplied(opTime.getTimestamp()); + + // Simulate using a DBDirectClient to test this behavior for user reads. + client1.first->setInDirectClient(true); AutoGetCollectionForRead coll(client1.second.get(), nss); // We can't read from kLastApplied in this scenario because there is a catalog conflict. Resort // to taking the PBWM lock and reading without a timestamp. 
ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), - RecoveryUnit::ReadSource::kUnset); + RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_TRUE(client1.second.get()->lockState()->isLockHeldForMode( resourceIdParallelBatchWriterMode, MODE_IS)); } @@ -325,6 +335,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) { auto snapshotManager = client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager(); ASSERT_FALSE(snapshotManager->getLastApplied()); + + // Simulate using a DBDirectClient to test this behavior for user reads. + client1.first->setInDirectClient(true); AutoGetCollectionForRead coll(client1.second.get(), nss); ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), @@ -334,6 +347,33 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadLastAppliedUnavailable) { resourceIdParallelBatchWriterMode, MODE_IS)); } +TEST_F(DBRAIITestFixture, AutoGetCollectionForReadOplogOnSecondary) { + // This test simulates a situation where AutoGetCollectionForRead reads at lastApplied on a + // secondary. + auto replCoord = repl::ReplicationCoordinator::get(client1.second.get()); + ASSERT_OK(replCoord->setFollowerMode(repl::MemberState::RS_SECONDARY)); + + // Ensure the default ReadSource is used. + ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), + RecoveryUnit::ReadSource::kNoTimestamp); + + // Don't call into the ReplicationCoordinator to update lastApplied because it is only a mock + // class and does not update the correct state in the SnapshotManager. + repl::OpTime opTime(Timestamp(2, 1), 1); + auto snapshotManager = + client1.second.get()->getServiceContext()->getStorageEngine()->getSnapshotManager(); + snapshotManager->setLastApplied(opTime.getTimestamp()); + + // Simulate using a DBDirectClient to test this behavior for user reads. 
+ client1.first->setInDirectClient(true); + AutoGetCollectionForRead coll(client1.second.get(), NamespaceString::kRsOplogNamespace); + + ASSERT_EQ(client1.second.get()->recoveryUnit()->getTimestampReadSource(), + RecoveryUnit::ReadSource::kLastApplied); + ASSERT_FALSE(client1.second.get()->lockState()->isLockHeldForMode( + resourceIdParallelBatchWriterMode, MODE_IS)); +} + TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) { auto opCtx = client1.second.get(); @@ -342,11 +382,15 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUsesLastAppliedOnSecondary) { CollectionOptions options; options.capped = true; ASSERT_OK(storageInterface()->createCollection(opCtx, nss, options)); + + // Simulate using a DBDirectClient to test this behavior for user reads. + opCtx->getClient()->setInDirectClient(true); AutoGetCollectionForRead autoColl(opCtx, nss); auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection()); // The collection scan should use the default ReadSource on a primary. - ASSERT_EQ(RecoveryUnit::ReadSource::kUnset, opCtx->recoveryUnit()->getTimestampReadSource()); + ASSERT_EQ(RecoveryUnit::ReadSource::kNoTimestamp, + opCtx->recoveryUnit()->getTimestampReadSource()); // When the tailable query recovers from its yield, it should discover that the node is // secondary and change its read source. @@ -373,6 +417,9 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp) ASSERT_OK(storageInterface()->createCollection(opCtx, nss, options)); ASSERT_OK( repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY)); + + // Simulate using a DBDirectClient to test this behavior for user reads. 
+ opCtx->getClient()->setInDirectClient(true); AutoGetCollectionForRead autoColl(opCtx, nss); auto exec = makeTailableQueryPlan(opCtx, autoColl.getCollection()); @@ -390,9 +437,36 @@ TEST_F(DBRAIITestFixture, AutoGetCollectionForReadChangedReadSourceAfterStepUp) // After restoring, the collection scan should now be reading with kUnset, the default on // primaries. - ASSERT_EQ(RecoveryUnit::ReadSource::kUnset, opCtx->recoveryUnit()->getTimestampReadSource()); + ASSERT_EQ(RecoveryUnit::ReadSource::kNoTimestamp, + opCtx->recoveryUnit()->getTimestampReadSource()); ASSERT_EQUALS(PlanExecutor::IS_EOF, exec->getNext(&unused, nullptr)); } +DEATH_TEST_F(DBRAIITestFixture, AutoGetCollectionForReadUnsafe, "Fatal assertion") { + auto opCtx = client1.second.get(); + ASSERT_OK(storageInterface()->createCollection(opCtx, nss, {})); + + ASSERT_OK( + repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY)); + + // Non-user read on a replicated collection should fail because we are reading on a secondary + // without a timestamp. + AutoGetCollectionForRead autoColl(opCtx, nss); +} + +TEST_F(DBRAIITestFixture, AutoGetCollectionForReadSafe) { + auto opCtx = client1.second.get(); + ASSERT_OK(storageInterface()->createCollection(opCtx, nss, {})); + + ASSERT_OK( + repl::ReplicationCoordinator::get(opCtx)->setFollowerMode(repl::MemberState::RS_SECONDARY)); + + // Non-user read on a replicated collection should not fail because of the ShouldNotConflict + // block. 
+ ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(opCtx->lockState()); + + AutoGetCollectionForRead autoColl(opCtx, nss); +} + } // namespace } // namespace mongo diff --git a/src/mongo/db/dbdirectclient.cpp b/src/mongo/db/dbdirectclient.cpp index 5386bf567d2..bb1f5553906 100644 --- a/src/mongo/db/dbdirectclient.cpp +++ b/src/mongo/db/dbdirectclient.cpp @@ -143,6 +143,7 @@ DbResponse loopbackBuildResponse(OperationContext* const opCtx, toSend.header().setId(nextMessageId()); toSend.header().setResponseToMsgId(0); + IgnoreAPIParametersBlock ignoreApiParametersBlock(opCtx); return opCtx->getServiceContext()->getServiceEntryPoint()->handleRequest(opCtx, toSend).get(); } } // namespace diff --git a/src/mongo/db/exec/sbe/expressions/expression.cpp b/src/mongo/db/exec/sbe/expressions/expression.cpp index 5c598445272..6b517f293b5 100644 --- a/src/mongo/db/exec/sbe/expressions/expression.cpp +++ b/src/mongo/db/exec/sbe/expressions/expression.cpp @@ -359,7 +359,7 @@ static stdx::unordered_map<std::string, BuiltinFn> kBuiltinFunctions = { {"addToArray", BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::addToArray, true}}, {"addToSet", BuiltinFn{[](size_t n) { return n == 1; }, vm::Builtin::addToSet, true}}, {"doubleDoubleSum", - BuiltinFn{[](size_t n) { return n > 0; }, vm::Builtin::doubleDoubleSum, true}}, + BuiltinFn{[](size_t n) { return n > 0; }, vm::Builtin::doubleDoubleSum, false}}, {"bitTestZero", BuiltinFn{[](size_t n) { return n == 2; }, vm::Builtin::bitTestZero, false}}, {"bitTestMask", BuiltinFn{[](size_t n) { return n == 2; }, vm::Builtin::bitTestMask, false}}, {"bitTestPosition", @@ -402,6 +402,7 @@ static stdx::unordered_map<std::string, InstrFn> kInstrFunctions = { InstrFn{[](size_t n) { return n == 1; }, &vm::CodeFragment::appendIsNumber, false}}, {"isBinData", InstrFn{[](size_t n) { return n == 1; }, &vm::CodeFragment::appendIsBinData, false}}, + {"isDate", InstrFn{[](size_t n) { return n == 1; }, &vm::CodeFragment::appendIsDate, 
false}}, {"sum", InstrFn{[](size_t n) { return n == 1; }, &vm::CodeFragment::appendSum, true}}, {"min", InstrFn{[](size_t n) { return n == 1; }, &vm::CodeFragment::appendMin, true}}, {"max", InstrFn{[](size_t n) { return n == 1; }, &vm::CodeFragment::appendMax, true}}, diff --git a/src/mongo/db/exec/sbe/stages/loop_join.h b/src/mongo/db/exec/sbe/stages/loop_join.h index bf19c50b8f2..0f94d39a9c1 100644 --- a/src/mongo/db/exec/sbe/stages/loop_join.h +++ b/src/mongo/db/exec/sbe/stages/loop_join.h @@ -57,8 +57,7 @@ public: private: // Set of variables coming from the outer side. const value::SlotVector _outerProjects; - // Set of correlated variables from the outer side that are visible on the inner side. They must - // be also present in the _outerProjects. + // Set of correlated variables from the outer side that are visible on the inner side. const value::SlotVector _outerCorrelated; // If not set then this is a cross product. const std::unique_ptr<EExpression> _predicate; diff --git a/src/mongo/db/exec/sbe/vm/vm.cpp b/src/mongo/db/exec/sbe/vm/vm.cpp index b5890497f45..ba7c849431b 100644 --- a/src/mongo/db/exec/sbe/vm/vm.cpp +++ b/src/mongo/db/exec/sbe/vm/vm.cpp @@ -96,6 +96,7 @@ int Instruction::stackOffset[Instruction::Tags::lastInstruction] = { 0, // isString 0, // isNumber 0, // isBinData + 0, // isDate 0, // typeMatch 0, // function is special, the stack offset is encoded in the instruction itself @@ -314,6 +315,10 @@ void CodeFragment::appendIsBinData() { appendSimpleInstruction(Instruction::isBinData); } +void CodeFragment::appendIsDate() { + appendSimpleInstruction(Instruction::isDate); +} + void CodeFragment::appendTypeMatch(uint32_t typeMask) { Instruction i; i.tag = Instruction::typeMatch; @@ -1814,6 +1819,18 @@ std::tuple<uint8_t, value::TypeTags, value::Value> ByteCode::run(const CodeFragm } break; } + case Instruction::isDate: { + auto [owned, tag, val] = getFromStack(0); + + if (tag != value::TypeTags::Nothing) { + topStack(false, 
value::TypeTags::Boolean, tag == value::TypeTags::Date); + } + + if (owned) { + value::releaseValue(tag, val); + } + break; + } case Instruction::typeMatch: { auto typeMask = value::readFromMemory<uint32_t>(pcPointer); pcPointer += sizeof(typeMask); diff --git a/src/mongo/db/exec/sbe/vm/vm.h b/src/mongo/db/exec/sbe/vm/vm.h index a5197d17437..e4590a79c71 100644 --- a/src/mongo/db/exec/sbe/vm/vm.h +++ b/src/mongo/db/exec/sbe/vm/vm.h @@ -149,6 +149,7 @@ struct Instruction { isString, isNumber, isBinData, + isDate, typeMatch, function, @@ -259,6 +260,7 @@ public: void appendIsString(); void appendIsNumber(); void appendIsBinData(); + void appendIsDate(); void appendTypeMatch(uint32_t typeMask); void appendFunction(Builtin f, uint8_t arity); void appendJump(int jumpOffset); diff --git a/src/mongo/db/free_mon/free_mon_storage.cpp b/src/mongo/db/free_mon/free_mon_storage.cpp index 7c25c6a671c..89be39295e1 100644 --- a/src/mongo/db/free_mon/free_mon_storage.cpp +++ b/src/mongo/db/free_mon/free_mon_storage.cpp @@ -57,6 +57,10 @@ boost::optional<FreeMonStorageState> FreeMonStorage::read(OperationContext* opCt auto storageInterface = repl::StorageInterface::get(opCtx); + // Ensure we read without a timestamp. + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); + AutoGetCollectionForRead autoRead(opCtx, NamespaceString::kServerConfigurationNamespace); auto swObj = storageInterface->findById( diff --git a/src/mongo/db/ftdc/collector.cpp b/src/mongo/db/ftdc/collector.cpp index 37dd68b136e..11ba9d4d3a4 100644 --- a/src/mongo/db/ftdc/collector.cpp +++ b/src/mongo/db/ftdc/collector.cpp @@ -70,8 +70,9 @@ std::tuple<BSONObj, Date_t> FTDCCollectorCollection::collect(Client* client) { ShouldNotConflictWithSecondaryBatchApplicationBlock shouldNotConflictBlock(opCtx->lockState()); opCtx->lockState()->skipAcquireTicket(); - // Explicitly start future read transactions without a timestamp. 
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + // Ensure future transactions read without a timestamp. + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); for (auto& collector : _collectors) { BSONObjBuilder subObjBuilder(builder.subobjStart(collector->name())); diff --git a/src/mongo/db/index_build_entry_helpers.cpp b/src/mongo/db/index_build_entry_helpers.cpp index da3f43b29e2..fc689873f6e 100644 --- a/src/mongo/db/index_build_entry_helpers.cpp +++ b/src/mongo/db/index_build_entry_helpers.cpp @@ -254,7 +254,8 @@ Status removeIndexBuildEntry(OperationContext* opCtx, UUID indexBuildUUID) { StatusWith<IndexBuildEntry> getIndexBuildEntry(OperationContext* opCtx, UUID indexBuildUUID) { // Read the most up to date data. - ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp); + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); AutoGetCollectionForRead autoCollection(opCtx, NamespaceString::kIndexBuildEntryNamespace); const Collection* collection = autoCollection.getCollection(); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index c8caafc318f..d27dd0848db 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -2553,7 +2553,8 @@ void IndexBuildsCoordinator::_buildIndex(OperationContext* opCtx, // Read without a timestamp. When we commit, we block writes which guarantees all writes are // visible. - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); // The collection scan might read with a kMajorityCommitted read source, but will restore // kNoTimestamp afterwards. 
_scanCollectionAndInsertSortedKeysIntoIndex(opCtx, replState); @@ -2655,7 +2656,7 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesWithoutBlockingWrites( uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( opCtx, replState->buildUUID, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kYield)); } @@ -2681,7 +2682,7 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites( uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( opCtx, replState->buildUUID, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); } @@ -2769,7 +2770,7 @@ IndexBuildsCoordinator::CommitResult IndexBuildsCoordinator::_insertKeysFromSide uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( opCtx, replState->buildUUID, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); try { @@ -2916,7 +2917,7 @@ StatusWith<std::pair<long long, long long>> IndexBuildsCoordinator::_runIndexReb uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( opCtx, replState->buildUUID, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); uassertStatusOK( diff --git a/src/mongo/db/initialize_api_parameters.cpp b/src/mongo/db/initialize_api_parameters.cpp index 11a5b68ae10..575fd476827 100644 --- a/src/mongo/db/initialize_api_parameters.cpp +++ b/src/mongo/db/initialize_api_parameters.cpp @@ -27,8 +27,17 @@ * it in the license file. 
*/ +#include "mongo/platform/basic.h" + #include "mongo/db/initialize_api_parameters.h" +#include <string> + +#include "mongo/db/commands.h" +#include "mongo/db/operation_context.h" +#include "mongo/util/assert_util.h" +#include "mongo/util/str.h" + namespace mongo { const APIParametersFromClient initializeAPIParameters(OperationContext* opCtx, @@ -44,6 +53,10 @@ const APIParametersFromClient initializeAPIParameters(OperationContext* opCtx, apiParamsFromClient.getApiVersion()); } + if (command->acceptsAnyApiVersionParameters()) { + return apiParamsFromClient; + } + if (apiParamsFromClient.getApiDeprecationErrors() || apiParamsFromClient.getApiStrict()) { uassert(4886600, "Provided apiStrict and/or apiDeprecationErrors without passing apiVersion", @@ -88,44 +101,4 @@ const APIParametersFromClient initializeAPIParameters(OperationContext* opCtx, return apiParamsFromClient; } -const OperationContext::Decoration<APIParameters> handle = - OperationContext::declareDecoration<APIParameters>(); - -APIParameters& APIParameters::get(OperationContext* opCtx) { - return handle(opCtx); -} - -APIParameters APIParameters::fromClient(const APIParametersFromClient& apiParamsFromClient) { - APIParameters apiParameters = APIParameters(); - auto apiVersion = apiParamsFromClient.getApiVersion(); - auto apiStrict = apiParamsFromClient.getApiStrict(); - auto apiDeprecationErrors = apiParamsFromClient.getApiDeprecationErrors(); - - if (apiVersion) { - apiParameters.setAPIVersion(apiVersion.value()); - } - - if (apiStrict) { - apiParameters.setAPIStrict(apiStrict.value()); - } - - if (apiDeprecationErrors) { - apiParameters.setAPIDeprecationErrors(apiDeprecationErrors.value()); - } - - return apiParameters; -} - -void APIParameters::appendInfo(BSONObjBuilder* builder) const { - if (_apiVersion) { - builder->append(kAPIVersionFieldName, *_apiVersion); - } - if (_apiStrict) { - builder->append(kAPIStrictFieldName, *_apiStrict); - } - if (_apiDeprecationErrors) { - 
builder->append(kAPIDeprecationErrorsFieldName, *_apiDeprecationErrors); - } -} - } // namespace mongo diff --git a/src/mongo/db/initialize_api_parameters.h b/src/mongo/db/initialize_api_parameters.h index 73215f607c8..e62d0defecc 100644 --- a/src/mongo/db/initialize_api_parameters.h +++ b/src/mongo/db/initialize_api_parameters.h @@ -29,73 +29,19 @@ #pragma once -#include "mongo/db/commands.h" -#include "mongo/db/initialize_api_parameters_gen.h" -#include "mongo/db/operation_context.h" +#include "api_parameters.h" namespace mongo { +class BSONObj; +class Command; +class OperationContext; + /** - * See VERSIONED_API_README.md for an overview of the Versioned API. - * - * This function parses a command's API Version parameters from a request and stores the apiVersion, + * Parse a command's API Version parameters from a request and store the apiVersion, * apiStrict, and apiDeprecationErrors fields. */ const APIParametersFromClient initializeAPIParameters(OperationContext* opCtx, const BSONObj& requestBody, Command* command); - -/** - * Decorates operation context with methods to retrieve apiVersion, apiStrict, and - * apiDeprecationErrors. 
- */ -class APIParameters { - -public: - static constexpr StringData kAPIVersionFieldName = "apiVersion"_sd; - static constexpr StringData kAPIStrictFieldName = "apiStrict"_sd; - static constexpr StringData kAPIDeprecationErrorsFieldName = "apiDeprecationErrors"_sd; - - APIParameters() = default; - static APIParameters& get(OperationContext* opCtx); - static APIParameters fromClient(const APIParametersFromClient& apiParamsFromClient); - - void appendInfo(BSONObjBuilder* builder) const; - - const boost::optional<std::string>& getAPIVersion() const { - return _apiVersion; - } - - void setAPIVersion(StringData apiVersion) { - _apiVersion = apiVersion.toString(); - } - - const boost::optional<bool>& getAPIStrict() const { - return _apiStrict; - } - - void setAPIStrict(bool apiStrict) { - _apiStrict = apiStrict; - } - - const boost::optional<bool>& getAPIDeprecationErrors() const { - return _apiDeprecationErrors; - } - - void setAPIDeprecationErrors(bool apiDeprecationErrors) { - _apiDeprecationErrors = apiDeprecationErrors; - } - - bool getParamsPassed() const { - return _apiVersion || _apiStrict || _apiDeprecationErrors; - } - - BSONObj toBSON() const; - -private: - boost::optional<std::string> _apiVersion; - boost::optional<bool> _apiStrict; - boost::optional<bool> _apiDeprecationErrors; -}; - } // namespace mongo diff --git a/src/mongo/db/mongod_options.cpp b/src/mongo/db/mongod_options.cpp index f0722782157..e499d04881a 100644 --- a/src/mongo/db/mongod_options.cpp +++ b/src/mongo/db/mongod_options.cpp @@ -404,6 +404,9 @@ Status storeMongodOptions(const moe::Environment& params) { if (params.count("storage.syncPeriodSecs")) { storageGlobalParams.syncdelay = params["storage.syncPeriodSecs"].as<double>(); + storageGlobalParams.checkpointDelaySecs = + static_cast<size_t>(params["storage.syncPeriodSecs"].as<double>()); + if (storageGlobalParams.syncdelay < 0 || storageGlobalParams.syncdelay > StorageGlobalParams::kMaxSyncdelaySecs) { return Status(ErrorCodes::BadValue, 
diff --git a/src/mongo/db/namespace_string.cpp b/src/mongo/db/namespace_string.cpp index 9471aca909c..bee7df5ca40 100644 --- a/src/mongo/db/namespace_string.cpp +++ b/src/mongo/db/namespace_string.cpp @@ -144,6 +144,18 @@ bool NamespaceString::isLegalClientSystemNS() const { return false; } +/** + * Oplog entries on 'system.views' should also be processed one at a time. View catalog immediately + * reflects changes for each oplog entry so we can see inconsistent view catalog if multiple oplog + * entries on 'system.views' are being applied out of the original order. + * + * Process updates to 'admin.system.version' individually as well so the secondary's FCV when + * processing each operation matches the primary's when committing that operation. + */ +bool NamespaceString::mustBeAppliedInOwnOplogBatch() const { + return isSystemDotViews() || isServerConfigurationCollection() || isPrivilegeCollection(); +} + NamespaceString NamespaceString::makeListCollectionsNSS(StringData dbName) { NamespaceString nss(dbName, listCollectionsCursorCol); dassert(nss.isValid()); diff --git a/src/mongo/db/namespace_string.h b/src/mongo/db/namespace_string.h index a43406f8bd4..e5de9877c84 100644 --- a/src/mongo/db/namespace_string.h +++ b/src/mongo/db/namespace_string.h @@ -338,6 +338,11 @@ public: bool isDropPendingNamespace() const; /** + * Returns true if operations on this namespace must be applied in their own oplog batch. + */ + bool mustBeAppliedInOwnOplogBatch() const; + + /** * Returns the drop-pending namespace name for this namespace, provided the given optime. 
* * Example: diff --git a/src/mongo/db/pipeline/document_source_writer.h b/src/mongo/db/pipeline/document_source_writer.h index 9c175890ecf..b91c49a90db 100644 --- a/src/mongo/db/pipeline/document_source_writer.h +++ b/src/mongo/db/pipeline/document_source_writer.h @@ -65,7 +65,7 @@ public: } repl::ReadConcernArgs::get(_opCtx) = repl::ReadConcernArgs(); - _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::kUnset); + _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); } ~DocumentSourceWriteBlock() { diff --git a/src/mongo/db/pipeline/expression_context.h b/src/mongo/db/pipeline/expression_context.h index 6cd1bba4f3b..5140d8ea32f 100644 --- a/src/mongo/db/pipeline/expression_context.h +++ b/src/mongo/db/pipeline/expression_context.h @@ -328,10 +328,6 @@ public: // 'jsHeapLimitMB' server parameter. boost::optional<int> jsHeapLimitMB; - // When set this timeout limits the allowed execution time for a JavaScript function invocation - // under any Scope returned by getJsExecWithScope(). - int jsFnTimeoutMillis; - // An interface for accessing information or performing operations that have different // implementations on mongod and mongos, or that only make sense on one of the two. 
// Additionally, putting some of this functionality behind an interface prevents aggregation diff --git a/src/mongo/db/pipeline/process_interface/common_process_interface.cpp b/src/mongo/db/pipeline/process_interface/common_process_interface.cpp index 330ef41693e..b6b304c348b 100644 --- a/src/mongo/db/pipeline/process_interface/common_process_interface.cpp +++ b/src/mongo/db/pipeline/process_interface/common_process_interface.cpp @@ -184,15 +184,11 @@ bool CommonProcessInterface::keyPatternNamesExactPaths(const BSONObj& keyPattern boost::optional<ChunkVersion> CommonProcessInterface::refreshAndGetCollectionVersion( const boost::intrusive_ptr<ExpressionContext>& expCtx, const NamespaceString& nss) const { - const bool forceRefreshFromThisThread = false; - auto cm = uassertStatusOK( - Grid::get(expCtx->opCtx) - ->catalogCache() - ->getCollectionRoutingInfoWithRefresh(expCtx->opCtx, nss, forceRefreshFromThisThread)); - if (cm.isSharded()) { - return cm.getVersion(); - } - return boost::none; + const auto cm = uassertStatusOK(Grid::get(expCtx->opCtx) + ->catalogCache() + ->getCollectionRoutingInfoWithRefresh(expCtx->opCtx, nss)); + + return cm.isSharded() ? boost::make_optional(cm.getVersion()) : boost::none; } std::vector<FieldPath> CommonProcessInterface::_shardKeyToDocumentKeyFields( diff --git a/src/mongo/db/pipeline/sharded_agg_helpers.h b/src/mongo/db/pipeline/sharded_agg_helpers.h index 13a20fee607..c63ac997a32 100644 --- a/src/mongo/db/pipeline/sharded_agg_helpers.h +++ b/src/mongo/db/pipeline/sharded_agg_helpers.h @@ -245,13 +245,9 @@ auto shardVersionRetry(OperationContext* opCtx, str::stream() << "StaleConfig error on unexpected namespace. 
Expected " << nss << ", received " << staleInfo->getNss()); catalogCache->invalidateShardOrEntireCollectionEntryForShardedCollection( - opCtx, - nss, - staleInfo->getVersionWanted(), - staleInfo->getVersionReceived(), - staleInfo->getShardId()); + nss, staleInfo->getVersionWanted(), staleInfo->getShardId()); } else { - catalogCache->onEpochChange(nss); + catalogCache->invalidateCollectionEntry_LINEARIZABLE(nss); } if (!logAndTestMaxRetries(e)) { throw; diff --git a/src/mongo/db/query/SConscript b/src/mongo/db/query/SConscript index a82d443d5bf..bfb2bc1dc4c 100644 --- a/src/mongo/db/query/SConscript +++ b/src/mongo/db/query/SConscript @@ -180,6 +180,7 @@ env.Library( ], LIBDEPS=[ "$BUILD_DIR/mongo/base", + "$BUILD_DIR/mongo/db/api_parameters", "$BUILD_DIR/mongo/db/catalog/collection_catalog", # TODO: This dependency edge can be removed when the 'allowDiskUse' option no longer depends # on enabling test commands. diff --git a/src/mongo/db/query/optimizer/SConscript b/src/mongo/db/query/optimizer/SConscript index 175b109625d..0863192a593 100644 --- a/src/mongo/db/query/optimizer/SConscript +++ b/src/mongo/db/query/optimizer/SConscript @@ -8,6 +8,7 @@ env.Library( target="optimizer", source=[ "defs.cpp", + "memo.cpp", "node.cpp", ], LIBDEPS=[ diff --git a/src/mongo/db/query/optimizer/algebra/operator.h b/src/mongo/db/query/optimizer/algebra/operator.h new file mode 100644 index 00000000000..524b7246413 --- /dev/null +++ b/src/mongo/db/query/optimizer/algebra/operator.h @@ -0,0 +1,305 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. 
+ * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <vector> + +#include "mongo/db/query/optimizer/algebra/polyvalue.h" + +namespace mongo::optimizer { +namespace algebra { + +template <typename T, int S> +struct OpNodeStorage { + T _nodes[S]; + + template <typename... Ts> + OpNodeStorage(Ts&&... vals) : _nodes{std::forward<Ts>(vals)...} {} +}; + +template <typename T> +struct OpNodeStorage<T, 0> {}; + +/*=====----- + * + * Arity of operator can be: + * 1. statically known - A, A, A, ... + * 2. dynamic prefix with optional statically know - vector<A>, A, A, A, ... + * + * Denotations map A to some B. + * So static arity <A,A,A> is mapped to <B,B,B>. + * Similarly, arity <vector<A>,A> is mapped to <vector<B>,B> + * + * There is a wrinkle when B is a reference (if allowed at all) + * Arity <vector<A>, A, A> is mapped to <vector<B>&, B&, B&> - note that the reference is lifted + * outside of the vector. 
+ * + */ +template <typename Slot, typename Derived, int Arity> +class OpSpecificArity : public OpNodeStorage<Slot, Arity> { + using Base = OpNodeStorage<Slot, Arity>; + +public: + template <typename... Ts> + OpSpecificArity(Ts&&... vals) : Base({std::forward<Ts>(vals)...}) { + static_assert(sizeof...(Ts) == Arity, "constructor paramaters do not match"); + } + + template <int I, std::enable_if_t<(I >= 0 && I < Arity), int> = 0> + auto& get() noexcept { + return this->_nodes[I]; + } + + template <int I, std::enable_if_t<(I >= 0 && I < Arity), int> = 0> + const auto& get() const noexcept { + return this->_nodes[I]; + } +}; +/*=====----- + * + * Operator with dynamic arity + * + */ +template <typename Slot, typename Derived, int Arity> +class OpSpecificDynamicArity : public OpSpecificArity<Slot, Derived, Arity> { + using Base = OpSpecificArity<Slot, Derived, Arity>; + + std::vector<Slot> _dyNodes; + +public: + template <typename... Ts> + OpSpecificDynamicArity(std::vector<Slot> nodes, Ts&&... 
vals) + : Base({std::forward<Ts>(vals)...}), _dyNodes(std::move(nodes)) {} + + auto& nodes() { + return _dyNodes; + } + const auto& nodes() const { + return _dyNodes; + } +}; + +/*=====----- + * + * Semantic transport interface + * + */ +namespace detail { +template <typename D, typename T, typename = std::void_t<>> +struct has_prepare : std::false_type {}; +template <typename D, typename T> +struct has_prepare<D, T, std::void_t<decltype(std::declval<D>().prepare(std::declval<T&>()))>> + : std::true_type {}; + +template <typename D, typename T> +inline constexpr auto has_prepare_v = has_prepare<D, T>::value; + +template <typename Slot, typename Derived, int Arity> +inline constexpr int get_arity(const OpSpecificArity<Slot, Derived, Arity>*) { + return Arity; +} + +template <typename Slot, typename Derived, int Arity> +inline constexpr bool is_dynamic(const OpSpecificArity<Slot, Derived, Arity>*) { + return false; +} + +template <typename Slot, typename Derived, int Arity> +inline constexpr bool is_dynamic(const OpSpecificDynamicArity<Slot, Derived, Arity>*) { + return true; +} + +template <typename T> +using OpConcreteType = typename std::remove_reference_t<T>::template get_t<0>; +} // namespace detail + +template <typename D, bool withSlot> +class OpTransporter { + D& _domain; + + template <typename T, bool B> + struct Deducer {}; + template <typename T> + struct Deducer<T, true> { + using type = decltype(std::declval<D>().transport( + std::declval<T>(), std::declval<detail::OpConcreteType<T>&>())); + }; + template <typename T> + struct Deducer<T, false> { + using type = + decltype(std::declval<D>().transport(std::declval<detail::OpConcreteType<T>&>())); + }; + template <typename T> + using deduced_t = typename Deducer<T, withSlot>::type; + + template <typename N, typename T, typename... Ts> + auto transformStep(N&& slot, T&& op, Ts&&... 
args) { + if constexpr (withSlot) { + return _domain.transport( + std::forward<N>(slot), std::forward<T>(op), std::forward<Ts>(args)...); + } else { + return _domain.transport(std::forward<T>(op), std::forward<Ts>(args)...); + } + } + + template <typename N, typename T, size_t... I> + auto transportUnpack(N&& slot, T&& op, std::index_sequence<I...>) { + return transformStep( + std::forward<N>(slot), std::forward<T>(op), op.template get<I>().visit(*this)...); + } + template <typename N, typename T, size_t... I> + auto transportDynamicUnpack(N&& slot, T&& op, std::index_sequence<I...>) { + std::vector<decltype(slot.visit(*this))> v; + for (auto& node : op.nodes()) { + v.emplace_back(node.visit(*this)); + } + return transformStep(std::forward<N>(slot), + std::forward<T>(op), + std::move(v), + op.template get<I>().visit(*this)...); + } + template <typename N, typename T, size_t... I> + void transportUnpackVoid(N&& slot, T&& op, std::index_sequence<I...>) { + (op.template get<I>().visit(*this), ...); + return transformStep(std::forward<N>(slot), std::forward<T>(op), op.template get<I>()...); + } + template <typename N, typename T, size_t... I> + void transportDynamicUnpackVoid(N&& slot, T&& op, std::index_sequence<I...>) { + for (auto& node : op.nodes()) { + node.visit(*this); + } + (op.template get<I>().visit(*this), ...); + return transformStep( + std::forward<N>(slot), std::forward<T>(op), op.nodes(), op.template get<I>()...); + } + +public: + OpTransporter(D& domain) : _domain(domain) {} + + template <typename N, typename T, typename R = deduced_t<N>> + R operator()(N&& slot, T&& op) { + // N is either `PolyValue<Ts...>&` or `const PolyValue<Ts...>&` i.e. 
reference + // T is either `A&` or `const A&` where A is one of Ts + using type = std::remove_reference_t<T>; + + constexpr int arity = detail::get_arity(static_cast<type*>(nullptr)); + constexpr bool is_dynamic = detail::is_dynamic(static_cast<type*>(nullptr)); + + if constexpr (detail::has_prepare_v<D, type>) { + _domain.prepare(std::forward<T>(op)); + } + if constexpr (is_dynamic) { + if constexpr (std::is_same_v<R, void>) { + return transportDynamicUnpackVoid( + std::forward<N>(slot), std::forward<T>(op), std::make_index_sequence<arity>{}); + } else { + return transportDynamicUnpack( + std::forward<N>(slot), std::forward<T>(op), std::make_index_sequence<arity>{}); + } + } else { + if constexpr (std::is_same_v<R, void>) { + return transportUnpackVoid( + std::forward<N>(slot), std::forward<T>(op), std::make_index_sequence<arity>{}); + } else { + return transportUnpack( + std::forward<N>(slot), std::forward<T>(op), std::make_index_sequence<arity>{}); + } + } + } +}; + +template <typename D, bool withSlot> +class OpWalker { + D& _domain; + + template <typename N, typename T, typename... Ts> + auto walkStep(N&& slot, T&& op, Ts&&... args) { + if constexpr (withSlot) { + return _domain.walk( + std::forward<N>(slot), std::forward<T>(op), std::forward<Ts>(args)...); + } else { + return _domain.walk(std::forward<T>(op), std::forward<Ts>(args)...); + } + } + + template <typename N, typename T, typename... Args, size_t... I> + auto walkUnpack(N&& slot, T&& op, std::index_sequence<I...>, Args&&... args) { + return walkStep(std::forward<N>(slot), + std::forward<T>(op), + std::forward<Args>(args)..., + op.template get<I>()...); + } + template <typename N, typename T, typename... Args, size_t... I> + auto walkDynamicUnpack(N&& slot, T&& op, std::index_sequence<I...>, Args&&... 
args) { + return walkStep(std::forward<N>(slot), + std::forward<T>(op), + std::forward<Args>(args)..., + op.nodes(), + op.template get<I>()...); + } + +public: + OpWalker(D& domain) : _domain(domain) {} + + template <typename N, typename T, typename... Args> + auto operator()(N&& slot, T&& op, Args&&... args) { + // N is either `PolyValue<Ts...>&` or `const PolyValue<Ts...>&` i.e. reference + // T is either `A&` or `const A&` where A is one of Ts + using type = std::remove_reference_t<T>; + + constexpr int arity = detail::get_arity(static_cast<type*>(nullptr)); + constexpr bool is_dynamic = detail::is_dynamic(static_cast<type*>(nullptr)); + + if constexpr (is_dynamic) { + return walkDynamicUnpack(std::forward<N>(slot), + std::forward<T>(op), + std::make_index_sequence<arity>{}, + std::forward<Args>(args)...); + } else { + return walkUnpack(std::forward<N>(slot), + std::forward<T>(op), + std::make_index_sequence<arity>{}, + std::forward<Args>(args)...); + } + } +}; + +template <bool withSlot = false, typename D, typename N> +auto transport(N&& node, D& domain) { + return node.visit(OpTransporter<D, withSlot>{domain}); +} + +template <bool withSlot = false, typename D, typename N, typename... Args> +auto walk(N&& node, D& domain, Args&&... args) { + return node.visit(OpWalker<D, withSlot>{domain}, std::forward<Args>(args)...); +} + +} // namespace algebra +} // namespace mongo::optimizer diff --git a/src/mongo/db/query/optimizer/algebra/polyvalue.h b/src/mongo/db/query/optimizer/algebra/polyvalue.h new file mode 100644 index 00000000000..374041c5704 --- /dev/null +++ b/src/mongo/db/query/optimizer/algebra/polyvalue.h @@ -0,0 +1,381 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <array> +#include <stdexcept> +#include <type_traits> + +namespace mongo::optimizer { +namespace algebra { +namespace detail { + +template <typename T, typename... Args> +inline constexpr bool is_one_of_v = std::disjunction_v<std::is_same<T, Args>...>; + +template <typename T, typename... Args> +inline constexpr bool is_one_of_f() { + return is_one_of_v<T, Args...>; +} + +template <typename... Args> +struct is_unique_t : std::true_type {}; + +template <typename H, typename... T> +struct is_unique_t<H, T...> + : std::bool_constant<!is_one_of_f<H, T...>() && is_unique_t<T...>::value> {}; + +template <typename... 
Args> +inline constexpr bool is_unique_v = is_unique_t<Args...>::value; + +// Given the type T find its index in Ts +template <typename T, typename... Ts> +static inline constexpr int find_index() { + static_assert(detail::is_unique_v<Ts...>, "Types must be unique"); + constexpr bool matchVector[] = {std::is_same<T, Ts>::value...}; + + for (int index = 0; index < static_cast<int>(sizeof...(Ts)); ++index) { + if (matchVector[index]) { + return index; + } + } + + return -1; +} + +template <int N, typename T, typename... Ts> +struct get_type_by_index_impl { + using type = typename get_type_by_index_impl<N - 1, Ts...>::type; +}; +template <typename T, typename... Ts> +struct get_type_by_index_impl<0, T, Ts...> { + using type = T; +}; + +// Given the index I return the type from Ts +template <int I, typename... Ts> +using get_type_by_index = typename get_type_by_index_impl<I, Ts...>::type; + +} // namespace detail + +/*=====----- + * + * The overload trick to construct visitors from lambdas. + * + */ +template <class... Ts> +struct overload : Ts... { + using Ts::operator()...; +}; +template <class... Ts> +overload(Ts...)->overload<Ts...>; + +/*=====----- + * + * Forward declarations + * + */ +template <typename... Ts> +class PolyValue; + +template <typename T, typename... Ts> +class ControlBlockVTable; + +/*=====----- + * + * The base control block that PolyValue holds. + * + * It does not contain anything else by the runtime tag. + * + */ +template <typename... Ts> +class ControlBlock { + const int _tag; + +protected: + ControlBlock(int tag) noexcept : _tag(tag) {} + +public: + auto getRuntimeTag() const noexcept { + return _tag; + } +}; + +/*=====----- + * + * The concrete control block VTable generator. + * + * It must be empty ad PolyValue derives from the generators + * and we want EBO to kick in. + * + */ +template <typename T, typename... 
Ts> +class ControlBlockVTable { + static constexpr int _staticTag = detail::find_index<T, Ts...>(); + static_assert(_staticTag != -1, "Type must be on the list"); + + using AbstractType = ControlBlock<Ts...>; + using PolyValueType = PolyValue<Ts...>; + + /*=====----- + * + * The concrete control block for every type T of Ts. + * + * It derives from the ControlBlock. All methods are private and only + * the friend class ControlBlockVTable can call them. + * + */ + class ConcreteType : public AbstractType { + T _t; + + public: + template <typename... Args> + ConcreteType(Args&&... args) : AbstractType(_staticTag), _t(std::forward<Args>(args)...) {} + + const T* getPtr() const { + return &_t; + } + + T* getPtr() { + return &_t; + } + }; + + static constexpr auto concrete(AbstractType* block) noexcept { + return static_cast<ConcreteType*>(block); + } + + static constexpr auto concrete(const AbstractType* block) noexcept { + return static_cast<const ConcreteType*>(block); + } + +public: + template <typename... Args> + static AbstractType* make(Args&&... 
args) { + return new ConcreteType(std::forward<Args>(args)...); + } + + static AbstractType* clone(const AbstractType* block) { + return new ConcreteType(*concrete(block)); + } + + static void destroy(AbstractType* block) noexcept { + delete concrete(block); + } + + static bool compareEq(AbstractType* blockLhs, AbstractType* blockRhs) noexcept { + if (blockLhs->getRuntimeTag() == blockRhs->getRuntimeTag()) { + return *castConst<T>(blockLhs) == *castConst<T>(blockRhs); + } + return false; + } + + template <typename U> + static constexpr bool is_v = std::is_base_of_v<U, T>; + + template <typename U> + static U* cast(AbstractType* block) { + if constexpr (is_v<U>) { + return static_cast<U*>(concrete(block)->getPtr()); + } else { + // gcc bug 81676 + (void)block; + return nullptr; + } + } + + template <typename U> + static const U* castConst(const AbstractType* block) { + if constexpr (is_v<U>) { + return static_cast<const U*>(concrete(block)->getPtr()); + } else { + // gcc bug 81676 + (void)block; + return nullptr; + } + } + + template <typename V, typename... Args> + static auto visit(V&& v, PolyValueType& holder, AbstractType* block, Args&&... args) { + return v(holder, *cast<T>(block), std::forward<Args>(args)...); + } + + template <typename V, typename... Args> + static auto visitConst(V&& v, + const PolyValueType& holder, + const AbstractType* block, + Args&&... args) { + return v(holder, *castConst<T>(block), std::forward<Args>(args)...); + } +}; + +/*=====----- + * + * This is a variation on variant and polymorphic value theme. + * + * A tag based dispatch + * + * Supported operations: + * - construction + * - destruction + * - clone a = b; + * - cast a.cast<T>() + * - multi-method cast to common base a.cast<B>() + * - multi-method visit + */ +template <typename... Ts> +class PolyValue : private ControlBlockVTable<Ts, Ts...>... 
{ + static_assert(detail::is_unique_v<Ts...>, "Types must be unique"); + static_assert(std::conjunction_v<std::is_empty<ControlBlockVTable<Ts, Ts...>>...>, + "VTable base classes must be empty"); + + ControlBlock<Ts...>* _object{nullptr}; + + PolyValue(ControlBlock<Ts...>* object) noexcept : _object(object) {} + + auto tag() const noexcept { + return _object->getRuntimeTag(); + } + + void check() const { + if (!_object) { + throw std::logic_error("PolyValue is empty"); + } + } + + static void destroy(ControlBlock<Ts...>* object) { + static constexpr std::array destroyTbl = {&ControlBlockVTable<Ts, Ts...>::destroy...}; + + destroyTbl[object->getRuntimeTag()](object); + } + +public: + PolyValue() = delete; + + PolyValue(const PolyValue& other) { + static constexpr std::array cloneTbl = {&ControlBlockVTable<Ts, Ts...>::clone...}; + if (other._object) { + _object = cloneTbl[other.tag()](other._object); + } + } + + PolyValue(PolyValue&& other) noexcept { + swap(other); + } + + ~PolyValue() noexcept { + if (_object) { + destroy(_object); + } + } + + PolyValue& operator=(PolyValue other) noexcept { + swap(other); + return *this; + } + + template <typename T, typename... Args> + static PolyValue make(Args&&... args) { + return PolyValue{ControlBlockVTable<T, Ts...>::make(std::forward<Args>(args)...)}; + } + + template <int I> + using get_t = detail::get_type_by_index<I, Ts...>; + + template <typename V, typename... Args> + auto visit(V&& v, Args&&... args) { + // unfortunately gcc rejects much nicer code, clang and msvc accept + // static constexpr std::array visitTbl = { &ControlBlockVTable<Ts, Ts...>::template + // visit<V>... 
}; + + using FunPtrType = + decltype(&ControlBlockVTable<get_t<0>, Ts...>::template visit<V, Args...>); + static constexpr FunPtrType visitTbl[] = { + &ControlBlockVTable<Ts, Ts...>::template visit<V, Args...>...}; + + check(); + return visitTbl[tag()](std::forward<V>(v), *this, _object, std::forward<Args>(args)...); + } + + template <typename V, typename... Args> + auto visit(V&& v, Args&&... args) const { + // unfortunately gcc rejects much nicer code, clang and msvc accept + // static constexpr std::array visitTbl = { &ControlBlockVTable<Ts, Ts...>::template + // visitConst<V>... }; + + using FunPtrType = + decltype(&ControlBlockVTable<get_t<0>, Ts...>::template visitConst<V, Args...>); + static constexpr FunPtrType visitTbl[] = { + &ControlBlockVTable<Ts, Ts...>::template visitConst<V, Args...>...}; + + check(); + return visitTbl[tag()](std::forward<V>(v), *this, _object, std::forward<Args>(args)...); + } + + template <typename T> + T* cast() { + check(); + static constexpr std::array castTbl = {&ControlBlockVTable<Ts, Ts...>::template cast<T>...}; + return castTbl[tag()](_object); + } + + template <typename T> + const T* cast() const { + static constexpr std::array castTbl = { + &ControlBlockVTable<Ts, Ts...>::template castConst<T>...}; + + check(); + return castTbl[tag()](_object); + } + + template <typename T> + bool is() const { + static constexpr std::array isTbl = {ControlBlockVTable<Ts, Ts...>::template is_v<T>...}; + + check(); + return isTbl[tag()]; + } + + bool empty() const { + return !_object; + } + + void swap(PolyValue& other) noexcept { + std::swap(other._object, _object); + } + + bool operator==(const PolyValue& rhs) const noexcept { + static constexpr std::array cmp = {ControlBlockVTable<Ts, Ts...>::compareEq...}; + return cmp[tag()](_object, rhs._object); + } +}; + +} // namespace algebra +} // namespace mongo::optimizer diff --git a/src/mongo/db/query/optimizer/memo.cpp b/src/mongo/db/query/optimizer/memo.cpp new file mode 100644 index 
00000000000..c4dadbb3d5a --- /dev/null +++ b/src/mongo/db/query/optimizer/memo.cpp @@ -0,0 +1,43 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/query/optimizer/algebra/operator.h" +#include "mongo/db/query/optimizer/memo.h" +#include "mongo/db/query/optimizer/node.h" + +namespace mongo::optimizer { + +std::string MemoGenerator::generateMemo(const PolymorphicNode& e) { + _os.str(""); + _os.clear(); + algebra::transport<false>(e, *this); + return _os.str(); +} + +} // namespace mongo::optimizer diff --git a/src/mongo/db/query/optimizer/visitor.h b/src/mongo/db/query/optimizer/memo.h index 1aa0a886fab..ad3703f8fd8 100644 --- a/src/mongo/db/query/optimizer/visitor.h +++ b/src/mongo/db/query/optimizer/memo.h @@ -31,16 +31,24 @@ #include <string> +#include "mongo/db/query/optimizer/node.h" + namespace mongo::optimizer { -class AbstractVisitor { +class MemoGenerator { public: - virtual void visit(const ScanNode& node) = 0; - virtual void visit(const MultiJoinNode& node) = 0; - virtual void visit(const UnionNode& node) = 0; - virtual void visit(const GroupByNode& node) = 0; - virtual void visit(const UnwindNode& node) = 0; - virtual void visit(const WindNode& node) = 0; + template <typename T, typename... Ts> + void transport(const T&, Ts&&...) 
{} + + template <typename T> + void prepare(const T& n) { + n.generateMemo(_os); + } + + std::string generateMemo(const PolymorphicNode& e); + +private: + std::ostringstream _os; }; -} // namespace mongo::optimizer +} // namespace mongo::optimizer diff --git a/src/mongo/db/query/optimizer/node.cpp b/src/mongo/db/query/optimizer/node.cpp index 4836dcce39e..a1455efd60f 100644 --- a/src/mongo/db/query/optimizer/node.cpp +++ b/src/mongo/db/query/optimizer/node.cpp @@ -30,130 +30,19 @@ #include <functional> #include <stack> +#include "mongo/db/query/optimizer/memo.h" #include "mongo/db/query/optimizer/node.h" -#include "mongo/db/query/optimizer/visitor.h" -#include "mongo/util/assert_util.h" namespace mongo::optimizer { -Node::Node(Context& ctx) : _nodeId(ctx.getNextNodeId()), _children() {} - -Node::Node(Context& ctx, NodePtr child) : _nodeId(ctx.getNextNodeId()) { - _children.push_back(std::move(child)); -} - -Node::Node(Context& ctx, ChildVector children) - : _nodeId(ctx.getNextNodeId()), _children(std::move(children)) {} +Node::Node(Context& ctx) : _nodeId(ctx.getNextNodeId()) {} void Node::generateMemoBase(std::ostringstream& os) const { os << "NodeId: " << _nodeId << "\n"; } -void Node::visitPreOrder(AbstractVisitor& visitor) const { - visit(visitor); - for (const NodePtr& ptr : _children) { - ptr->visitPreOrder(visitor); - } -} - -void Node::visitPostOrder(AbstractVisitor& visitor) const { - for (const NodePtr& ptr : _children) { - ptr->visitPostOrder(visitor); - } - visit(visitor); -} - -std::string Node::generateMemo() const { - class MemoVisitor : public AbstractVisitor { - protected: - void visit(const ScanNode& node) override { - node.generateMemo(_os); - } - void visit(const MultiJoinNode& node) override { - node.generateMemo(_os); - } - void visit(const UnionNode& node) override { - node.generateMemo(_os); - } - void visit(const GroupByNode& node) override { - node.generateMemo(_os); - } - void visit(const UnwindNode& node) override { - 
node.generateMemo(_os); - } - void visit(const WindNode& node) override { - node.generateMemo(_os); - } - - public: - std::ostringstream _os; - }; - - MemoVisitor visitor; - visitPreOrder(visitor); - return visitor._os.str(); -} - -NodePtr Node::clone(Context& ctx) const { - class CloneVisitor : public AbstractVisitor { - public: - explicit CloneVisitor(Context& ctx) : _ctx(ctx), _childStack() {} - - protected: - void visit(const ScanNode& node) override { - doClone(node, [&](ChildVector v){ return ScanNode::clone(_ctx, node); }); - } - void visit(const MultiJoinNode& node) override { - doClone(node, [&](ChildVector v){ return MultiJoinNode::clone(_ctx, node, std::move(v)); }); - } - void visit(const UnionNode& node) override { - doClone(node, [&](ChildVector v){ return UnionNode::clone(_ctx, node, std::move(v)); }); - } - void visit(const GroupByNode& node) override { - doClone(node, [&](ChildVector v){ return GroupByNode::clone(_ctx, node, std::move(v.at(0))); }); - } - void visit(const UnwindNode& node) override { - doClone(node, [&](ChildVector v){ return UnwindNode::clone(_ctx, node, std::move(v.at(0))); }); - } - void visit(const WindNode& node) override { - doClone(node, [&](ChildVector v){ return WindNode::clone(_ctx, node, std::move(v.at(0))); }); - } - - private: - void doClone(const Node& node, const std::function<NodePtr(ChildVector newChildren)>& cloneFn) { - ChildVector newChildren; - for (int i = 0; i < node.getChildCount(); i++) { - newChildren.push_back(std::move(_childStack.top())); - _childStack.pop(); - } - _childStack.push(cloneFn(std::move(newChildren))); - } - - public: - Context& _ctx; - std::stack<NodePtr> _childStack; - }; - - CloneVisitor visitor(ctx); - visitPostOrder(visitor); - invariant(visitor._childStack.size() == 1); - return std::move(visitor._childStack.top()); -} - -int Node::getChildCount() const { - return _children.size(); -} - -NodePtr ScanNode::create(Context& ctx, CollectionNameType collectionName) { - return NodePtr(new 
ScanNode(ctx, std::move(collectionName))); -} - -NodePtr ScanNode::clone(Context& ctx, const ScanNode& other) { - return create(ctx, other._collectionName); -} - ScanNode::ScanNode(Context& ctx, CollectionNameType collectionName) - : Node(ctx), _collectionName(std::move(collectionName)) {} + : Base(), Node(ctx), _collectionName(std::move(collectionName)) {} void ScanNode::generateMemo(std::ostringstream& os) const { Node::generateMemoBase(os); @@ -161,27 +50,12 @@ void ScanNode::generateMemo(std::ostringstream& os) const { << "\n"; } -void ScanNode::visit(AbstractVisitor& visitor) const { - visitor.visit(*this); -} - -NodePtr MultiJoinNode::create(Context& ctx, - FilterSet filterSet, - ProjectionMap projectionMap, - ChildVector children) { - return NodePtr(new MultiJoinNode( - ctx, std::move(filterSet), std::move(projectionMap), std::move(children))); -} - -NodePtr MultiJoinNode::clone(Context& ctx, const MultiJoinNode& other, ChildVector newChildren) { - return create(ctx, other._filterSet, other._projectionMap, std::move(newChildren)); -} - MultiJoinNode::MultiJoinNode(Context& ctx, FilterSet filterSet, ProjectionMap projectionMap, - ChildVector children) - : Node(ctx, std::move(children)), + PolymorphicNodeVector children) + : Base(std::move(children)), + Node(ctx), _filterSet(std::move(filterSet)), _projectionMap(std::move(projectionMap)) {} @@ -191,20 +65,8 @@ void MultiJoinNode::generateMemo(std::ostringstream& os) const { << "\n"; } -void MultiJoinNode::visit(AbstractVisitor& visitor) const { - visitor.visit(*this); -} - -NodePtr UnionNode::create(Context& ctx, ChildVector children) { - return NodePtr(new UnionNode(ctx, std::move(children))); -} - -NodePtr UnionNode::clone(Context& ctx, const UnionNode& other, ChildVector newChildren) { - return create(ctx, std::move(newChildren)); -} - -UnionNode::UnionNode(Context& ctx, ChildVector children) - : Node(ctx, std::move(children)) {} +UnionNode::UnionNode(Context& ctx, PolymorphicNodeVector children) + : 
Base(std::move(children)), Node(ctx) {} void UnionNode::generateMemo(std::ostringstream& os) const { Node::generateMemoBase(os); @@ -212,27 +74,12 @@ void UnionNode::generateMemo(std::ostringstream& os) const { << "\n"; } -void UnionNode::visit(AbstractVisitor& visitor) const { - visitor.visit(*this); -} - -NodePtr GroupByNode::create(Context& ctx, - GroupByNode::GroupByVector groupByVector, - GroupByNode::ProjectionMap projectionMap, - NodePtr child) { - return NodePtr( - new GroupByNode(ctx, std::move(groupByVector), std::move(projectionMap), std::move(child))); -} - -NodePtr GroupByNode::clone(Context& ctx, const GroupByNode& other, NodePtr newChild) { - return create(ctx, other._groupByVector, other._projectionMap, std::move(newChild)); -} - GroupByNode::GroupByNode(Context& ctx, GroupByNode::GroupByVector groupByVector, GroupByNode::ProjectionMap projectionMap, - NodePtr child) - : Node(ctx, std::move(child)), + PolymorphicNode child) + : Base(std::move(child)), + Node(ctx), _groupByVector(std::move(groupByVector)), _projectionMap(std::move(projectionMap)) {} @@ -242,27 +89,12 @@ void GroupByNode::generateMemo(std::ostringstream& os) const { << "\n"; } -void GroupByNode::visit(AbstractVisitor& visitor) const { - visitor.visit(*this); -} - -NodePtr UnwindNode::create(Context& ctx, - ProjectionName projectionName, - const bool retainNonArrays, - NodePtr child) { - return NodePtr( - new UnwindNode(ctx, std::move(projectionName), retainNonArrays, std::move(child))); -} - -NodePtr UnwindNode::clone(Context& ctx, const UnwindNode& other, NodePtr newChild) { - return create(ctx, other._projectionName, other._retainNonArrays, std::move(newChild)); -} - UnwindNode::UnwindNode(Context& ctx, ProjectionName projectionName, const bool retainNonArrays, - NodePtr child) - : Node(ctx, std::move(child)), + PolymorphicNode child) + : Base(std::move(child)), + Node(ctx), _projectionName(std::move(projectionName)), _retainNonArrays(retainNonArrays) {} @@ -272,20 +104,8 @@ void 
UnwindNode::generateMemo(std::ostringstream& os) const { << "\n"; } -void UnwindNode::visit(AbstractVisitor& visitor) const { - visitor.visit(*this); -} - -NodePtr WindNode::create(Context& ctx, ProjectionName projectionName, NodePtr child) { - return NodePtr(new WindNode(ctx, std::move(projectionName), std::move(child))); -} - -NodePtr WindNode::clone(Context& ctx, const WindNode& other, NodePtr newChild) { - return create(ctx, other._projectionName, std::move(newChild)); -} - -WindNode::WindNode(Context& ctx, ProjectionName projectionName, NodePtr child) - : Node(ctx, std::move(child)), _projectionName(std::move(projectionName)) {} +WindNode::WindNode(Context& ctx, ProjectionName projectionName, PolymorphicNode child) + : Base(std::move(child)), Node(ctx), _projectionName(std::move(projectionName)) {} void WindNode::generateMemo(std::ostringstream& os) const { Node::generateMemoBase(os); @@ -293,8 +113,4 @@ void WindNode::generateMemo(std::ostringstream& os) const { << "\n"; } -void WindNode::visit(AbstractVisitor& visitor) const { - visitor.visit(*this); -} - } // namespace mongo::optimizer diff --git a/src/mongo/db/query/optimizer/node.h b/src/mongo/db/query/optimizer/node.h index 78010d7d333..33215f967e0 100644 --- a/src/mongo/db/query/optimizer/node.h +++ b/src/mongo/db/query/optimizer/node.h @@ -37,6 +37,7 @@ #include <utility> #include <vector> +#include "mongo/db/query/optimizer/algebra/operator.h" #include "mongo/db/query/optimizer/defs.h" #include "mongo/db/query/optimizer/filter.h" #include "mongo/db/query/optimizer/projection.h" @@ -45,156 +46,137 @@ namespace mongo::optimizer { -class Node; -using NodePtr = std::unique_ptr<Node>; -class AbstractVisitor; +class ScanNode; +class MultiJoinNode; +class UnionNode; +class GroupByNode; +class UnwindNode; +class WindNode; -class Node { -public: - using ChildVector = std::vector<NodePtr>; +using PolymorphicNode = + algebra::PolyValue<ScanNode, MultiJoinNode, UnionNode, GroupByNode, UnwindNode, WindNode>; + 
+template <typename Derived, size_t Arity> +using Operator = algebra::OpSpecificArity<PolymorphicNode, Derived, Arity>; + +template <typename Derived, size_t Arity> +using OperatorDynamic = algebra::OpSpecificDynamicArity<PolymorphicNode, Derived, Arity>; + +template <typename Derived> +using OperatorDynamicHomogenous = OperatorDynamic<Derived, 0>; + +using PolymorphicNodeVector = std::vector<PolymorphicNode>; + +template <typename T, typename... Args> +inline auto make(Args&&... args) { + return PolymorphicNode::make<T>(std::forward<Args>(args)...); +} +template <typename... Args> +inline auto makeSeq(Args&&... args) { + PolymorphicNodeVector seq; + (seq.emplace_back(std::forward<Args>(args)), ...); + return seq; +} + +class Node { protected: explicit Node(Context& ctx); - explicit Node(Context& ctx, NodePtr child); - explicit Node(Context& ctx, ChildVector children); void generateMemoBase(std::ostringstream& os) const; - virtual void visit(AbstractVisitor& visitor) const = 0; - void visitPreOrder(AbstractVisitor& visitor) const; - void visitPostOrder(AbstractVisitor& visitor) const; - - // clone public: Node() = delete; - std::string generateMemo() const; - - NodePtr clone(Context& ctx) const; - - int getChildCount() const; - private: const NodeIdType _nodeId; - ChildVector _children; }; -class ScanNode : public Node { +class ScanNode final : public Operator<ScanNode, 0>, public Node { + using Base = Operator<ScanNode, 0>; + public: - static NodePtr create(Context& ctx, CollectionNameType collectionName); - static NodePtr clone(Context& ctx, const ScanNode& other); + explicit ScanNode(Context& ctx, CollectionNameType collectionName); void generateMemo(std::ostringstream& os) const; -protected: - void visit(AbstractVisitor& visitor) const override; - private: - explicit ScanNode(Context& ctx, CollectionNameType collectionName); - const CollectionNameType _collectionName; }; -class MultiJoinNode : public Node { +class MultiJoinNode final : public 
OperatorDynamicHomogenous<MultiJoinNode>, public Node { + using Base = OperatorDynamicHomogenous<MultiJoinNode>; + public: using FilterSet = std::unordered_set<FilterType>; using ProjectionMap = std::unordered_map<ProjectionName, ProjectionType>; - static NodePtr create(Context& ctx, - FilterSet filterSet, - ProjectionMap projectionMap, - ChildVector children); - static NodePtr clone(Context& ctx, const MultiJoinNode& other, ChildVector newChildren); - - void generateMemo(std::ostringstream& os) const; - -protected: - void visit(AbstractVisitor& visitor) const override; - -private: explicit MultiJoinNode(Context& ctx, FilterSet filterSet, ProjectionMap projectionMap, - ChildVector children); + PolymorphicNodeVector children); + void generateMemo(std::ostringstream& os) const; + +private: FilterSet _filterSet; ProjectionMap _projectionMap; }; -class UnionNode : public Node { +class UnionNode final : public OperatorDynamicHomogenous<UnionNode>, public Node { + using Base = OperatorDynamicHomogenous<UnionNode>; + public: - static NodePtr create(Context& ctx, ChildVector children); - static NodePtr clone(Context& ctx, const UnionNode& other, ChildVector newChildren); + explicit UnionNode(Context& ctx, PolymorphicNodeVector children); void generateMemo(std::ostringstream& os) const; - -protected: - void visit(AbstractVisitor& visitor) const override; - -private: - explicit UnionNode(Context& ctx, ChildVector children); }; -class GroupByNode : public Node { +class GroupByNode : public Operator<GroupByNode, 1>, public Node { + using Base = Operator<GroupByNode, 1>; + public: using GroupByVector = std::vector<ProjectionName>; using ProjectionMap = std::unordered_map<ProjectionName, ProjectionType>; - static NodePtr create(Context& ctx, - GroupByVector groupByVector, - ProjectionMap projectionMap, - NodePtr child); - static NodePtr clone(Context& ctx, const GroupByNode& other, NodePtr newChild); - - void generateMemo(std::ostringstream& os) const; - -protected: - void 
visit(AbstractVisitor& visitor) const override; - -private: explicit GroupByNode(Context& ctx, GroupByVector groupByVector, ProjectionMap projectionMap, - NodePtr child); + PolymorphicNode child); + + void generateMemo(std::ostringstream& os) const; +private: GroupByVector _groupByVector; ProjectionMap _projectionMap; }; -class UnwindNode : public Node { +class UnwindNode final : public Operator<UnwindNode, 1>, public Node { + using Base = Operator<UnwindNode, 1>; + public: - static NodePtr create(Context& ctx, - ProjectionName projectionName, - bool retainNonArrays, - NodePtr child); - static NodePtr clone(Context& ctx, const UnwindNode& other, NodePtr newChild); + explicit UnwindNode(Context& ctx, + ProjectionName projectionName, + bool retainNonArrays, + PolymorphicNode child); void generateMemo(std::ostringstream& os) const; -protected: - void visit(AbstractVisitor& visitor) const override; - private: - UnwindNode(Context& ctx, ProjectionName projectionName, bool retainNonArrays, NodePtr child); - const ProjectionName _projectionName; const bool _retainNonArrays; }; -class WindNode : public Node { +class WindNode final : public Operator<WindNode, 1>, public Node { + using Base = Operator<WindNode, 1>; + public: - static NodePtr create(Context& ctx, ProjectionName projectionName, NodePtr child); - static NodePtr clone(Context& ctx, const WindNode& other, NodePtr newChild); + explicit WindNode(Context& ctx, ProjectionName projectionName, PolymorphicNode child); void generateMemo(std::ostringstream& os) const; -protected: - void visit(AbstractVisitor& visitor) const override; - private: - WindNode(Context& ctx, ProjectionName projectionName, NodePtr child); - const ProjectionName _projectionName; }; diff --git a/src/mongo/db/query/optimizer/optimizer_test.cpp b/src/mongo/db/query/optimizer/optimizer_test.cpp index 86966e05a7e..f1cffe77303 100644 --- a/src/mongo/db/query/optimizer/optimizer_test.cpp +++ b/src/mongo/db/query/optimizer/optimizer_test.cpp @@ -27,6 
+27,7 @@ * it in the license file. */ +#include "mongo/db/query/optimizer/memo.h" #include "mongo/db/query/optimizer/node.h" #include "mongo/unittest/unittest.h" @@ -35,15 +36,20 @@ namespace { TEST(Optimizer, Basic) { Context ctx; + MemoGenerator gen; - NodePtr ptrScan = ScanNode::create(ctx, "test"); - Node::ChildVector v; - v.push_back(std::move(ptrScan)); - NodePtr ptrJoin = MultiJoinNode::create(ctx, {}, {}, std::move(v)); - ASSERT_EQ("NodeId: 1\nMultiJoin\nNodeId: 0\nScan\n", ptrJoin->generateMemo()); + PolymorphicNode scanNode = make<ScanNode>(ctx, "test"); + ASSERT_EQ("NodeId: 0\nScan\n", gen.generateMemo(scanNode)); - NodePtr cloned = ptrJoin->clone(ctx); - ASSERT_EQ("NodeId: 3\nMultiJoin\nNodeId: 2\nScan\n", cloned->generateMemo()); + PolymorphicNode joinNode = make<MultiJoinNode>(ctx, + MultiJoinNode::FilterSet{}, + MultiJoinNode::ProjectionMap{}, + makeSeq(std::move(scanNode))); + ASSERT_EQ("NodeId: 1\nMultiJoin\nNodeId: 0\nScan\n", gen.generateMemo(joinNode)); + + + PolymorphicNode cloned = joinNode; + ASSERT_EQ("NodeId: 1\nMultiJoin\nNodeId: 0\nScan\n", gen.generateMemo(cloned)); } } // namespace diff --git a/src/mongo/db/query/sbe_stage_builder_coll_scan.cpp b/src/mongo/db/query/sbe_stage_builder_coll_scan.cpp index 05f9bcefb96..1a338abf238 100644 --- a/src/mongo/db/query/sbe_stage_builder_coll_scan.cpp +++ b/src/mongo/db/query/sbe_stage_builder_coll_scan.cpp @@ -41,6 +41,7 @@ #include "mongo/db/exec/sbe/stages/loop_join.h" #include "mongo/db/exec/sbe/stages/project.h" #include "mongo/db/exec/sbe/stages/scan.h" +#include "mongo/db/exec/sbe/stages/union.h" #include "mongo/db/query/sbe_stage_builder_filter.h" #include "mongo/db/query/util/make_data_structure.h" #include "mongo/db/storage/oplog_hack.h" @@ -330,20 +331,63 @@ generateGenericCollScan(const Collection* collection, // Check if the scan should be started after the provided resume RecordId and construct a nested // loop join sub-tree to project out the resume RecordId as a seekRecordIdSlot and 
feed it to - // the inner side (scan). - // - // Note that we also inject a 'skip 1' stage on top of the inner branch, as we need to start - // _after_ the resume RecordId. - // - // TODO SERVER-48472: raise KeyNotFound error if we cannot position the cursor on - // seekRecordIdSlot. + // the inner side (scan). We will also construct a union sub-tree as an outer side of the loop + // join to implement the check that the record we're trying to reposition the scan exists. if (seekRecordIdSlot && !isTailableResumeBranch) { + // Project out the RecordId we want to resume from as 'seekSlot'. + auto seekSlot = slotIdGenerator->generate(); + auto projStage = sbe::makeProjectStage( + sbe::makeS<sbe::LimitSkipStage>(sbe::makeS<sbe::CoScanStage>(), 1, boost::none), + seekSlot, + sbe::makeE<sbe::EConstant>(sbe::value::TypeTags::NumberInt64, + csn->resumeAfterRecordId->repr())); + + // Construct a 'seek' branch of the 'union'. If we're succeeded to reposition the cursor, + // the branch will output the 'seekSlot' to start the real scan from, otherwise it will + // produce EOF. + auto seekBranch = + sbe::makeS<sbe::LoopJoinStage>(std::move(projStage), + sbe::makeS<sbe::ScanStage>(nss, + boost::none, + boost::none, + std::vector<std::string>{}, + sbe::makeSV(), + seekSlot, + forward, + yieldPolicy, + tracker), + + sbe::makeSV(seekSlot), + sbe::makeSV(seekSlot), + nullptr); + + // Construct a 'fail' branch of the union. The 'unusedSlot' is needed as each union branch + // must have the same number of slots, and we use just one in the 'seek' branch above. This + // branch will only be executed if the 'seek' branch produces EOF, which can only happen if + // if the seek did not find the record id specified in $_resumeAfter. 
+ auto unusedSlot = slotIdGenerator->generate(); + auto failBranch = sbe::makeProjectStage( + sbe::makeS<sbe::CoScanStage>(), + unusedSlot, + sbe::makeE<sbe::EFail>( + ErrorCodes::KeyNotFound, + str::stream() << "Failed to resume collection scan: the recordId from which we are " + << "attempting to resume no longer exists in the collection: " + << csn->resumeAfterRecordId)); + + // Construct a union stage from the 'seek' and 'fail' branches. Note that this stage will + // ever produce a single call to getNext() due to a 'limit 1' sitting on top of it. + auto unionStage = sbe::makeS<sbe::UnionStage>( + make_vector<std::unique_ptr<sbe::PlanStage>>(std::move(seekBranch), + std::move(failBranch)), + std::vector<sbe::value::SlotVector>{sbe::makeSV(seekSlot), sbe::makeSV(unusedSlot)}, + sbe::makeSV(*seekRecordIdSlot)); + + // Construct the final loop join. Note that we also inject a 'skip 1' stage on top of the + // inner branch, as we need to start _after_ the resume RecordId, and a 'limit 1' stage on + // top of the outer branch, as it should produce just a single seek recordId. 
stage = sbe::makeS<sbe::LoopJoinStage>( - sbe::makeProjectStage( - sbe::makeS<sbe::LimitSkipStage>(sbe::makeS<sbe::CoScanStage>(), 1, boost::none), - *seekRecordIdSlot, - sbe::makeE<sbe::EConstant>(sbe::value::TypeTags::NumberInt64, - csn->resumeAfterRecordId->repr())), + sbe::makeS<sbe::LimitSkipStage>(std::move(unionStage), 1, boost::none), sbe::makeS<sbe::LimitSkipStage>(std::move(stage), boost::none, 1), sbe::makeSV(), sbe::makeSV(*seekRecordIdSlot), diff --git a/src/mongo/db/query/sbe_stage_builder_expression.cpp b/src/mongo/db/query/sbe_stage_builder_expression.cpp index b17164df951..22c03198b9b 100644 --- a/src/mongo/db/query/sbe_stage_builder_expression.cpp +++ b/src/mongo/db/query/sbe_stage_builder_expression.cpp @@ -883,13 +883,103 @@ public: _context->pushExpr( sbe::makeE<sbe::ELocalBind>(frameId, std::move(binds), std::move(absExpr))); } + void visit(ExpressionAdd* expr) final { - _context->ensureArity(2); - auto rhs = _context->popExpr(); - auto lhs = _context->popExpr(); - _context->pushExpr( - sbe::makeE<sbe::EPrimBinary>(sbe::EPrimBinary::add, std::move(lhs), std::move(rhs))); + size_t arity = expr->getChildren().size(); + _context->ensureArity(arity); + auto frameId = _context->frameIdGenerator->generate(); + + + auto generateNotNumberOrDate = [frameId](const sbe::value::SlotId slotId) { + sbe::EVariable var{frameId, slotId}; + return sbe::makeE<sbe::EPrimBinary>( + sbe::EPrimBinary::logicAnd, + sbe::makeE<sbe::EPrimUnary>( + sbe::EPrimUnary::logicNot, + sbe::makeE<sbe::EFunction>("isNumber", sbe::makeEs(var.clone()))), + sbe::makeE<sbe::EPrimUnary>( + sbe::EPrimUnary::logicNot, + sbe::makeE<sbe::EFunction>("isDate", sbe::makeEs(var.clone())))); + }; + + if (arity == 2) { + auto rhs = _context->popExpr(); + auto lhs = _context->popExpr(); + auto binds = sbe::makeEs(std::move(lhs), std::move(rhs)); + sbe::EVariable lhsVar{frameId, 0}; + sbe::EVariable rhsVar{frameId, 1}; + + auto addExpr = sbe::makeE<sbe::EIf>( + 
sbe::makeE<sbe::EPrimBinary>(sbe::EPrimBinary::logicOr, + generateNullOrMissing(frameId, 0), + generateNullOrMissing(frameId, 1)), + sbe::makeE<sbe::EConstant>(sbe::value::TypeTags::Null, 0), + sbe::makeE<sbe::EIf>( + sbe::makeE<sbe::EPrimBinary>(sbe::EPrimBinary::logicOr, + generateNotNumberOrDate(0), + generateNotNumberOrDate(1)), + sbe::makeE<sbe::EFail>( + ErrorCodes::Error{4974201}, + "only numbers and dates are allowed in an $add expression"), + sbe::makeE<sbe::EIf>( + sbe::makeE<sbe::EPrimBinary>( + sbe::EPrimBinary::logicAnd, + sbe::makeE<sbe::EFunction>("isDate", sbe::makeEs(lhsVar.clone())), + sbe::makeE<sbe::EFunction>("isDate", sbe::makeEs(rhsVar.clone()))), + sbe::makeE<sbe::EFail>(ErrorCodes::Error{4974202}, + "only one date allowed in an $add expression"), + sbe::makeE<sbe::EPrimBinary>( + sbe::EPrimBinary::add, lhsVar.clone(), rhsVar.clone())))); + + _context->pushExpr( + sbe::makeE<sbe::ELocalBind>(frameId, std::move(binds), std::move(addExpr))); + } else { + std::vector<std::unique_ptr<sbe::EExpression>> binds; + for (size_t i = 0; i < arity; i++) { + binds.push_back(_context->popExpr()); + } + std::reverse(std::begin(binds), std::end(binds)); + + std::vector<std::unique_ptr<sbe::EExpression>> checkExprsNull; + std::vector<std::unique_ptr<sbe::EExpression>> checkExprsNotNumberOrDate; + std::vector<std::unique_ptr<sbe::EExpression>> argVars; + for (size_t idx = 0; idx < arity; idx++) { + checkExprsNull.push_back(generateNullOrMissing(frameId, idx)); + checkExprsNotNumberOrDate.push_back(generateNotNumberOrDate(idx)); + argVars.push_back(sbe::makeE<sbe::EVariable>(frameId, idx)); + } + + using iter_t = std::vector<std::unique_ptr<sbe::EExpression>>::iterator; + auto checkNullAllArguments = + std::accumulate(std::move_iterator<iter_t>(checkExprsNull.begin() + 1), + std::move_iterator<iter_t>(checkExprsNull.end()), + std::move(checkExprsNull.front()), + [](auto&& acc, auto&& ex) { + return sbe::makeE<sbe::EPrimBinary>( + sbe::EPrimBinary::logicOr, 
std::move(acc), std::move(ex)); + }); + auto checkNotNumberOrDateAllArguments = + std::accumulate(std::move_iterator<iter_t>(checkExprsNotNumberOrDate.begin() + 1), + std::move_iterator<iter_t>(checkExprsNotNumberOrDate.end()), + std::move(checkExprsNotNumberOrDate.front()), + [](auto&& acc, auto&& ex) { + return sbe::makeE<sbe::EPrimBinary>( + sbe::EPrimBinary::logicOr, std::move(acc), std::move(ex)); + }); + auto addExpr = sbe::makeE<sbe::EIf>( + std::move(checkNullAllArguments), + sbe::makeE<sbe::EConstant>(sbe::value::TypeTags::Null, 0), + sbe::makeE<sbe::EIf>( + std::move(checkNotNumberOrDateAllArguments), + sbe::makeE<sbe::EFail>( + ErrorCodes::Error{4974203}, + "only numbers and dates are allowed in an $add expression"), + sbe::makeE<sbe::EFunction>("doubleDoubleSum", std::move(argVars)))); + _context->pushExpr( + sbe::makeE<sbe::ELocalBind>(frameId, std::move(binds), std::move(addExpr))); + } } + void visit(ExpressionAllElementsTrue* expr) final { unsupportedExpression(expr->getOpName()); } diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index eeddbed7948..0ef88b0a410 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1232,6 +1232,7 @@ env.Library( '$BUILD_DIR/mongo/db/repl/speculative_authenticate', '$BUILD_DIR/mongo/db/stats/counters', '$BUILD_DIR/mongo/transport/message_compressor', + 'primary_only_service', 'replication_auth', 'split_horizon', ], diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 77daa595256..a9c8b47e61a 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -700,8 +700,9 @@ void BackgroundSync::_runRollback(OperationContext* opCtx, ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(opCtx->lockState()); - // Explicitly start future read transactions without a timestamp. - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + // Ensure future transactions read without a timestamp. 
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. @@ -878,8 +879,9 @@ void BackgroundSync::start(OperationContext* opCtx) { OpTime lastAppliedOpTime; ShouldNotConflictWithSecondaryBatchApplicationBlock noConflict(opCtx->lockState()); - // Explicitly start future read transactions without a timestamp. - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + // Ensure future transactions read without a timestamp. + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); do { lastAppliedOpTime = _readLastAppliedOpTime(opCtx); diff --git a/src/mongo/db/repl/collection_bulk_loader_impl.cpp b/src/mongo/db/repl/collection_bulk_loader_impl.cpp index eab00297cdd..23fce736413 100644 --- a/src/mongo/db/repl/collection_bulk_loader_impl.cpp +++ b/src/mongo/db/repl/collection_bulk_loader_impl.cpp @@ -278,7 +278,7 @@ Status CollectionBulkLoaderImpl::commit() { status = _idIndexBlock->drainBackgroundWrites( _opCtx.get(), - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, _nss.isSystemDotViews() ? IndexBuildInterceptor::DrainYieldPolicy::kNoYield : IndexBuildInterceptor::DrainYieldPolicy::kYield); if (!status.isOK()) { diff --git a/src/mongo/db/repl/oplog_applier_impl.cpp b/src/mongo/db/repl/oplog_applier_impl.cpp index 24ff5ad96d6..f769fd14c6d 100644 --- a/src/mongo/db/repl/oplog_applier_impl.cpp +++ b/src/mongo/db/repl/oplog_applier_impl.cpp @@ -779,8 +779,9 @@ Status OplogApplierImpl::applyOplogBatchPerWorker(OperationContext* opCtx, // destroyed by unstash in its destructor. Thus we set the flag explicitly. opCtx->lockState()->setShouldConflictWithSecondaryBatchApplication(false); - // Explicitly start future read transactions without a timestamp. 
- opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + // Ensure future transactions read without a timestamp. + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); // When querying indexes, we return the record matching the key if it exists, or an adjacent // document. This means that it is possible for us to hit a prepare conflict if we query for an diff --git a/src/mongo/db/repl/oplog_batcher.cpp b/src/mongo/db/repl/oplog_batcher.cpp index 99f7077519d..efd257d26d8 100644 --- a/src/mongo/db/repl/oplog_batcher.cpp +++ b/src/mongo/db/repl/oplog_batcher.cpp @@ -121,13 +121,6 @@ bool isUnpreparedCommit(const OplogEntry& entry) { * the final oplog entry in the transaction is processed individually, since the operations are not * actually run until the commit operation is reached. * - * Oplog entries on 'system.views' should also be processed one at a time. View catalog immediately - * reflects changes for each oplog entry so we can see inconsistent view catalog if multiple oplog - * entries on 'system.views' are being applied out of the original order. - * - * Process updates to 'admin.system.version' individually as well so the secondary's FCV when - * processing each operation matches the primary's when committing that operation. - * * The ends of large transactions (> 16MB) should also be processed immediately on its own in order * to avoid scenarios where parts of the transaction is batched with other operations not in the * transaction. 
@@ -143,8 +136,7 @@ bool OplogBatcher::mustProcessIndividually(const OplogEntry& entry) { } const auto nss = entry.getNss(); - return nss.isSystemDotViews() || nss.isServerConfigurationCollection() || - nss.isPrivilegeCollection(); + return nss.mustBeAppliedInOwnOplogBatch(); } std::size_t OplogBatcher::getOpCount(const OplogEntry& entry) { @@ -355,12 +347,6 @@ std::size_t getBatchLimitOplogEntries() { std::size_t getBatchLimitOplogBytes(OperationContext* opCtx, StorageInterface* storageInterface) { // We can't change the timestamp source within a write unit of work. invariant(!opCtx->lockState()->inAWriteUnitOfWork()); - // We're only reading oplog metadata, so the timestamp is not important. If we read with the - // default (which is lastApplied on secondaries), we may end up with a reader that is at - // lastApplied. If we then roll back, then when we reconstruct prepared transactions during - // rollback recovery we will be preparing transactions before the read timestamp, which triggers - // an assertion in WiredTiger. 
- ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp); auto oplogMaxSizeResult = storageInterface->getOplogMaxSize(opCtx); auto oplogMaxSize = fassert(40301, oplogMaxSizeResult); return std::min(oplogMaxSize / 10, std::size_t(replBatchLimitBytes.load())); diff --git a/src/mongo/db/repl/primary_only_service.cpp b/src/mongo/db/repl/primary_only_service.cpp index acf79190d1b..ab7f25ec861 100644 --- a/src/mongo/db/repl/primary_only_service.cpp +++ b/src/mongo/db/repl/primary_only_service.cpp @@ -227,9 +227,21 @@ void PrimaryOnlyServiceRegistry::onStepDown() { } } +void PrimaryOnlyServiceRegistry::reportServiceInfo(BSONObjBuilder* result) { + BSONObjBuilder subBuilder(result->subobjStart("primaryOnlyServices")); + for (auto& service : _servicesByName) { + subBuilder.appendNumber(service.first, service.second->getNumberOfInstances()); + } +} + PrimaryOnlyService::PrimaryOnlyService(ServiceContext* serviceContext) : _serviceContext(serviceContext) {} +size_t PrimaryOnlyService::getNumberOfInstances() { + stdx::lock_guard lk(_mutex); + return _instances.size(); +} + bool PrimaryOnlyService::isRunning() const { stdx::lock_guard lk(_mutex); return _state == State::kRunning; diff --git a/src/mongo/db/repl/primary_only_service.h b/src/mongo/db/repl/primary_only_service.h index 0d57cf96d9b..664ecd9fa11 100644 --- a/src/mongo/db/repl/primary_only_service.h +++ b/src/mongo/db/repl/primary_only_service.h @@ -217,6 +217,11 @@ public: */ bool isRunning() const; + /** + * Returns the number of currently running Instances of this service. + */ + size_t getNumberOfInstances(); + protected: /** * Constructs a new Instance object with the given initial state. @@ -328,6 +333,12 @@ public: */ PrimaryOnlyService* lookupServiceByNamespace(const NamespaceString& ns); + /** + * Adds a 'primaryOnlyServices' sub-obj to the 'result' BSONObjBuilder containing a count of the + * number of active instances for each registered service. 
+ */ + void reportServiceInfo(BSONObjBuilder* result); + void onStartup(OperationContext*) final; void onShutdown() final; void onStepUpBegin(OperationContext*, long long term) final {} diff --git a/src/mongo/db/repl/primary_only_service_test.cpp b/src/mongo/db/repl/primary_only_service_test.cpp index d89005e8b1d..d7c76b6b7cb 100644 --- a/src/mongo/db/repl/primary_only_service_test.cpp +++ b/src/mongo/db/repl/primary_only_service_test.cpp @@ -367,6 +367,40 @@ TEST_F(PrimaryOnlyServiceTest, DoubleCreateInstance) { TestServiceHangDuringInitialization.setMode(FailPoint::off); } +TEST_F(PrimaryOnlyServiceTest, ReportServiceInfo) { + { + BSONObjBuilder resultBuilder; + _registry->reportServiceInfo(&resultBuilder); + + ASSERT_BSONOBJ_EQ(BSON("primaryOnlyServices" << BSON("TestService" << 0)), + resultBuilder.obj()); + } + + // Make sure the instance doesn't complete. + TestServiceHangDuringInitialization.setMode(FailPoint::alwaysOn); + auto instance = TestService::Instance::getOrCreate(_service, BSON("_id" << 0 << "state" << 0)); + + { + BSONObjBuilder resultBuilder; + _registry->reportServiceInfo(&resultBuilder); + + ASSERT_BSONOBJ_EQ(BSON("primaryOnlyServices" << BSON("TestService" << 1)), + resultBuilder.obj()); + } + + auto instance2 = TestService::Instance::getOrCreate(_service, BSON("_id" << 1 << "state" << 0)); + + { + BSONObjBuilder resultBuilder; + _registry->reportServiceInfo(&resultBuilder); + + ASSERT_BSONOBJ_EQ(BSON("primaryOnlyServices" << BSON("TestService" << 2)), + resultBuilder.obj()); + } + + TestServiceHangDuringInitialization.setMode(FailPoint::off); +} + TEST_F(PrimaryOnlyServiceTest, CreateWhenNotPrimary) { _registry->onStepDown(); diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 3cfe7be562b..3f03ceb9d28 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ 
b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -357,7 +357,8 @@ void ReplicationCoordinatorExternalStateImpl::clearAppliedThroughIfCleanShutdown // Ensure that all writes are visible before reading. If we failed mid-batch, it would be // possible to read from a kNoOverlap ReadSource where not all writes to the minValid document // are visible, generating a writeConflict that would not resolve. - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); auto loadLastOpTimeAndWallTimeResult = loadLastOpTimeAndWallTime(opCtx); if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() && diff --git a/src/mongo/db/repl/replication_info.cpp b/src/mongo/db/repl/replication_info.cpp index ec551d390ea..188de5e8d16 100644 --- a/src/mongo/db/repl/replication_info.cpp +++ b/src/mongo/db/repl/replication_info.cpp @@ -50,6 +50,7 @@ #include "mongo/db/ops/write_ops.h" #include "mongo/db/query/internal_plans.h" #include "mongo/db/repl/is_master_response.h" +#include "mongo/db/repl/primary_only_service.h" #include "mongo/db/repl/replication_auth.h" #include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/repl/replication_process.h" @@ -86,12 +87,17 @@ constexpr auto kHelloString = "hello"_sd; constexpr auto kCamelCaseIsMasterString = "isMaster"_sd; constexpr auto kLowerCaseIsMasterString = "ismaster"_sd; +void appendPrimaryOnlyServiceInfo(ServiceContext* serviceContext, BSONObjBuilder* result) { + auto registry = PrimaryOnlyServiceRegistry::get(serviceContext); + registry->reportServiceInfo(result); +} + /** * Appends replication-related fields to the isMaster response. Returns the topology version that * was included in the response. 
*/ TopologyVersion appendReplicationInfo(OperationContext* opCtx, - BSONObjBuilder& result, + BSONObjBuilder* result, bool appendReplicationProcess, bool useLegacyResponseFields, boost::optional<TopologyVersion> clientTopologyVersion, @@ -108,9 +114,9 @@ TopologyVersion appendReplicationInfo(OperationContext* opCtx, } auto isMasterResponse = replCoord->awaitIsMasterResponse(opCtx, horizonParams, clientTopologyVersion, deadline); - result.appendElements(isMasterResponse->toBSON(useLegacyResponseFields)); + result->appendElements(isMasterResponse->toBSON(useLegacyResponseFields)); if (appendReplicationProcess) { - replCoord->appendSlaveInfoData(&result); + replCoord->appendSlaveInfoData(result); } invariant(isMasterResponse->getTopologyVersion()); return isMasterResponse->getTopologyVersion().get(); @@ -142,10 +148,10 @@ TopologyVersion appendReplicationInfo(OperationContext* opCtx, opCtx->sleepFor(Milliseconds(*maxAwaitTimeMS)); } - result.appendBool((useLegacyResponseFields ? "ismaster" : "isWritablePrimary"), - ReplicationCoordinator::get(opCtx)->isMasterForReportingPurposes()); + result->appendBool((useLegacyResponseFields ? "ismaster" : "isWritablePrimary"), + ReplicationCoordinator::get(opCtx)->isMasterForReportingPurposes()); - BSONObjBuilder topologyVersionBuilder(result.subobjStart("topologyVersion")); + BSONObjBuilder topologyVersionBuilder(result->subobjStart("topologyVersion")); currentTopologyVersion.serialize(&topologyVersionBuilder); return currentTopologyVersion; @@ -171,12 +177,14 @@ public: // TODO SERVER-50219: Change useLegacyResponseFields to false once the serverStatus changes // to remove master-slave terminology are merged. 
appendReplicationInfo(opCtx, - result, + &result, appendReplicationProcess, true /* useLegacyResponseFields */, boost::none /* clientTopologyVersion */, boost::none /* maxAwaitTimeMS */); + appendPrimaryOnlyServiceInfo(opCtx->getServiceContext(), &result); + auto rbid = ReplicationProcess::get(opCtx)->getRollbackID(); if (ReplicationProcess::kUninitializedRollbackId != rbid) { result.append("rbid", rbid); @@ -426,7 +434,7 @@ public: auto result = replyBuilder->getBodyBuilder(); auto currentTopologyVersion = appendReplicationInfo( - opCtx, result, 0, useLegacyResponseFields, clientTopologyVersion, maxAwaitTimeMS); + opCtx, &result, 0, useLegacyResponseFields, clientTopologyVersion, maxAwaitTimeMS); if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { const int configServerModeNumber = 2; diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp index bba59beb626..c0c242421f9 100644 --- a/src/mongo/db/repl/replication_recovery.cpp +++ b/src/mongo/db/repl/replication_recovery.cpp @@ -131,7 +131,9 @@ public: _oplogApplicationEndPoint(oplogApplicationEndPoint) {} void startup(OperationContext* opCtx) final { - opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); + invariant(opCtx->recoveryUnit()->getTimestampReadSource() == + RecoveryUnit::ReadSource::kNoTimestamp); + _client = std::make_unique<DBDirectClient>(opCtx); BSONObj predicate = _oplogApplicationEndPoint ? 
BSON("$gte" << _oplogApplicationStartPoint << "$lte" << *_oplogApplicationEndPoint) diff --git a/src/mongo/db/repl/storage_interface_impl.cpp b/src/mongo/db/repl/storage_interface_impl.cpp index 159179530a9..371a2c6af5f 100644 --- a/src/mongo/db/repl/storage_interface_impl.cpp +++ b/src/mongo/db/repl/storage_interface_impl.cpp @@ -74,6 +74,7 @@ #include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/repl/rollback_gen.h" #include "mongo/db/service_context.h" +#include "mongo/db/storage/checkpointer.h" #include "mongo/db/storage/control/journal_flusher.h" #include "mongo/db/storage/control/storage_control.h" #include "mongo/db/storage/durable_catalog.h" @@ -1271,7 +1272,18 @@ void StorageInterfaceImpl::setStableTimestamp(ServiceContext* serviceCtx, Timest "holdStableTimestamp"_attr = holdStableTimestamp); } }); - serviceCtx->getStorageEngine()->setStableTimestamp(newStableTimestamp); + + StorageEngine* storageEngine = serviceCtx->getStorageEngine(); + Timestamp prevStableTimestamp = storageEngine->getStableTimestamp(); + + storageEngine->setStableTimestamp(newStableTimestamp); + + Checkpointer* checkpointer = Checkpointer::get(serviceCtx); + if (checkpointer && !checkpointer->hasTriggeredFirstStableCheckpoint()) { + checkpointer->triggerFirstStableCheckpoint(prevStableTimestamp, + storageEngine->getInitialDataTimestamp(), + storageEngine->getStableTimestamp()); + } } void StorageInterfaceImpl::setInitialDataTimestamp(ServiceContext* serviceCtx, diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp index d821c6c3f26..a07833a2caf 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.cpp +++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp @@ -90,6 +90,13 @@ Status TenantMigrationDonorService::Instance::checkIfOptionsConflict(BSONObj opt return Status::OK(); } +void TenantMigrationDonorService::Instance::onReceiveDonorForgetMigration() { + stdx::lock_guard<Latch> lg(_mutex); + 
if (!_receivedDonorForgetMigrationPromise.getFuture().isReady()) { + _receivedDonorForgetMigrationPromise.emplaceValue(); + } +} + repl::OpTime TenantMigrationDonorService::Instance::_insertStateDocument() { const auto stateDocBson = _stateDoc.toBSON(); diff --git a/src/mongo/db/repl/tenant_migration_donor_service.h b/src/mongo/db/repl/tenant_migration_donor_service.h index ddf178121e4..6d1da3ac6d4 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.h +++ b/src/mongo/db/repl/tenant_migration_donor_service.h @@ -87,9 +87,7 @@ public: return _decisionPromise.getFuture(); } - void onReceiveDonorForgetMigration() { - _receivedDonorForgetMigrationPromise.emplaceValue(); - } + void onReceiveDonorForgetMigration(); private: const NamespaceString _stateDocumentsNS = NamespaceString::kTenantMigrationDonorsNamespace; @@ -142,6 +140,8 @@ public: const std::shared_ptr<executor::ScopedTaskExecutor>& executor, RemoteCommandTargeter* recipientTargeter); + mutable Mutex _mutex = MONGO_MAKE_LATCH("TenantMigrationDonorService::_mutex"); + ServiceContext* _serviceContext; TenantMigrationDonorDocument _stateDoc; diff --git a/src/mongo/db/repl/transaction_oplog_application.cpp b/src/mongo/db/repl/transaction_oplog_application.cpp index 0c7a1f0727b..67fb840de64 100644 --- a/src/mongo/db/repl/transaction_oplog_application.cpp +++ b/src/mongo/db/repl/transaction_oplog_application.cpp @@ -262,8 +262,9 @@ std::pair<std::vector<OplogEntry>, bool> _readTransactionOperationsFromOplogChai const std::vector<OplogEntry*>& cachedOps, const bool checkForCommands) noexcept { bool isTransactionWithCommand = false; - // Traverse the oplog chain with its own snapshot and read timestamp. - ReadSourceScope readSourceScope(opCtx); + // Ensure future transactions read without a timestamp. 
+ invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); std::vector<OplogEntry> ops; @@ -538,11 +539,10 @@ void reconstructPreparedTransactions(OperationContext* opCtx, repl::OplogApplica LOGV2(21848, "Hit skipReconstructPreparedTransactions failpoint"); return; } - // Read the transactions table and the oplog collection without a timestamp. - // The below DBDirectClient read uses AutoGetCollectionForRead which could implicitly change the - // read source. So we need to explicitly set the read source to kNoTimestamp to force reads in - // this scope to be untimestamped. - ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp); + + // Ensure future transactions read without a timestamp. + invariant(RecoveryUnit::ReadSource::kNoTimestamp == + opCtx->recoveryUnit()->getTimestampReadSource()); DBDirectClient client(opCtx); const auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace, diff --git a/src/mongo/db/s/README.md b/src/mongo/db/s/README.md index bf23835067c..a2a4547f1f8 100644 --- a/src/mongo/db/s/README.md +++ b/src/mongo/db/s/README.md @@ -103,7 +103,6 @@ collection or database. A full refresh occurs when: Methods that will mark routing table cache information as stale (sharded collection). 
* [invalidateShardOrEntireCollectionEntryForShardedCollection](https://github.com/mongodb/mongo/blob/62d9485657717bf61fbb870cb3d09b52b1a614dd/src/mongo/s/catalog_cache.h#L226-L236) -* [invalidateShardForShardedCollection](https://github.com/mongodb/mongo/blob/62d9485657717bf61fbb870cb3d09b52b1a614dd/src/mongo/s/catalog_cache.h#L262-L268) * [invalidateEntriesThatReferenceShard](https://github.com/mongodb/mongo/blob/62d9485657717bf61fbb870cb3d09b52b1a614dd/src/mongo/s/catalog_cache.h#L270-L274) * [purgeCollection](https://github.com/mongodb/mongo/blob/62d9485657717bf61fbb870cb3d09b52b1a614dd/src/mongo/s/catalog_cache.h#L276-L280) diff --git a/src/mongo/db/s/SConscript b/src/mongo/db/s/SConscript index 4f97f81b966..f35bda25acc 100644 --- a/src/mongo/db/s/SConscript +++ b/src/mongo/db/s/SConscript @@ -263,6 +263,7 @@ env.Library( '$BUILD_DIR/mongo/db/catalog_raii', '$BUILD_DIR/mongo/db/repl/read_concern_args', '$BUILD_DIR/mongo/db/rw_concern_d', + '$BUILD_DIR/mongo/db/transaction', '$BUILD_DIR/mongo/executor/network_interface', '$BUILD_DIR/mongo/s/catalog/sharding_catalog_client', '$BUILD_DIR/mongo/s/client/sharding_client', diff --git a/src/mongo/db/s/config/configsvr_drop_collection_command.cpp b/src/mongo/db/s/config/configsvr_drop_collection_command.cpp index fc74fafc0c5..6743958f1f5 100644 --- a/src/mongo/db/s/config/configsvr_drop_collection_command.cpp +++ b/src/mongo/db/s/config/configsvr_drop_collection_command.cpp @@ -59,8 +59,12 @@ class ConfigSvrDropCollectionCommand : public BasicCommand { public: ConfigSvrDropCollectionCommand() : BasicCommand("_configsvrDropCollection") {} - const std::set<std::string>& apiVersions() const { - return kApiVersions1; + /** + * We accept any apiVersion, apiStrict, and/or apiDeprecationErrors, and forward it with the + * "drop" command to shards. 
+ */ + bool acceptsAnyApiVersionParameters() const override { + return true; } AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { @@ -129,7 +133,9 @@ public: auto collDistLock = uassertStatusOK( catalogClient->getDistLockManager()->lock(opCtx, nss.ns(), "dropCollection", waitFor)); - ON_BLOCK_EXIT([opCtx, nss] { Grid::get(opCtx)->catalogCache()->onEpochChange(nss); }); + ON_BLOCK_EXIT([opCtx, nss] { + Grid::get(opCtx)->catalogCache()->invalidateCollectionEntry_LINEARIZABLE(nss); + }); _dropCollection(opCtx, nss); diff --git a/src/mongo/db/s/config/configsvr_drop_database_command.cpp b/src/mongo/db/s/config/configsvr_drop_database_command.cpp index eb3ef547e70..896569a0afe 100644 --- a/src/mongo/db/s/config/configsvr_drop_database_command.cpp +++ b/src/mongo/db/s/config/configsvr_drop_database_command.cpp @@ -29,6 +29,7 @@ #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding +#include "mongo/db/api_parameters.h" #include "mongo/db/auth/authorization_session.h" #include "mongo/db/client.h" #include "mongo/db/commands.h" @@ -54,8 +55,12 @@ class ConfigSvrDropDatabaseCommand : public BasicCommand { public: ConfigSvrDropDatabaseCommand() : BasicCommand("_configsvrDropDatabase") {} - const std::set<std::string>& apiVersions() const { - return kApiVersions1; + /** + * We accept any apiVersion, apiStrict, and/or apiDeprecationErrors, and forward it with the + * "dropDatabase" command to shards. 
+ */ + bool acceptsAnyApiVersionParameters() const override { + return true; } AllowedOnSecondary secondaryAllowed(ServiceContext*) const override { @@ -177,6 +182,7 @@ public: status, str::stream() << "Could not remove database '" << dbname << "' from metadata"); // Send _flushDatabaseCacheUpdates to all shards + IgnoreAPIParametersBlock ignoreApiParametersBlock{opCtx}; for (const ShardId& shardId : allShardIds) { const auto shard = uassertStatusOK(Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardId)); diff --git a/src/mongo/db/s/config/configsvr_shard_collection_command.cpp b/src/mongo/db/s/config/configsvr_shard_collection_command.cpp index ee992bef2a9..3af7f601e95 100644 --- a/src/mongo/db/s/config/configsvr_shard_collection_command.cpp +++ b/src/mongo/db/s/config/configsvr_shard_collection_command.cpp @@ -357,7 +357,7 @@ public: result << "collectionUUID" << *uuid; } - catalogCache->onEpochChange(nss); + catalogCache->invalidateCollectionEntry_LINEARIZABLE(nss); return true; } diff --git a/src/mongo/db/s/config/sharding_catalog_manager_chunk_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_chunk_operations.cpp index 204d8377764..2d40f65eaed 100644 --- a/src/mongo/db/s/config/sharding_catalog_manager_chunk_operations.cpp +++ b/src/mongo/db/s/config/sharding_catalog_manager_chunk_operations.cpp @@ -46,6 +46,7 @@ #include "mongo/db/s/sharding_logging.h" #include "mongo/db/server_options.h" #include "mongo/db/snapshot_window_options_gen.h" +#include "mongo/db/transaction_participant_gen.h" #include "mongo/logv2/log.h" #include "mongo/rpc/get_status_from_command_result.h" #include "mongo/s/catalog/sharding_catalog_client.h" @@ -334,12 +335,14 @@ BSONObj getShardAndCollectionVersion(OperationContext* opCtx, ChunkVersion shardVersion; if (!swDonorShardVersion.isOK()) { - // The query to find 'nss' chunks belonging to the donor shard didn't return any, meaning - // the last chunk was donated - uassert(505770, - str::stream() << "Couldn't retrieve 
donor chunks from config server", - swDonorShardVersion.getStatus().code() == 50577); - shardVersion = ChunkVersion(0, 0, collectionVersion.epoch()); + if (swDonorShardVersion.getStatus().code() == 50577) { + // The query to find 'nss' chunks belonging to the donor shard didn't return any chunks, + // meaning the last chunk for fromShard was donated. Gracefully handle the error. + shardVersion = ChunkVersion(0, 0, collectionVersion.epoch()); + } else { + // Bubble up any other error + uassertStatusOK(swDonorShardVersion); + } } else { shardVersion = swDonorShardVersion.getValue(); } @@ -844,8 +847,9 @@ StatusWith<BSONObj> ShardingCatalogManager::commitChunkMigration( // Drop old history. Keep at least 1 entry so ChunkInfo::getShardIdAt finds valid history for // any query younger than the history window. if (!MONGO_unlikely(skipExpiringOldChunkHistory.shouldFail())) { - const int kHistorySecs = 10; - auto windowInSeconds = std::max(minSnapshotHistoryWindowInSeconds.load(), kHistorySecs); + auto windowInSeconds = std::max(std::max(minSnapshotHistoryWindowInSeconds.load(), + gTransactionLifetimeLimitSeconds.load()), + 10); int entriesDeleted = 0; while (newHistory.size() > 1 && newHistory.back().getValidAfter().getSecs() + windowInSeconds < diff --git a/src/mongo/db/s/config/sharding_catalog_manager_collection_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_collection_operations.cpp index d6544e922d2..381a5e62029 100644 --- a/src/mongo/db/s/config/sharding_catalog_manager_collection_operations.cpp +++ b/src/mongo/db/s/config/sharding_catalog_manager_collection_operations.cpp @@ -42,6 +42,7 @@ #include "mongo/client/read_preference.h" #include "mongo/client/remote_command_targeter.h" #include "mongo/client/replica_set_monitor.h" +#include "mongo/db/api_parameters.h" #include "mongo/db/auth/authorization_session_impl.h" #include "mongo/db/catalog/collection_options.h" #include "mongo/db/client.h" @@ -398,6 +399,7 @@ void 
sendSSVToAllShards(OperationContext* opCtx, const NamespaceString& nss) { auto* const shardRegistry = Grid::get(opCtx)->shardRegistry(); + IgnoreAPIParametersBlock ignoreApiParametersBlock(opCtx); for (const auto& shardEntry : allShards) { const auto& shard = uassertStatusOK(shardRegistry->getShard(opCtx, shardEntry.getName())); @@ -417,6 +419,7 @@ void sendSSVToAllShards(OperationContext* opCtx, const NamespaceString& nss) { } void removeChunksAndTagsForDroppedCollection(OperationContext* opCtx, const NamespaceString& nss) { + IgnoreAPIParametersBlock ignoreApiParametersBlock(opCtx); const auto catalogClient = Grid::get(opCtx)->catalogClient(); // Remove chunk data @@ -502,6 +505,8 @@ void ShardingCatalogManager::ensureDropCollectionCompleted(OperationContext* opC "Ensuring config entries from previous dropCollection are cleared", "namespace"_attr = nss.ns()); sendDropCollectionToAllShards(opCtx, nss); + + IgnoreAPIParametersBlock ignoreApiParametersBlock(opCtx); removeChunksAndTagsForDroppedCollection(opCtx, nss); sendSSVToAllShards(opCtx, nss); } diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index 11bce269425..07f8f94daf9 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -466,7 +466,7 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { "Starting post-migration commit refresh on the shard", "migrationId"_attr = _coordinator->getMigrationId()); - forceShardFilteringMetadataRefresh(_opCtx, getNss(), true); + forceShardFilteringMetadataRefresh(_opCtx, getNss()); LOGV2_DEBUG_OPTIONS(4817405, 2, diff --git a/src/mongo/db/s/migration_util_test.cpp b/src/mongo/db/s/migration_util_test.cpp index 010f476773c..a2decb63c2d 100644 --- a/src/mongo/db/s/migration_util_test.cpp +++ b/src/mongo/db/s/migration_util_test.cpp @@ -522,7 +522,7 @@ TEST_F(SubmitRangeDeletionTaskTest, 
_mockCatalogCacheLoader->setDatabaseRefreshReturnValue(kDefaultDatabaseType); _mockCatalogCacheLoader->setCollectionRefreshReturnValue( Status(ErrorCodes::NamespaceNotFound, "dummy errmsg")); - forceShardFilteringMetadataRefresh(opCtx, kNss, true); + forceShardFilteringMetadataRefresh(opCtx, kNss); auto cleanupCompleteFuture = migrationutil::submitRangeDeletionTask(opCtx, deletionTask); @@ -553,7 +553,7 @@ TEST_F(SubmitRangeDeletionTaskTest, SucceedsIfFilteringMetadataUUIDMatchesTaskUU _mockCatalogCacheLoader->setChunkRefreshReturnValue( makeChangedChunks(ChunkVersion(1, 0, kEpoch))); _mockCatalogClient->setCollections({coll}); - forceShardFilteringMetadataRefresh(opCtx, kNss, true); + forceShardFilteringMetadataRefresh(opCtx, kNss); // The task should have been submitted successfully. auto cleanupCompleteFuture = migrationutil::submitRangeDeletionTask(opCtx, deletionTask); @@ -596,7 +596,7 @@ TEST_F(SubmitRangeDeletionTaskTest, _mockCatalogCacheLoader->setDatabaseRefreshReturnValue(kDefaultDatabaseType); _mockCatalogCacheLoader->setCollectionRefreshReturnValue( Status(ErrorCodes::NamespaceNotFound, "dummy errmsg")); - forceShardFilteringMetadataRefresh(opCtx, kNss, true); + forceShardFilteringMetadataRefresh(opCtx, kNss); auto collectionUUID = createCollectionAndGetUUID(kNss); auto deletionTask = createDeletionTask(kNss, collectionUUID, 0, 10, _myShardName); @@ -633,7 +633,7 @@ TEST_F(SubmitRangeDeletionTaskTest, _mockCatalogCacheLoader->setChunkRefreshReturnValue( makeChangedChunks(ChunkVersion(1, 0, staleEpoch))); _mockCatalogClient->setCollections({staleColl}); - forceShardFilteringMetadataRefresh(opCtx, kNss, true); + forceShardFilteringMetadataRefresh(opCtx, kNss); auto collectionUUID = createCollectionAndGetUUID(kNss); auto deletionTask = createDeletionTask(kNss, collectionUUID, 0, 10, _myShardName); diff --git a/src/mongo/db/s/set_shard_version_command.cpp b/src/mongo/db/s/set_shard_version_command.cpp index f8a321aea1a..aba2cd2f632 100644 --- 
a/src/mongo/db/s/set_shard_version_command.cpp +++ b/src/mongo/db/s/set_shard_version_command.cpp @@ -96,7 +96,7 @@ public: uassertStatusOK(shardingState->canAcceptShardedCommands()); // Steps - // 1. Set the `authoritative` and `forceRefresh` variables from the command object. + // 1. Set the `authoritative` variable from the command object. // // 2. Validate all command parameters against the info in our ShardingState, and return an // error if they do not match. @@ -117,12 +117,6 @@ public: LastError::get(client).disable(); const bool authoritative = cmdObj.getBoolField("authoritative"); - // A flag that specifies whether the set shard version catalog refresh - // is allowed to join an in-progress refresh triggered by an other - // thread, or whether it's required to either a) trigger its own - // refresh or b) wait for a refresh to be started after it has entered the - // getCollectionRoutingInfoWithRefresh function - const bool forceRefresh = cmdObj.getBoolField("forceRefresh"); // Step 2 @@ -241,11 +235,9 @@ public: const auto status = [&] { try { - // TODO SERVER-48990 remove this if-else: just call onShardVersionMismatch + // TODO (SERVER-50812) remove this if-else: just call onShardVersionMismatch if (requestedVersion == requestedVersion.DROPPED()) { - // Note: The forceRefresh flag controls whether we make sure to do our own - // refresh or if we're okay with joining another thread - forceShardFilteringMetadataRefresh(opCtx, nss, forceRefresh); + forceShardFilteringMetadataRefresh(opCtx, nss); } else { onShardVersionMismatch(opCtx, nss, requestedVersion); } diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp index 1e39cd26dc8..317d80f2ec4 100644 --- a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp +++ b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp @@ -284,7 +284,7 @@ ScopedShardVersionCriticalSection::ScopedShardVersionCriticalSection(OperationCo 
migrationutil::recoverMigrationCoordinations(_opCtx, _nss); } - forceShardFilteringMetadataRefresh(_opCtx, _nss, true); + forceShardFilteringMetadataRefresh(_opCtx, _nss); } ScopedShardVersionCriticalSection::~ScopedShardVersionCriticalSection() { @@ -334,9 +334,8 @@ CollectionMetadata forceGetCurrentMetadata(OperationContext* opCtx, const Namesp invariant(shardingState->canAcceptShardedCommands()); try { - const auto cm = - uassertStatusOK(Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh( - opCtx, nss, true)); + const auto cm = uassertStatusOK( + Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(opCtx, nss)); if (!cm.isSharded()) { return CollectionMetadata(); @@ -354,8 +353,7 @@ CollectionMetadata forceGetCurrentMetadata(OperationContext* opCtx, const Namesp } ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx, - const NamespaceString& nss, - bool forceRefreshFromThisThread) { + const NamespaceString& nss) { invariant(!opCtx->lockState()->isLocked()); invariant(!opCtx->getClient()->isInDirectClient()); @@ -366,9 +364,8 @@ ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx, auto* const shardingState = ShardingState::get(opCtx); invariant(shardingState->canAcceptShardedCommands()); - const auto cm = - uassertStatusOK(Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh( - opCtx, nss, forceRefreshFromThisThread)); + const auto cm = uassertStatusOK( + Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(opCtx, nss)); if (!cm.isSharded()) { // The collection is not sharded. 
Avoid using AutoGetCollection() as it returns the diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.h b/src/mongo/db/s/shard_filtering_metadata_refresh.h index 774a370b9ef..317fab32f37 100644 --- a/src/mongo/db/s/shard_filtering_metadata_refresh.h +++ b/src/mongo/db/s/shard_filtering_metadata_refresh.h @@ -79,8 +79,7 @@ CollectionMetadata forceGetCurrentMetadata(OperationContext* opCtx, const Namesp * called with a lock */ ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx, - const NamespaceString& nss, - bool forceRefreshFromThisThread = false); + const NamespaceString& nss); /** * Should be called when any client request on this shard generates a StaleDbVersion exception. diff --git a/src/mongo/db/s/shard_key_util.cpp b/src/mongo/db/s/shard_key_util.cpp index e216f9f682d..9b71b8e1ec9 100644 --- a/src/mongo/db/s/shard_key_util.cpp +++ b/src/mongo/db/s/shard_key_util.cpp @@ -230,18 +230,12 @@ void ValidationBehaviorsShardCollection::createShardKeyIndex( ValidationBehaviorsRefineShardKey::ValidationBehaviorsRefineShardKey(OperationContext* opCtx, const NamespaceString& nss) - : _opCtx(opCtx) { - const auto cm = uassertStatusOK( - Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx, nss)); - uassert(ErrorCodes::NamespaceNotSharded, - str::stream() << "refineCollectionShardKey namespace " << nss.toString() - << " is not sharded", - cm.isSharded()); - const auto minKeyShardId = cm.getMinKeyShardIdWithSimpleCollation(); - _indexShard = - uassertStatusOK(Grid::get(opCtx)->shardRegistry()->getShard(opCtx, minKeyShardId)); - _cm = std::move(cm); -} + : _opCtx(opCtx), + _cm(uassertStatusOK( + Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx, + nss))), + _indexShard(uassertStatusOK(Grid::get(opCtx)->shardRegistry()->getShard( + opCtx, _cm.getMinKeyShardIdWithSimpleCollation()))) {} std::vector<BSONObj> ValidationBehaviorsRefineShardKey::loadIndexes( const NamespaceString& nss) 
const { @@ -249,8 +243,7 @@ std::vector<BSONObj> ValidationBehaviorsRefineShardKey::loadIndexes( _opCtx, ReadPreferenceSetting(ReadPreference::PrimaryOnly), nss.db().toString(), - appendShardVersion(BSON("listIndexes" << nss.coll()), - _cm->getVersion(_indexShard->getId())), + appendShardVersion(BSON("listIndexes" << nss.coll()), _cm.getVersion(_indexShard->getId())), Milliseconds(-1)); if (indexesRes.getStatus().code() != ErrorCodes::NamespaceNotFound) { return uassertStatusOK(indexesRes).docs; @@ -266,7 +259,7 @@ void ValidationBehaviorsRefineShardKey::verifyUsefulNonMultiKeyIndex( "admin", appendShardVersion( BSON(kCheckShardingIndexCmdName << nss.ns() << kKeyPatternField << proposedKey), - _cm->getVersion(_indexShard->getId())), + _cm.getVersion(_indexShard->getId())), Shard::RetryPolicy::kIdempotent)); if (checkShardingIndexRes.commandStatus == ErrorCodes::UnknownError) { // CheckShardingIndex returns UnknownError if a compatible shard key index cannot be found, diff --git a/src/mongo/db/s/shard_key_util.h b/src/mongo/db/s/shard_key_util.h index d6e1802549c..e5ab23683eb 100644 --- a/src/mongo/db/s/shard_key_util.h +++ b/src/mongo/db/s/shard_key_util.h @@ -104,8 +104,10 @@ public: private: OperationContext* _opCtx; + + ChunkManager _cm; + std::shared_ptr<Shard> _indexShard; - boost::optional<ChunkManager> _cm; }; /** diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index 2b6d515148a..f01218bba70 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -113,16 +113,16 @@ MONGO_FAIL_POINT_DEFINE(waitAfterCommandFinishesExecution); MONGO_FAIL_POINT_DEFINE(failWithErrorCodeInRunCommand); // Tracks the number of times a legacy unacknowledged write failed due to -// not master error resulted in network disconnection. 
-Counter64 notMasterLegacyUnackWrites; -ServerStatusMetricField<Counter64> displayNotMasterLegacyUnackWrites( - "repl.network.notMasterLegacyUnacknowledgedWrites", ¬MasterLegacyUnackWrites); +// not primary error resulted in network disconnection. +Counter64 notPrimaryLegacyUnackWrites; +ServerStatusMetricField<Counter64> displayNotPrimaryLegacyUnackWrites( + "repl.network.notPrimaryLegacyUnacknowledgedWrites", ¬PrimaryLegacyUnackWrites); -// Tracks the number of times an unacknowledged write failed due to not master error +// Tracks the number of times an unacknowledged write failed due to not primary error // resulted in network disconnection. -Counter64 notMasterUnackWrites; -ServerStatusMetricField<Counter64> displayNotMasterUnackWrites( - "repl.network.notMasterUnacknowledgedWrites", ¬MasterUnackWrites); +Counter64 notPrimaryUnackWrites; +ServerStatusMetricField<Counter64> displayNotPrimaryUnackWrites( + "repl.network.notPrimaryUnacknowledgedWrites", ¬PrimaryUnackWrites); namespace { @@ -1479,7 +1479,7 @@ DbResponse receivedCommands(OperationContext* opCtx, // Close the connection to get client to go through server selection again. if (LastError::get(opCtx->getClient()).hadNotPrimaryError()) { if (c && c->getReadWriteType() == Command::ReadWriteType::kWrite) - notMasterUnackWrites.increment(); + notPrimaryUnackWrites.increment(); uasserted(ErrorCodes::NotWritablePrimary, str::stream() << "Not-master error while processing '" << request.getCommandName() @@ -1839,7 +1839,7 @@ DbResponse FireAndForgetOpRunner::run() { // Either way, we want to throw an exception here, which will cause the client to be // disconnected. 
if (LastError::get(hr->client()).hadNotPrimaryError()) { - notMasterLegacyUnackWrites.increment(); + notPrimaryLegacyUnackWrites.increment(); uasserted(ErrorCodes::NotWritablePrimary, str::stream() << "Not-master error while processing '" << networkOpToString(hr->op()) << "' operation on '" diff --git a/src/mongo/db/stats/api_version_metrics.h b/src/mongo/db/stats/api_version_metrics.h index fc1de1d9766..354312a3992 100644 --- a/src/mongo/db/stats/api_version_metrics.h +++ b/src/mongo/db/stats/api_version_metrics.h @@ -29,7 +29,7 @@ #pragma once -#include "mongo/db/initialize_api_parameters.h" +#include "mongo/db/api_parameters.h" #include "mongo/db/service_context.h" #include "mongo/platform/mutex.h" #include "mongo/rpc/metadata/client_metadata.h" @@ -70,4 +70,4 @@ private: APIVersionMetricsMap _apiVersionMetrics; }; -} // namespace mongo
\ No newline at end of file +} // namespace mongo diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript index 53ac37b0e30..f60d463a976 100644 --- a/src/mongo/db/storage/SConscript +++ b/src/mongo/db/storage/SConscript @@ -121,11 +121,13 @@ env.Library( 'control/storage_control.cpp', ], LIBDEPS=[ + 'checkpointer', 'journal_flusher', ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/base', '$BUILD_DIR/mongo/db/service_context', + 'storage_options', ], ) @@ -513,6 +515,19 @@ env.Library( ) env.Library( + target='checkpointer', + source=[ + 'checkpointer.cpp', + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/db/service_context', + '$BUILD_DIR/mongo/util/background_job', + 'storage_options', + ], +) + +env.Library( target='two_phase_index_build_knobs_idl', source=[ env.Idlc('two_phase_index_build_knobs.idl')[0], diff --git a/src/mongo/db/storage/checkpointer.cpp b/src/mongo/db/storage/checkpointer.cpp new file mode 100644 index 00000000000..825e914d062 --- /dev/null +++ b/src/mongo/db/storage/checkpointer.cpp @@ -0,0 +1,168 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kStorage + +#include "mongo/platform/basic.h" + +#include "mongo/db/storage/checkpointer.h" + +#include "mongo/db/operation_context.h" +#include "mongo/db/service_context.h" +#include "mongo/db/storage/kv/kv_engine.h" +#include "mongo/logv2/log.h" +#include "mongo/util/concurrency/idle_thread_block.h" +#include "mongo/util/fail_point.h" + +namespace mongo { + +namespace { + +const auto getCheckpointer = ServiceContext::declareDecoration<std::unique_ptr<Checkpointer>>(); + +MONGO_FAIL_POINT_DEFINE(pauseCheckpointThread); + +} // namespace + +Checkpointer* Checkpointer::get(ServiceContext* serviceCtx) { + return getCheckpointer(serviceCtx).get(); +} + +Checkpointer* Checkpointer::get(OperationContext* opCtx) { + return get(opCtx->getServiceContext()); +} + +void Checkpointer::set(ServiceContext* serviceCtx, std::unique_ptr<Checkpointer> newCheckpointer) { + auto& checkpointer = getCheckpointer(serviceCtx); + if (checkpointer) { + invariant(!checkpointer->running(), + "Tried to reset the Checkpointer without shutting down the original instance."); + } + checkpointer = std::move(newCheckpointer); +} + +void 
Checkpointer::run() { + ThreadClient tc(name(), getGlobalServiceContext()); + LOGV2_DEBUG(22307, 1, "Starting thread", "threadName"_attr = name()); + + while (true) { + auto opCtx = tc->makeOperationContext(); + + { + stdx::unique_lock<Latch> lock(_mutex); + MONGO_IDLE_THREAD_BLOCK; + + // Wait for 'storageGlobalParams.checkpointDelaySecs' seconds; or until either shutdown + // is signaled or a checkpoint is triggered. + _sleepCV.wait_for(lock, + stdx::chrono::seconds(static_cast<std::int64_t>( + storageGlobalParams.checkpointDelaySecs)), + [&] { return _shuttingDown || _triggerCheckpoint; }); + + // If the checkpointDelaySecs is set to 0, that means we should skip checkpointing. + // However, checkpointDelaySecs is adjustable by a runtime server parameter, so we + // need to wake up to check periodically. The wakeup to check period is arbitrary. + while (storageGlobalParams.checkpointDelaySecs == 0 && !_shuttingDown && + !_triggerCheckpoint) { + _sleepCV.wait_for(lock, stdx::chrono::seconds(static_cast<std::int64_t>(3)), [&] { + return _shuttingDown || _triggerCheckpoint; + }); + } + + if (_shuttingDown) { + invariant(!_shutdownReason.isOK()); + LOGV2_DEBUG(22309, + 1, + "Stopping thread", + "threadName"_attr = name(), + "reason"_attr = _shutdownReason); + return; + } + + // Clear the trigger so we do not immediately checkpoint again after this. + _triggerCheckpoint = false; + } + + pauseCheckpointThread.pauseWhileSet(); + + const Date_t startTime = Date_t::now(); + + // TODO SERVER-50861: Access the storage engine via the ServiceContext. 
+ _kvEngine->checkpoint(); + + const auto secondsElapsed = durationCount<Seconds>(Date_t::now() - startTime); + if (secondsElapsed >= 30) { + LOGV2_DEBUG(22308, + 1, + "Checkpoint was slow to complete", + "secondsElapsed"_attr = secondsElapsed); + } + } +} + +void Checkpointer::triggerFirstStableCheckpoint(Timestamp prevStable, + Timestamp initialData, + Timestamp currStable) { + stdx::unique_lock<Latch> lock(_mutex); + invariant(!_hasTriggeredFirstStableCheckpoint); + if (prevStable < initialData && currStable >= initialData) { + LOGV2(22310, + "Triggering the first stable checkpoint", + "initialDataTimestamp"_attr = initialData, + "prevStableTimestamp"_attr = prevStable, + "currStableTimestamp"_attr = currStable); + _hasTriggeredFirstStableCheckpoint = true; + _triggerCheckpoint = true; + _sleepCV.notify_one(); + } +} + +bool Checkpointer::hasTriggeredFirstStableCheckpoint() { + stdx::unique_lock<Latch> lock(_mutex); + return _hasTriggeredFirstStableCheckpoint; +} + +void Checkpointer::shutdown(const Status& reason) { + LOGV2(22322, "Shutting down checkpoint thread"); + + { + stdx::unique_lock<Latch> lock(_mutex); + _shuttingDown = true; + _shutdownReason = reason; + + // Wake up the checkpoint thread early, to take a final checkpoint before shutting down, if + // one has not coincidentally just been taken. + _sleepCV.notify_one(); + } + + wait(); + LOGV2(22323, "Finished shutting down checkpoint thread"); +} + +} // namespace mongo diff --git a/src/mongo/db/storage/checkpointer.h b/src/mongo/db/storage/checkpointer.h new file mode 100644 index 00000000000..6c50974c2ba --- /dev/null +++ b/src/mongo/db/storage/checkpointer.h @@ -0,0 +1,114 @@ +/** + * Copyright (C) 2020-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#pragma once + +#include "mongo/platform/mutex.h" +#include "mongo/stdx/condition_variable.h" +#include "mongo/util/background.h" + +namespace mongo { + +class KVEngine; +class OperationContext; +class ServiceContext; +class Timestamp; + +class Checkpointer : public BackgroundJob { +public: + Checkpointer(KVEngine* kvEngine) + : BackgroundJob(false /* deleteSelf */), + _kvEngine(kvEngine), + _shuttingDown(false), + _shutdownReason(Status::OK()), + _hasTriggeredFirstStableCheckpoint(false), + _triggerCheckpoint(false) {} + + static Checkpointer* get(ServiceContext* serviceCtx); + static Checkpointer* get(OperationContext* opCtx); + static void set(ServiceContext* serviceCtx, std::unique_ptr<Checkpointer> newCheckpointer); + + std::string name() const override { + return "Checkpointer"; + } + + /** + * Starts the checkpoint thread that runs every storageGlobalParams.checkpointDelaySecs seconds. + */ + void run() override; + + /** + * Triggers taking the first stable checkpoint if the stable timestamp has advanced past the + * initial data timestamp. + * + * The checkpoint thread runs automatically every storageGlobalParams.checkpointDelaySecs + * seconds. This function avoids potentially waiting that full duration for a stable checkpoint, + * initiating one immediately. + * + * Do not call this function if hasTriggeredFirstStableCheckpoint() returns true. + */ + void triggerFirstStableCheckpoint(Timestamp prevStable, + Timestamp initialData, + Timestamp currStable); + + /** + * Returns whether the first stable checkpoint has already been triggered. + */ + bool hasTriggeredFirstStableCheckpoint(); + + /** + * Blocks until the checkpoint thread has been fully shutdown. + */ + void shutdown(const Status& reason); + +private: + // A pointer to the KVEngine is maintained only due to unit testing limitations that don't fully + // setup the ServiceContext. + // TODO SERVER-50861: Remove this pointer. + KVEngine* const _kvEngine; + + // Protects the state below. 
+ Mutex _mutex = MONGO_MAKE_LATCH("Checkpointer::_mutex"); + + // The checkpoint thread idles on this condition variable for a particular time duration between + // taking checkpoints. It can be triggered early to expedite either: immediate checkpointing if + // _triggerCheckpoint is set; or shutdown cleanup if _shuttingDown is set. + stdx::condition_variable _sleepCV; + + bool _shuttingDown; + Status _shutdownReason; + + // This flag ensures the first stable checkpoint is only triggered once. + bool _hasTriggeredFirstStableCheckpoint; + + // This flag allows the checkpoint thread to wake up early when _sleepCV is signaled. + bool _triggerCheckpoint; +}; + +} // namespace mongo diff --git a/src/mongo/db/storage/control/storage_control.cpp b/src/mongo/db/storage/control/storage_control.cpp index f0b7e7d825f..50213d44dfc 100644 --- a/src/mongo/db/storage/control/storage_control.cpp +++ b/src/mongo/db/storage/control/storage_control.cpp @@ -35,7 +35,9 @@ #include "mongo/db/operation_context.h" #include "mongo/db/service_context.h" +#include "mongo/db/storage/checkpointer.h" #include "mongo/db/storage/control/journal_flusher.h" +#include "mongo/db/storage/storage_options.h" #include "mongo/logv2/log.h" namespace mongo { @@ -73,12 +75,25 @@ void startStorageControls(ServiceContext* serviceContext, bool forTestOnly) { journalFlusher->go(); JournalFlusher::set(serviceContext, std::move(journalFlusher)); + if (storageEngine->supportsCheckpoints() && !storageEngine->isEphemeral() && + !storageGlobalParams.readOnly) { + std::unique_ptr<Checkpointer> checkpointer = + std::make_unique<Checkpointer>(storageEngine->getEngine()); + checkpointer->go(); + Checkpointer::set(serviceContext, std::move(checkpointer)); + } + areControlsStarted = true; } void stopStorageControls(ServiceContext* serviceContext, const Status& reason) { if (areControlsStarted) { JournalFlusher::get(serviceContext)->shutdown(reason); + + auto checkpointer = Checkpointer::get(serviceContext); + if 
(checkpointer) { + checkpointer->shutdown(reason); + } } } diff --git a/src/mongo/db/storage/durable_catalog.h b/src/mongo/db/storage/durable_catalog.h index 2d9aecb32ee..b782b144f0b 100644 --- a/src/mongo/db/storage/durable_catalog.h +++ b/src/mongo/db/storage/durable_catalog.h @@ -128,6 +128,11 @@ public: virtual std::string newInternalIdent() = 0; /** + * Generate an internal resumable index build ident name. + */ + virtual std::string newInternalResumableIndexBuildIdent() = 0; + + /** * On success, returns the RecordId which identifies the new record store in the durable catalog * in addition to ownership of the new RecordStore. */ diff --git a/src/mongo/db/storage/durable_catalog_impl.cpp b/src/mongo/db/storage/durable_catalog_impl.cpp index b991c213bc3..de8b719918b 100644 --- a/src/mongo/db/storage/durable_catalog_impl.cpp +++ b/src/mongo/db/storage/durable_catalog_impl.cpp @@ -65,6 +65,7 @@ const char kNamespaceFieldName[] = "ns"; const char kNonRepairableFeaturesFieldName[] = "nonRepairable"; const char kRepairableFeaturesFieldName[] = "repairable"; const char kInternalIdentPrefix[] = "internal-"; +const char kResumableIndexBuildIdentStem[] = "resumable-index-build-"; void appendPositionsOfBitsSet(uint64_t value, StringBuilder* sb) { invariant(sb); @@ -427,8 +428,17 @@ bool DurableCatalogImpl::_hasEntryCollidingWithRand() const { } std::string DurableCatalogImpl::newInternalIdent() { + return _newInternalIdent(""); +} + +std::string DurableCatalogImpl::newInternalResumableIndexBuildIdent() { + return _newInternalIdent(kResumableIndexBuildIdentStem); +} + +std::string DurableCatalogImpl::_newInternalIdent(StringData identStem) { StringBuilder buf; buf << kInternalIdentPrefix; + buf << identStem; buf << _next.fetchAndAdd(1) << '-' << _rand; return buf.str(); } @@ -765,6 +775,11 @@ bool DurableCatalogImpl::isInternalIdent(StringData ident) const { return ident.find(kInternalIdentPrefix) != std::string::npos; } +bool 
DurableCatalogImpl::isResumableIndexBuildIdent(StringData ident) const { + invariant(isInternalIdent(ident), ident.toString()); + return ident.find(kResumableIndexBuildIdentStem) != std::string::npos; +} + bool DurableCatalogImpl::isCollectionIdent(StringData ident) const { // Internal idents prefixed "internal-" should not be considered collections, because // they are not eligible for orphan recovery through repair. diff --git a/src/mongo/db/storage/durable_catalog_impl.h b/src/mongo/db/storage/durable_catalog_impl.h index b24816d333d..9b8fad96825 100644 --- a/src/mongo/db/storage/durable_catalog_impl.h +++ b/src/mongo/db/storage/durable_catalog_impl.h @@ -92,6 +92,8 @@ public: bool isInternalIdent(StringData ident) const; + bool isResumableIndexBuildIdent(StringData ident) const; + bool isCollectionIdent(StringData ident) const; FeatureTracker* getFeatureTracker() const { @@ -108,6 +110,7 @@ public: std::string getFilesystemPathForDb(const std::string& dbName) const; std::string newInternalIdent(); + std::string newInternalResumableIndexBuildIdent(); StatusWith<std::pair<RecordId, std::unique_ptr<RecordStore>>> createCollection( OperationContext* opCtx, @@ -229,6 +232,8 @@ private: */ std::string _newUniqueIdent(NamespaceString nss, const char* kind); + std::string _newInternalIdent(StringData identStem); + // Helpers only used by constructor and init(). Don't call from elsewhere. 
static std::string _newRand(); bool _hasEntryCollidingWithRand() const; diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h index b3da8bb0085..fd243b0c8c1 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h @@ -173,6 +173,10 @@ public: Timestamp getOldestTimestamp() const override; + Timestamp getStableTimestamp() const override { + return Timestamp(); + } + void setOldestTimestamp(Timestamp newOldestTimestamp, bool force) override; std::map<Timestamp, std::shared_ptr<StringStore>> getHistory_forTest(); diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp index fcf49f74442..e249daed751 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine_test.cpp @@ -359,7 +359,7 @@ TEST_F(EphemeralForTestKVEngineTest, ReadOlderSnapshotsSimple) { ASSERT(!rs->findRecord(&opCtx, loc2, &rd)); opCtx.recoveryUnit()->abandonSnapshot(); - opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT(rs->findRecord(&opCtx, loc1, &rd)); ASSERT(rs->findRecord(&opCtx, loc2, &rd)); } @@ -452,7 +452,7 @@ TEST_F(EphemeralForTestKVEngineTest, SetReadTimestampBehindOldestTimestamp) { ASSERT_THROWS_CODE(rs->findRecord(&opCtx, loc2, &rd), DBException, ErrorCodes::SnapshotTooOld); opCtx.recoveryUnit()->abandonSnapshot(); - opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + opCtx.recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT(rs->findRecord(&opCtx, loc1, &rd)); 
ASSERT(rs->findRecord(&opCtx, loc2, &rd)); } diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp index 5b2e77e6292..44d73995482 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.cpp @@ -119,7 +119,6 @@ bool RecoveryUnit::forkIfNeeded() { boost::optional<Timestamp> readFrom = boost::none; switch (_timestampReadSource) { - case ReadSource::kUnset: case ReadSource::kNoTimestamp: case ReadSource::kMajorityCommitted: case ReadSource::kNoOverlap: diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h index 0e0afbb1a13..c31d0d54d86 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h @@ -131,7 +131,7 @@ private: Timestamp _commitTimestamp = Timestamp::min(); // Specifies which external source to use when setting read timestamps on transactions. - ReadSource _timestampReadSource = ReadSource::kUnset; + ReadSource _timestampReadSource = ReadSource::kNoTimestamp; boost::optional<Timestamp> _readAtTimestamp = boost::none; }; diff --git a/src/mongo/db/storage/kv/kv_engine.h b/src/mongo/db/storage/kv/kv_engine.h index 46dad070544..6c8c67df3c4 100644 --- a/src/mongo/db/storage/kv/kv_engine.h +++ b/src/mongo/db/storage/kv/kv_engine.h @@ -53,18 +53,6 @@ class SnapshotManager; class KVEngine { public: /** - * This function should only be called after the StorageEngine is set on the ServiceContext. - * - * Starts asycnhronous threads for a storage engine's integration layer. Any such thread - * generating an OperationContext should be initialized here. 
- * - * In order for OperationContexts to be generated with real Locker objects, the generation must - * occur after the StorageEngine is instantiated and set on the ServiceContext. Otherwise, - * OperationContexts are created with LockerNoops. - */ - virtual void startAsyncThreads() {} - - /** * During the startup process, the storage engine is one of the first components to be started * up and fully initialized. But that fully initialized storage engine may not be recognized as * the end for the remaining storage startup tasks that still need to be performed. @@ -275,6 +263,8 @@ public: return false; } + virtual void checkpoint() {} + virtual bool isDurable() const = 0; /** diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp index 57bf3bf714d..1e928738d57 100644 --- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp +++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp @@ -372,7 +372,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) { // Not reading on the last applied timestamp returns the most recent data. auto op = makeOperation(); auto ru = op->recoveryUnit(); - ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_EQ(itCountOn(op), 1); ASSERT(readRecordOn(op, id)); @@ -408,7 +408,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) { // Not reading on the last local timestamp returns the most recent data. 
auto op = makeOperation(); auto ru = op->recoveryUnit(); - ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_EQ(itCountOn(op), 1); auto record = readRecordOn(op, id); ASSERT_EQ(std::string(record->data.data()), "Blue spotted stingray"); diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index 5c8be96b528..2057f8854b3 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -392,11 +392,7 @@ public: */ enum ReadSource { /** - * Do not read from a timestamp. This is the default. - */ - kUnset, - /** - * Read without a timestamp explicitly. + * Read without a timestamp. This is the default. */ kNoTimestamp, /** @@ -424,8 +420,6 @@ public: static std::string toString(ReadSource rs) { switch (rs) { - case ReadSource::kUnset: - return "kUnset"; case ReadSource::kNoTimestamp: return "kNoTimestamp"; case ReadSource::kMajorityCommitted: @@ -455,7 +449,7 @@ public: boost::optional<Timestamp> provided = boost::none) {} virtual ReadSource getTimestampReadSource() const { - return ReadSource::kUnset; + return ReadSource::kNoTimestamp; }; /** diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp index 5acbcd3a513..84af208d391 100644 --- a/src/mongo/db/storage/snapshot_helper.cpp +++ b/src/mongo/db/storage/snapshot_helper.cpp @@ -38,29 +38,37 @@ #include "mongo/logv2/log.h" namespace mongo { -namespace SnapshotHelper { -bool canSwitchReadSource(OperationContext* opCtx) { - - // Most readConcerns have behavior controlled at higher levels. Local and available are the only - // ReadConcerns that should consider changing, since they read without a timestamp by default. +namespace { +bool canReadAtLastApplied(OperationContext* opCtx) { + // Local and available are the only ReadConcern levels that allow their ReadSource to be + // overridden to read at lastApplied. 
They read without a timestamp by default, but this check + // allows user secondary reads from conflicting with oplog batch application by reading at a + // consistent point in time. + // Internal operations use DBDirectClient as a loopback to perform local operations, and they + // expect the same level of consistency guarantees as any user operation. For that reason, + // DBDirectClient should be able to change the owning operation's ReadSource in order to serve + // consistent data. const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel(); - if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern || - readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) { + if ((opCtx->getClient()->isFromUserConnection() || opCtx->getClient()->isInDirectClient()) && + (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern || + readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern)) { return true; } - return false; } +} // namespace +namespace SnapshotHelper { bool shouldReadAtLastApplied(OperationContext* opCtx, const NamespaceString& nss, std::string* reason) { - // If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change // its ReadSource. It's important to note that it is possible for this to be false, but still be // holding the PBWM lock, explained below. if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) { - *reason = "conflicts with batch application"; + if (reason) { + *reason = "conflicts with batch application"; + } return false; } @@ -71,16 +79,32 @@ bool shouldReadAtLastApplied(OperationContext* opCtx, // guaranteed to observe all previous writes. This may occur when multiple collection locks are // held concurrently, which is often the case when DBDirectClient is used. 
if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) { - *reason = "PBWM lock is held"; + if (reason) { + *reason = "PBWM lock is held"; + } LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held"); return false; } - // If we are in a replication state (like secondary or primary catch-up) where we are not - // accepting writes, we should read at lastApplied. If this node can accept writes, then no - // conflicting replication batches are being applied and we can read from the default snapshot. + // If this node can accept writes (i.e. primary), then no conflicting replication batches are + // being applied and we can read from the default snapshot. If we are in a replication state + // (like secondary or primary catch-up) where we are not accepting writes, we should read at + // lastApplied. if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) { - *reason = "primary"; + if (reason) { + *reason = "primary"; + } + return false; + } + + // If we are not secondary, then we should not attempt to read at lastApplied because it may not + // be available or valid. Any operations reading outside of the primary or secondary states must + // be internal. We give these operations the benefit of the doubt rather than attempting to read + // at a lastApplied timestamp that is not valid. + if (!repl::ReplicationCoordinator::get(opCtx)->isInPrimaryOrSecondaryState(opCtx)) { + if (reason) { + *reason = "not primary or secondary"; + } return false; } @@ -88,7 +112,9 @@ bool shouldReadAtLastApplied(OperationContext* opCtx, // written by the replication system. However, the oplog is special, as it *is* written by the // replication system. 
if (!nss.isReplicated() && !nss.isOplog()) { - *reason = "unreplicated collection"; + if (reason) { + *reason = "unreplicated collection"; + } return false; } @@ -96,15 +122,14 @@ bool shouldReadAtLastApplied(OperationContext* opCtx, } boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx, const NamespaceString& nss) { - const bool canSwitch = canSwitchReadSource(opCtx); - if (!canSwitch) { + if (!canReadAtLastApplied(opCtx)) { return boost::none; } const auto existing = opCtx->recoveryUnit()->getTimestampReadSource(); std::string reason; const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason); - if (existing == RecoveryUnit::ReadSource::kUnset) { + if (existing == RecoveryUnit::ReadSource::kNoTimestamp) { // Shifting from reading without a timestamp to reading with a timestamp can be dangerous // because writes will appear to vanish. This case is intended for new reads on secondaries // and query yield recovery after state transitions from primary to secondary. @@ -122,13 +147,16 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC // Given readers do not survive rollbacks, it's okay to go from reading with a timestamp to // reading without one. More writes will become visible. if (!readAtLastApplied) { - LOGV2_DEBUG( - 4452902, 2, "Changing ReadSource to kUnset", logAttrs(nss), "reason"_attr = reason); - // This shift to kUnset assumes that callers will not make future attempts to manipulate - // their ReadSources after performing reads at an un-timetamped snapshot. The only - // exception is callers of this function that may need to change from kUnset to - // kLastApplied in the event of a catalog conflict or query yield. 
- return RecoveryUnit::ReadSource::kUnset; + LOGV2_DEBUG(4452902, + 2, + "Changing ReadSource to kNoTimestamp", + logAttrs(nss), + "reason"_attr = reason); + // This shift to kNoTimestamp assumes that callers will not make future attempts to + // manipulate their ReadSources after performing reads at an un-timestamped snapshot. The + // only exception is callers of this function that may need to change from kNoTimestamp + // to kLastApplied in the event of a catalog conflict or query yield. + return RecoveryUnit::ReadSource::kNoTimestamp; } } return boost::none; diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h index fa8fdd85f24..c24dfd16d8c 100644 --- a/src/mongo/db/storage/snapshot_helper.h +++ b/src/mongo/db/storage/snapshot_helper.h @@ -37,6 +37,10 @@ namespace SnapshotHelper { boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx, const NamespaceString& nss); +bool shouldReadAtLastApplied(OperationContext* opCtx, + const NamespaceString& nss, + std::string* reason = nullptr); + bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin, boost::optional<Timestamp> readTimestamp); } // namespace SnapshotHelper diff --git a/src/mongo/db/storage/storage_engine.h b/src/mongo/db/storage/storage_engine.h index aa06d951c9f..edf31b874fe 100644 --- a/src/mongo/db/storage/storage_engine.h +++ b/src/mongo/db/storage/storage_engine.h @@ -376,6 +376,14 @@ public: OperationContext* opCtx) = 0; /** + * Creates a temporary RecordStore on the storage engine for a resumable index build. On + * startup after an unclean shutdown, the storage engine will drop any un-dropped temporary + * record stores. + */ + virtual std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStoreForResumableIndexBuild( + OperationContext* opCtx) = 0; + + /** * Creates a temporary RecordStore on the storage engine from an existing ident on disk. 
On * startup after an unclean shutdown, the storage engine will drop any un-dropped temporary * record stores. @@ -465,6 +473,12 @@ public: std::shared_ptr<Ident> ident) = 0; /** + * Called when the checkpoint thread instructs the storage engine to take a checkpoint. The + * underlying storage engine must take a checkpoint at this point. + */ + virtual void checkpoint() = 0; + + /** * Recovers the storage engine state to the last stable timestamp. "Stable" in this case * refers to a timestamp that is guaranteed to never be rolled back. The stable timestamp * used should be one provided by StorageEngine::setStableTimestamp(). @@ -509,6 +523,11 @@ public: virtual void setStableTimestamp(Timestamp stableTimestamp, bool force = false) = 0; /** + * Returns the stable timestamp. + */ + virtual Timestamp getStableTimestamp() const = 0; + + /** * Tells the storage engine the timestamp of the data at startup. This is necessary because * timestamps are not persisted in the storage layer. */ diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index 88c183919e1..22c82a09eba 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -326,7 +326,7 @@ Status StorageEngineImpl::_recoverOrphanedCollection(OperationContext* opCtx, return Status::OK(); } -bool StorageEngineImpl::_handleInternalIdents( +bool StorageEngineImpl::_handleInternalIdent( OperationContext* opCtx, const std::string& ident, InternalIdentReconcilePolicy internalIdentReconcilePolicy, @@ -345,14 +345,15 @@ bool StorageEngineImpl::_handleInternalIdents( return true; } + if (!_catalog->isResumableIndexBuildIdent(ident)) { + return false; + } + // When starting up after a clean shutdown and resumable index builds are supported, find the // internal idents that contain the relevant information to resume each index build and recover // the state. 
auto rs = _engine->getRecordStore(opCtx, "", ident, CollectionOptions()); - // Look at the contents to determine whether this ident will contain information for - // resuming an index build. - // TODO SERVER-49215: differentiate the internal idents without looking at the contents. auto cursor = rs->getCursor(opCtx); auto record = cursor->next(); if (record) { @@ -360,36 +361,35 @@ bool StorageEngineImpl::_handleInternalIdents( // Parse the documents here so that we can restart the build if the document doesn't // contain all the necessary information to be able to resume building the index. - if (doc.hasField("phase")) { - ResumeIndexInfo resumeInfo; - try { - if (MONGO_unlikely(failToParseResumeIndexInfo.shouldFail())) { - uasserted(ErrorCodes::FailPointEnabled, - "failToParseResumeIndexInfo fail point is enabled"); - } - - resumeInfo = ResumeIndexInfo::parse(IDLParserErrorContext("ResumeIndexInfo"), doc); - } catch (const DBException& e) { - LOGV2(4916300, "Failed to parse resumable index info", "error"_attr = e.toStatus()); - - // Ignore the error so that we can restart the index build instead of resume it. We - // should drop the internal ident if we failed to parse. - internalIdentsToDrop->insert(ident); - return true; + ResumeIndexInfo resumeInfo; + try { + if (MONGO_unlikely(failToParseResumeIndexInfo.shouldFail())) { + uasserted(ErrorCodes::FailPointEnabled, + "failToParseResumeIndexInfo fail point is enabled"); } - reconcileResult->indexBuildsToResume.push_back(resumeInfo); + resumeInfo = ResumeIndexInfo::parse(IDLParserErrorContext("ResumeIndexInfo"), doc); + } catch (const DBException& e) { + LOGV2(4916300, "Failed to parse resumable index info", "error"_attr = e.toStatus()); - // Once we have parsed the resume info, we can safely drop the internal ident. + // Ignore the error so that we can restart the index build instead of resume it. We + // should drop the internal ident if we failed to parse. 
internalIdentsToDrop->insert(ident); - - LOGV2(4916301, - "Found unfinished index build to resume", - "buildUUID"_attr = resumeInfo.getBuildUUID(), - "collectionUUID"_attr = resumeInfo.getCollectionUUID(), - "phase"_attr = IndexBuildPhase_serializer(resumeInfo.getPhase())); return true; } + + reconcileResult->indexBuildsToResume.push_back(resumeInfo); + + // Once we have parsed the resume info, we can safely drop the internal ident. + internalIdentsToDrop->insert(ident); + + LOGV2(4916301, + "Found unfinished index build to resume", + "buildUUID"_attr = resumeInfo.getBuildUUID(), + "collectionUUID"_attr = resumeInfo.getCollectionUUID(), + "phase"_attr = IndexBuildPhase_serializer(resumeInfo.getPhase())); + + return true; } return false; @@ -448,12 +448,12 @@ StatusWith<StorageEngine::ReconcileResult> StorageEngineImpl::reconcileCatalogAn continue; } - if (_handleInternalIdents(opCtx, - it, - internalIdentReconcilePolicy, - &reconcileResult, - &internalIdentsToDrop, - &allInternalIdents)) { + if (_handleInternalIdent(opCtx, + it, + internalIdentReconcilePolicy, + &reconcileResult, + &internalIdentsToDrop, + &allInternalIdents)) { continue; } @@ -670,8 +670,6 @@ void StorageEngineImpl::finishInit() { // A storage engine may need to start threads that require OperationsContexts with real Lockers, // as opposed to LockerNoops. Placing the start logic here, after the StorageEngine has been // instantiated, causes makeOperationContext() to create LockerImpls instead of LockerNoops. 
- _engine->startAsyncThreads(); - if (_engine->supportsRecoveryTimestamp()) { _timestampMonitor = std::make_unique<TimestampMonitor>( _engine.get(), getGlobalServiceContext()->getPeriodicRunner()); @@ -864,10 +862,18 @@ std::unique_ptr<TemporaryRecordStore> StorageEngineImpl::makeTemporaryRecordStor OperationContext* opCtx) { std::unique_ptr<RecordStore> rs = _engine->makeTemporaryRecordStore(opCtx, _catalog->newInternalIdent()); - LOGV2_DEBUG(22258, + LOGV2_DEBUG(22258, 1, "Created temporary record store", "ident"_attr = rs->getIdent()); + return std::make_unique<TemporaryKVRecordStore>(getEngine(), std::move(rs)); +} + +std::unique_ptr<TemporaryRecordStore> +StorageEngineImpl::makeTemporaryRecordStoreForResumableIndexBuild(OperationContext* opCtx) { + std::unique_ptr<RecordStore> rs = + _engine->makeTemporaryRecordStore(opCtx, _catalog->newInternalResumableIndexBuildIdent()); + LOGV2_DEBUG(4921500, 1, - "created temporary record store: {rs_getIdent}", - "rs_getIdent"_attr = rs->getIdent()); + "Created temporary record store for resumable index build", + "ident"_attr = rs->getIdent()); return std::make_unique<TemporaryKVRecordStore>(getEngine(), std::move(rs)); } @@ -885,6 +891,10 @@ void StorageEngineImpl::setStableTimestamp(Timestamp stableTimestamp, bool force _engine->setStableTimestamp(stableTimestamp, force); } +Timestamp StorageEngineImpl::getStableTimestamp() const { + return _engine->getStableTimestamp(); +} + void StorageEngineImpl::setInitialDataTimestamp(Timestamp initialDataTimestamp) { _engine->setInitialDataTimestamp(initialDataTimestamp); } @@ -1025,6 +1035,10 @@ void StorageEngineImpl::addDropPendingIdent(const Timestamp& dropTimestamp, _dropPendingIdentReaper.addDropPendingIdent(dropTimestamp, nss, ident); } +void StorageEngineImpl::checkpoint() { + _engine->checkpoint(); +} + void StorageEngineImpl::_onMinOfCheckpointAndOldestTimestampChanged(const Timestamp& timestamp) { if (timestamp.isNull()) { return; diff --git 
a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h index 4761e1f3a38..fed128f9b59 100644 --- a/src/mongo/db/storage/storage_engine_impl.h +++ b/src/mongo/db/storage/storage_engine_impl.h @@ -113,6 +113,9 @@ public: virtual std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStore( OperationContext* opCtx) override; + virtual std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStoreForResumableIndexBuild( + OperationContext* opCtx) override; + virtual std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStoreFromExistingIdent( OperationContext* opCtx, StringData ident) override; @@ -120,6 +123,8 @@ public: virtual void setStableTimestamp(Timestamp stableTimestamp, bool force = false) override; + virtual Timestamp getStableTimestamp() const override; + virtual void setInitialDataTimestamp(Timestamp initialDataTimestamp) override; virtual Timestamp getInitialDataTimestamp() const override; @@ -312,6 +317,8 @@ public: const NamespaceString& nss, std::shared_ptr<Ident> ident) override; + void checkpoint() override; + DurableCatalog* getCatalog() override { return _catalog.get(); } @@ -386,12 +393,12 @@ private: * Returns whether the given ident is an internal ident and if it should be dropped or used to * resume an index build. 
*/ - bool _handleInternalIdents(OperationContext* opCtx, - const std::string& ident, - InternalIdentReconcilePolicy internalIdentReconcilePolicy, - ReconcileResult* reconcileResult, - std::set<std::string>* internalIdentsToDrop, - std::set<std::string>* allInternalIdents); + bool _handleInternalIdent(OperationContext* opCtx, + const std::string& ident, + InternalIdentReconcilePolicy internalIdentReconcilePolicy, + ReconcileResult* reconcileResult, + std::set<std::string>* internalIdentsToDrop, + std::set<std::string>* allInternalIdents); class RemoveDBChange; diff --git a/src/mongo/db/storage/storage_engine_mock.h b/src/mongo/db/storage/storage_engine_mock.h index 3a4a14bd9e6..96eb8020b1d 100644 --- a/src/mongo/db/storage/storage_engine_mock.h +++ b/src/mongo/db/storage/storage_engine_mock.h @@ -93,6 +93,10 @@ public: std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStore(OperationContext* opCtx) final { return {}; } + std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStoreForResumableIndexBuild( + OperationContext* opCtx) final { + return {}; + } std::unique_ptr<TemporaryRecordStore> makeTemporaryRecordStoreFromExistingIdent( OperationContext* opCtx, StringData ident) final { return {}; @@ -134,6 +138,9 @@ public: MONGO_UNREACHABLE; } void setStableTimestamp(Timestamp stableTimestamp, bool force = false) final {} + Timestamp getStableTimestamp() const override { + return Timestamp(); + } void setInitialDataTimestamp(Timestamp timestamp) final {} Timestamp getInitialDataTimestamp() const override { return Timestamp(); @@ -168,6 +175,7 @@ public: void addDropPendingIdent(const Timestamp& dropTimestamp, const NamespaceString& nss, std::shared_ptr<Ident> ident) final {} + void checkpoint() final {} Status currentFilesCompatible(OperationContext* opCtx) const final { return Status::OK(); } diff --git a/src/mongo/db/storage/storage_options.cpp b/src/mongo/db/storage/storage_options.cpp index 7ba94afde29..431698a807d 100644 --- 
a/src/mongo/db/storage/storage_options.cpp +++ b/src/mongo/db/storage/storage_options.cpp @@ -58,6 +58,7 @@ void StorageGlobalParams::reset() { oplogMinRetentionHours.store(0.0); allowOplogTruncation = true; disableLockFreeReads = true; + checkpointDelaySecs = 0; } StorageGlobalParams storageGlobalParams; diff --git a/src/mongo/db/storage/storage_options.h b/src/mongo/db/storage/storage_options.h index f6284a06244..e7fe5331f96 100644 --- a/src/mongo/db/storage/storage_options.h +++ b/src/mongo/db/storage/storage_options.h @@ -123,6 +123,10 @@ struct StorageGlobalParams { // settings with which lock-free reads are incompatible: standalone mode; and // enableMajorityReadConcern=false. bool disableLockFreeReads; + + // Delay in seconds between triggering the next checkpoint after the completion of the previous + // one. A value of 0 indicates that checkpointing will be skipped. + size_t checkpointDelaySecs; }; extern StorageGlobalParams storageGlobalParams; diff --git a/src/mongo/db/storage/wiredtiger/SConscript b/src/mongo/db/storage/wiredtiger/SConscript index 0cf7d92ce08..5d24feec685 100644 --- a/src/mongo/db/storage/wiredtiger/SConscript +++ b/src/mongo/db/storage/wiredtiger/SConscript @@ -139,6 +139,7 @@ if wiredtiger: '$BUILD_DIR/mongo/db/service_context', '$BUILD_DIR/mongo/db/service_context_d', '$BUILD_DIR/mongo/db/service_context_test_fixture', + '$BUILD_DIR/mongo/db/storage/checkpointer', '$BUILD_DIR/mongo/db/storage/durable_catalog_impl', '$BUILD_DIR/mongo/db/storage/kv/kv_engine_test_harness', '$BUILD_DIR/mongo/db/storage/recovery_unit_test_harness', diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.cpp index d7bba3ee94d..8149bab8757 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.cpp @@ -43,11 +43,6 @@ WiredTigerGlobalOptions wiredTigerGlobalOptions; Status 
WiredTigerGlobalOptions::store(const moe::Environment& params) { // WiredTiger storage engine options - if (params.count("storage.syncPeriodSecs")) { - wiredTigerGlobalOptions.checkpointDelaySecs = - static_cast<size_t>(params["storage.syncPeriodSecs"].as<double>()); - } - if (!wiredTigerGlobalOptions.engineConfig.empty()) { LOGV2(22293, "Engine custom option: {wiredTigerGlobalOptions_engineConfig}", diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.h b/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.h index 21d4c522f3b..51546164c39 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_global_options.h @@ -40,7 +40,6 @@ class WiredTigerGlobalOptions { public: WiredTigerGlobalOptions() : cacheSizeGB(0), - checkpointDelaySecs(0), statisticsLogDelaySecs(0), directoryForIndexes(false), maxCacheOverflowFileSizeGBDeprecated(0), @@ -50,7 +49,6 @@ public: Status store(const optionenvironment::Environment& params); double cacheSizeGB; - size_t checkpointDelaySecs; size_t statisticsLogDelaySecs; std::string journalCompressor; bool directoryForIndexes; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 1553c1740fe..f169f952e05 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -119,8 +119,6 @@ namespace { MONGO_FAIL_POINT_DEFINE(WTPreserveSnapshotHistoryIndefinitely); MONGO_FAIL_POINT_DEFINE(WTSetOldestTSToStableTS); -MONGO_FAIL_POINT_DEFINE(pauseCheckpointThread); - } // namespace bool WiredTigerFileVersion::shouldDowngrade(bool readOnly, @@ -255,231 +253,6 @@ std::string toString(const StorageEngine::OldestActiveTransactionTimestampResult } } -class WiredTigerKVEngine::WiredTigerCheckpointThread : public BackgroundJob { -public: - explicit WiredTigerCheckpointThread(WiredTigerKVEngine* wiredTigerKVEngine, - 
WiredTigerSessionCache* sessionCache) - : BackgroundJob(false /* deleteSelf */), - _wiredTigerKVEngine(wiredTigerKVEngine), - _sessionCache(sessionCache) {} - - virtual string name() const { - return "WTCheckpointThread"; - } - - virtual void run() { - ThreadClient tc(name(), getGlobalServiceContext()); - LOGV2_DEBUG(22307, 1, "Starting thread", "threadName"_attr = name()); - - while (true) { - auto opCtx = tc->makeOperationContext(); - - { - stdx::unique_lock<Latch> lock(_mutex); - MONGO_IDLE_THREAD_BLOCK; - - // Wait for 'wiredTigerGlobalOptions.checkpointDelaySecs' seconds; or until either - // shutdown is signaled or a checkpoint is triggered. - _condvar.wait_for(lock, - stdx::chrono::seconds(static_cast<std::int64_t>( - wiredTigerGlobalOptions.checkpointDelaySecs)), - [&] { return _shuttingDown || _triggerCheckpoint; }); - - // If the checkpointDelaySecs is set to 0, that means we should skip checkpointing. - // However, checkpointDelaySecs is adjustable by a runtime server parameter, so we - // need to wake up to check periodically. The wakeup to check period is arbitrary. - while (wiredTigerGlobalOptions.checkpointDelaySecs == 0 && !_shuttingDown && - !_triggerCheckpoint) { - _condvar.wait_for(lock, - stdx::chrono::seconds(static_cast<std::int64_t>(3)), - [&] { return _shuttingDown || _triggerCheckpoint; }); - } - - if (_shuttingDown) { - LOGV2_DEBUG(22309, 1, "Stopping thread", "threadName"_attr = name()); - return; - } - - // Clear the trigger so we do not immediately checkpoint again after this. - _triggerCheckpoint = false; - } - - pauseCheckpointThread.pauseWhileSet(); - - const Date_t startTime = Date_t::now(); - - const Timestamp stableTimestamp = _wiredTigerKVEngine->getStableTimestamp(); - const Timestamp initialDataTimestamp = _wiredTigerKVEngine->getInitialDataTimestamp(); - - // The amount of oplog to keep is primarily dictated by a user setting. 
However, in - // unexpected cases, durable, recover to a timestamp storage engines may need to play - // forward from an oplog entry that would otherwise be truncated by the user - // setting. Furthermore, the entries in prepared or large transactions can refer to - // previous entries in the same transaction. - // - // Live (replication) rollback will replay oplogs from exactly the stable timestamp. - // With prepared or large transactions, it may require some additional entries prior to - // the stable timestamp. These requirements are summarized in getOplogNeededForRollback. - // Truncating the oplog at this point is sufficient for in-memory configurations, but - // could cause an unrecoverable scenario if the node crashed and has to play from the - // last stable checkpoint. - // - // By recording the oplog needed for rollback "now", then taking a stable checkpoint, - // we can safely assume that the oplog needed for crash recovery has caught up to the - // recorded value. After the checkpoint, this value will be published such that actors - // which truncate the oplog can read an updated value. - try { - // Three cases: - // - // First, initialDataTimestamp is Timestamp(0, 1) -> Take full checkpoint. This is - // when there is no consistent view of the data (i.e: during initial sync). - // - // Second, stableTimestamp < initialDataTimestamp: Skip checkpoints. The data on - // disk is prone to being rolled back. Hold off on checkpoints. Hope that the - // stable timestamp surpasses the data on disk, allowing storage to persist newer - // copies to disk. - // - // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady - // state case. 
- if (initialDataTimestamp.asULL() <= 1) { - UniqueWiredTigerSession session = _sessionCache->getSession(); - WT_SESSION* s = session->getSession(); - invariantWTOK(s->checkpoint(s, "use_timestamp=false")); - } else if (stableTimestamp < initialDataTimestamp) { - LOGV2_FOR_RECOVERY( - 23985, - 2, - "Stable timestamp is behind the initial data timestamp, skipping " - "a checkpoint. StableTimestamp: {stableTimestamp} InitialDataTimestamp: " - "{initialDataTimestamp}", - "stableTimestamp"_attr = stableTimestamp.toString(), - "initialDataTimestamp"_attr = initialDataTimestamp.toString()); - } else { - auto oplogNeededForRollback = _wiredTigerKVEngine->getOplogNeededForRollback(); - - LOGV2_FOR_RECOVERY( - 23986, - 2, - "Performing stable checkpoint. StableTimestamp: {stableTimestamp}, " - "OplogNeededForRollback: {oplogNeededForRollback}", - "stableTimestamp"_attr = stableTimestamp, - "oplogNeededForRollback"_attr = toString(oplogNeededForRollback)); - - UniqueWiredTigerSession session = _sessionCache->getSession(); - WT_SESSION* s = session->getSession(); - invariantWTOK(s->checkpoint(s, "use_timestamp=true")); - - if (oplogNeededForRollback.isOK()) { - // Now that the checkpoint is durable, publish the oplog needed to recover - // from it. 
- stdx::lock_guard<Latch> lk(_oplogNeededForCrashRecoveryMutex); - _oplogNeededForCrashRecovery.store( - oplogNeededForRollback.getValue().asULL()); - } - } - - const auto secondsElapsed = durationCount<Seconds>(Date_t::now() - startTime); - if (secondsElapsed >= 30) { - LOGV2_DEBUG(22308, - 1, - "Checkpoint took {secondsElapsed} seconds to complete.", - "secondsElapsed"_attr = secondsElapsed); - } - } catch (const WriteConflictException&) { - // Temporary: remove this after WT-3483 - LOGV2_WARNING(22346, "Checkpoint encountered a write conflict exception."); - } catch (const AssertionException& exc) { - invariant(ErrorCodes::isShutdownError(exc.code()), exc.what()); - } - } - } - - /** - * Returns true if we have already triggered taking the first checkpoint. - */ - bool hasTriggeredFirstStableCheckpoint() { - stdx::unique_lock<Latch> lock(_mutex); - return _hasTriggeredFirstStableCheckpoint; - } - - /** - * Triggers taking the first stable checkpoint, which is when the stable timestamp advances past - * the initial data timestamp. - * - * The checkpoint thread runs automatically every wiredTigerGlobalOptions.checkpointDelaySecs - * seconds. This function avoids potentially waiting that full duration for a stable checkpoint, - * initiating one immediately. - * - * Do not call this function if hasTriggeredFirstStableCheckpoint() returns true. - */ - void triggerFirstStableCheckpoint(Timestamp prevStable, - Timestamp initialData, - Timestamp currStable) { - stdx::unique_lock<Latch> lock(_mutex); - invariant(!_hasTriggeredFirstStableCheckpoint); - if (prevStable < initialData && currStable >= initialData) { - LOGV2(22310, - "Triggering the first stable checkpoint. 
Initial Data: {initialData} PrevStable: " - "{prevStable} CurrStable: {currStable}", - "Triggering the first stable checkpoint", - "initialData"_attr = initialData, - "prevStable"_attr = prevStable, - "currStable"_attr = currStable); - _hasTriggeredFirstStableCheckpoint = true; - _triggerCheckpoint = true; - _condvar.notify_one(); - } - } - - std::uint64_t getOplogNeededForCrashRecovery() const { - return _oplogNeededForCrashRecovery.load(); - } - - /* - * Atomically assign _oplogNeededForCrashRecovery to a variable. - * _oplogNeededForCrashRecovery will not change during assignment. - */ - void assignOplogNeededForCrashRecoveryTo(boost::optional<Timestamp>* timestamp) { - stdx::lock_guard<Latch> lk(_oplogNeededForCrashRecoveryMutex); - *timestamp = Timestamp(_oplogNeededForCrashRecovery.load()); - } - - void shutdown() { - { - stdx::unique_lock<Latch> lock(_mutex); - _shuttingDown = true; - // Wake up the checkpoint thread early, to take a final checkpoint before shutting - // down, if one has not coincidentally just been taken. - _condvar.notify_one(); - } - wait(); - } - -private: - WiredTigerKVEngine* _wiredTigerKVEngine; - WiredTigerSessionCache* _sessionCache; - - Mutex _oplogNeededForCrashRecoveryMutex = - MONGO_MAKE_LATCH("WiredTigerCheckpointThread::_oplogNeededForCrashRecoveryMutex"); - AtomicWord<std::uint64_t> _oplogNeededForCrashRecovery; - - // Protects the state below. - Mutex _mutex = MONGO_MAKE_LATCH("WiredTigerCheckpointThread::_mutex"); - - // The checkpoint thread idles on this condition variable for a particular time duration between - // taking checkpoints. It can be triggered early to expedite either: immediate checkpointing if - // _triggerCheckpoint is set; or shutdown cleanup if _shuttingDown is set. - stdx::condition_variable _condvar; - - bool _shuttingDown = false; - - // This flag ensures the first stable checkpoint is only triggered once. 
- bool _hasTriggeredFirstStableCheckpoint = false; - - // This flag allows the checkpoint thread to wake up early when _condvar is signaled. - bool _triggerCheckpoint = false; -}; - namespace { TicketHolder openWriteTransaction(128); TicketHolder openReadTransaction(128); @@ -759,16 +532,6 @@ WiredTigerKVEngine::~WiredTigerKVEngine() { _sessionCache.reset(nullptr); } -void WiredTigerKVEngine::startAsyncThreads() { - if (!_ephemeral) { - if (!_readOnly) { - _checkpointThread = - std::make_unique<WiredTigerCheckpointThread>(this, _sessionCache.get()); - _checkpointThread->go(); - } - } -} - void WiredTigerKVEngine::notifyStartupComplete() { WiredTigerUtil::notifyStartupComplete(); } @@ -898,11 +661,6 @@ void WiredTigerKVEngine::cleanShutdown() { _sessionSweeper->shutdown(); LOGV2(22319, "Finished shutting down session sweeper thread"); } - if (_checkpointThread) { - LOGV2(22322, "Shutting down checkpoint thread"); - _checkpointThread->shutdown(); - LOGV2(22323, "Finished shutting down checkpoint thread"); - } LOGV2_FOR_RECOVERY(23988, 2, "Shutdown timestamps.", @@ -1385,7 +1143,7 @@ WiredTigerKVEngine::beginNonBlockingBackup(OperationContext* opCtx, // Oplog truncation thread won't remove oplog since the checkpoint pinned by the backup cursor. stdx::lock_guard<Latch> lock(_oplogPinnedByBackupMutex); - _checkpointThread->assignOplogNeededForCrashRecoveryTo(&_oplogPinnedByBackup); + _oplogPinnedByBackup = Timestamp(_oplogNeededForCrashRecovery.load()); auto pinOplogGuard = makeGuard([&] { _oplogPinnedByBackup = boost::none; }); // Persist the sizeStorer information to disk before opening the backup cursor. We aren't @@ -1907,6 +1665,74 @@ bool WiredTigerKVEngine::supportsDirectoryPerDB() const { return true; } +void WiredTigerKVEngine::checkpoint() { + const Timestamp stableTimestamp = getStableTimestamp(); + const Timestamp initialDataTimestamp = getInitialDataTimestamp(); + + // The amount of oplog to keep is primarily dictated by a user setting. 
However, in unexpected + // cases, durable, recover to a timestamp storage engines may need to play forward from an oplog + // entry that would otherwise be truncated by the user setting. Furthermore, the entries in + // prepared or large transactions can refer to previous entries in the same transaction. + // + // Live (replication) rollback will replay the oplog from exactly the stable timestamp. With + // prepared or large transactions, it may require some additional entries prior to the stable + // timestamp. These requirements are summarized in getOplogNeededForRollback. Truncating the + // oplog at this point is sufficient for in-memory configurations, but could cause an + // unrecoverable scenario if the node crashed and has to play from the last stable checkpoint. + // + // By recording the oplog needed for rollback "now", then taking a stable checkpoint, we can + // safely assume that the oplog needed for crash recovery has caught up to the recorded value. + // After the checkpoint, this value will be published such that actors which truncate the oplog + // can read an updated value. + try { + // Three cases: + // + // First, initialDataTimestamp is Timestamp(0, 1) -> Take full checkpoint. This is when + // there is no consistent view of the data (i.e: during initial sync). + // + // Second, stableTimestamp < initialDataTimestamp: Skip checkpoints. The data on disk is + // prone to being rolled back. Hold off on checkpoints. Hope that the stable timestamp + // surpasses the data on disk, allowing storage to persist newer copies to disk. + // + // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady state + // case. 
+ if (initialDataTimestamp.asULL() <= 1) { + UniqueWiredTigerSession session = _sessionCache->getSession(); + WT_SESSION* s = session->getSession(); + invariantWTOK(s->checkpoint(s, "use_timestamp=false")); + } else if (stableTimestamp < initialDataTimestamp) { + LOGV2_FOR_RECOVERY( + 23985, + 2, + "Stable timestamp is behind the initial data timestamp, skipping a checkpoint.", + "stableTimestamp"_attr = stableTimestamp.toString(), + "initialDataTimestamp"_attr = initialDataTimestamp.toString()); + } else { + auto oplogNeededForRollback = getOplogNeededForRollback(); + + LOGV2_FOR_RECOVERY(23986, + 2, + "Performing stable checkpoint.", + "stableTimestamp"_attr = stableTimestamp, + "oplogNeededForRollback"_attr = toString(oplogNeededForRollback)); + + UniqueWiredTigerSession session = _sessionCache->getSession(); + WT_SESSION* s = session->getSession(); + invariantWTOK(s->checkpoint(s, "use_timestamp=true")); + + if (oplogNeededForRollback.isOK()) { + // Now that the checkpoint is durable, publish the oplog needed to recover from it. + _oplogNeededForCrashRecovery.store(oplogNeededForRollback.getValue().asULL()); + } + } + } catch (const WriteConflictException&) { + // TODO SERVER-50824: Check if this can be removed now that WT-3483 is done. + LOGV2_WARNING(22346, "Checkpoint encountered a write conflict exception."); + } catch (const AssertionException& exc) { + invariant(ErrorCodes::isShutdownError(exc.code()), exc.what()); + } +} + bool WiredTigerKVEngine::hasIdent(OperationContext* opCtx, StringData ident) const { return _hasUri(WiredTigerRecoveryUnit::get(opCtx)->getSession()->getSession(), _uri(ident)); } @@ -2045,10 +1871,6 @@ void WiredTigerKVEngine::setStableTimestamp(Timestamp stableTimestamp, bool forc // After publishing a stable timestamp to WT, we can record the updated stable timestamp value // for the necessary oplog to keep. 
_stableTimestamp.store(stableTimestamp.asULL()); - if (_checkpointThread && !_checkpointThread->hasTriggeredFirstStableCheckpoint()) { - _checkpointThread->triggerFirstStableCheckpoint( - prevStable, Timestamp(_initialDataTimestamp.load()), stableTimestamp); - } // If 'force' is set, then we have already set the oldest timestamp equal to the stable // timestamp, so there is nothing left to do. @@ -2193,13 +2015,6 @@ StatusWith<Timestamp> WiredTigerKVEngine::recoverToStableTimestamp(OperationCont 23989, 2, "WiredTiger::RecoverToStableTimestamp syncing size storer to disk."); syncSizeInfo(true); - if (!_ephemeral) { - LOGV2_FOR_ROLLBACK( - 23990, 2, "WiredTiger::RecoverToStableTimestamp shutting down checkpoint thread."); - // Shutdown WiredTigerKVEngine owned accesses into the storage engine. - _checkpointThread->shutdown(); - } - const Timestamp stableTimestamp(_stableTimestamp.load()); const Timestamp initialDataTimestamp(_initialDataTimestamp.load()); @@ -2216,11 +2031,6 @@ StatusWith<Timestamp> WiredTigerKVEngine::recoverToStableTimestamp(OperationCont str::stream() << "Error rolling back to stable. 
Err: " << wiredtiger_strerror(ret)}; } - if (!_ephemeral) { - _checkpointThread = std::make_unique<WiredTigerCheckpointThread>(this, _sessionCache.get()); - _checkpointThread->go(); - } - _sizeStorer = std::make_unique<WiredTigerSizeStorer>(_conn, _sizeStorerUri, _readOnly); return {stableTimestamp}; @@ -2345,7 +2155,7 @@ boost::optional<Timestamp> WiredTigerKVEngine::getOplogNeededForCrashRecovery() return boost::none; } - return Timestamp(_checkpointThread->getOplogNeededForCrashRecovery()); + return Timestamp(_oplogNeededForCrashRecovery.load()); } Timestamp WiredTigerKVEngine::getPinnedOplog() const { diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h index 9327ae7454f..bfd539e7815 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h @@ -103,8 +103,6 @@ public: ~WiredTigerKVEngine(); - void startAsyncThreads() override; - void notifyStartupComplete() override; void setRecordStoreExtraOptions(const std::string& options); @@ -119,6 +117,8 @@ public: return !isEphemeral(); } + void checkpoint() override; + bool isDurable() const override { return _durable; } @@ -369,7 +369,6 @@ public: private: class WiredTigerSessionSweeper; - class WiredTigerCheckpointThread; /** * Opens a connection on the WiredTiger database 'path' with the configuration 'wtOpenConfig'. @@ -458,7 +457,6 @@ private: const bool _keepDataHistory = true; std::unique_ptr<WiredTigerSessionSweeper> _sessionSweeper; - std::unique_ptr<WiredTigerCheckpointThread> _checkpointThread; std::string _rsOptions; std::string _indexOptions; @@ -485,6 +483,8 @@ private: // timestamp. Provided by replication layer because WT does not persist timestamps. 
AtomicWord<std::uint64_t> _initialDataTimestamp; + AtomicWord<std::uint64_t> _oplogNeededForCrashRecovery; + std::unique_ptr<WiredTigerEngineRuntimeConfigParameter> _runTimeConfigParam; mutable Mutex _highestDurableTimestampMutex = diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine_test.cpp index b870c017798..2580960a76c 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine_test.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine_test.cpp @@ -43,7 +43,7 @@ #include "mongo/db/repl/replication_coordinator_mock.h" #include "mongo/db/service_context.h" #include "mongo/db/service_context_test_fixture.h" -#include "mongo/db/storage/wiredtiger/wiredtiger_global_options.h" +#include "mongo/db/storage/checkpointer.h" #include "mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h" #include "mongo/db/storage/wiredtiger/wiredtiger_record_store.h" #include "mongo/logv2/log.h" @@ -82,19 +82,16 @@ public: private: std::unique_ptr<WiredTigerKVEngine> makeEngine() { - auto engine = std::make_unique<WiredTigerKVEngine>(kWiredTigerEngineName, - _dbpath.path(), - _cs.get(), - "", - 1, - 0, - false, - false, - _forRepair, - false); - // There are unit tests expecting checkpoints to occur asynchronously. - engine->startAsyncThreads(); - return engine; + return std::make_unique<WiredTigerKVEngine>(kWiredTigerEngineName, + _dbpath.path(), + _cs.get(), + "", + 1, + 0, + false, + false, + _forRepair, + false); } const std::unique_ptr<ClockSource> _cs = std::make_unique<ClockSourceMock>(); @@ -246,6 +243,9 @@ TEST_F(WiredTigerKVEngineRepairTest, UnrecoverableOrphanedDataFilesAreRebuilt) { } TEST_F(WiredTigerKVEngineTest, TestOplogTruncation) { + std::unique_ptr<Checkpointer> checkpointer = std::make_unique<Checkpointer>(_engine); + checkpointer->go(); + auto opCtxPtr = makeOperationContext(); // The initial data timestamp has to be set to take stable checkpoints. 
The first stable // timestamp greater than this will also trigger a checkpoint. The following loop of the @@ -262,7 +262,7 @@ TEST_F(WiredTigerKVEngineTest, TestOplogTruncation) { #endif #endif { - wiredTigerGlobalOptions.checkpointDelaySecs = 1; + storageGlobalParams.checkpointDelaySecs = 1; } (); @@ -341,6 +341,8 @@ TEST_F(WiredTigerKVEngineTest, TestOplogTruncation) { _engine->setStableTimestamp(Timestamp(30, 1), false); callbackShouldFail.store(false); assertPinnedMovesSoon(Timestamp(40, 1)); + + checkpointer->shutdown({ErrorCodes::ShutdownInProgress, "Test finished"}); } std::unique_ptr<KVHarnessHelper> makeHelper() { diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp index 1167fd673f3..b3cc4c6dde7 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp @@ -445,7 +445,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() // transaction to establish a read timestamp, but only for ReadSources that are expected to have // read timestamps. switch (_timestampReadSource) { - case ReadSource::kUnset: case ReadSource::kNoTimestamp: return boost::none; case ReadSource::kMajorityCommitted: @@ -484,7 +483,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp() return _readAtTimestamp; // The follow ReadSources returned values in the first switch block. 
- case ReadSource::kUnset: case ReadSource::kNoTimestamp: case ReadSource::kMajorityCommitted: case ReadSource::kProvided: @@ -507,7 +505,6 @@ void WiredTigerRecoveryUnit::_txnOpen() { WT_SESSION* session = _session->getSession(); switch (_timestampReadSource) { - case ReadSource::kUnset: case ReadSource::kNoTimestamp: { if (_isOplogReader) { _oplogVisibleTs = static_cast<std::int64_t>(_oplogManager->getOplogReadTimestamp()); @@ -827,7 +824,6 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource, "setting timestamp read source", "readSource"_attr = toString(readSource), "provided"_attr = ((provided) ? provided->toString() : "none")); - invariant(!_isActive() || _timestampReadSource == readSource, str::stream() << "Current state: " << toString(_getState()) << ". Invalid internal state while setting timestamp read source: " diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h index 312a46f5c09..0d557fc6329 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h @@ -250,7 +250,7 @@ private: bool _isTimestamped = false; // Specifies which external source to use when setting read timestamps on transactions. - ReadSource _timestampReadSource = ReadSource::kUnset; + ReadSource _timestampReadSource = ReadSource::kNoTimestamp; // Commits are assumed ordered. Unordered commits are assumed to always need to reserve a // new optime, and thus always call oplogDiskLocRegister() on the record store. 
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp index b50d4b79889..2dde320ceeb 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp @@ -61,13 +61,6 @@ public: false, // .repair false // .readOnly ) { - // Deliberately not calling _engine->startAsyncThreads() because it starts an asynchronous - // checkpointing thread that can interfere with unit tests manipulating checkpoints - // manually. - // - // Alternatively, we would have to start using wiredTigerGlobalOptions.checkpointDelaySecs - // to set a high enough value such that the async thread never runs during testing. - repl::ReplicationCoordinator::set( getGlobalServiceContext(), std::unique_ptr<repl::ReplicationCoordinator>(new repl::ReplicationCoordinatorMock( @@ -203,7 +196,8 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) { } // Read without a timestamp. The write should be visible. - ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset); + ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), + RecoveryUnit::ReadSource::kNoTimestamp); RecordData unused; ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); @@ -237,7 +231,7 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) { // Read without a timestamp, and we should see the first and third records. 
opCtx1->recoveryUnit()->abandonSnapshot(); - opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused)); ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused)); ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused)); diff --git a/src/mongo/db/transaction_participant.cpp b/src/mongo/db/transaction_participant.cpp index 742bfd087b4..de5874ae3a6 100644 --- a/src/mongo/db/transaction_participant.cpp +++ b/src/mongo/db/transaction_participant.cpp @@ -124,8 +124,9 @@ struct ActiveTransactionHistory { ActiveTransactionHistory fetchActiveTransactionHistory(OperationContext* opCtx, const LogicalSessionId& lsid) { - // Restore the current timestamp read source after fetching transaction history. - ReadSourceScope readSourceScope(opCtx); + // Restore the current timestamp read source after fetching transaction history using + // DBDirectClient, which may change our ReadSource. 
+ ReadSourceScope readSourceScope(opCtx, RecoveryUnit::ReadSource::kNoTimestamp); ActiveTransactionHistory result; diff --git a/src/mongo/db/transaction_participant.h b/src/mongo/db/transaction_participant.h index f898b21c112..37b71ce8589 100644 --- a/src/mongo/db/transaction_participant.h +++ b/src/mongo/db/transaction_participant.h @@ -33,11 +33,11 @@ #include <iostream> #include <map> +#include "mongo/db/api_parameters.h" #include "mongo/db/catalog/uncommitted_collections.h" #include "mongo/db/commands/txn_cmds_gen.h" #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/concurrency/locker.h" -#include "mongo/db/initialize_api_parameters.h" #include "mongo/db/logical_session_id.h" #include "mongo/db/multi_key_path_tracker.h" #include "mongo/db/ops/update_request.h" diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp index 84d533e2069..022dfb970ed 100644 --- a/src/mongo/dbtests/querytests.cpp +++ b/src/mongo/dbtests/querytests.cpp @@ -117,7 +117,7 @@ protected: uassertStatusOK(indexer.insertAllDocumentsInCollection(&_opCtx, _collection)); uassertStatusOK( indexer.drainBackgroundWrites(&_opCtx, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); uassertStatusOK(indexer.checkConstraints(&_opCtx)); { diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index 750c8ac447d..d270e0467d7 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -103,7 +103,7 @@ public: OneOffRead(OperationContext* opCtx, const Timestamp& ts) : _opCtx(opCtx) { _opCtx->recoveryUnit()->abandonSnapshot(); if (ts.isNull()) { - _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); } else { 
_opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kProvided, ts); } @@ -111,7 +111,7 @@ public: ~OneOffRead() { _opCtx->recoveryUnit()->abandonSnapshot(); - _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); } private: @@ -234,7 +234,7 @@ public: */ void reset(NamespaceString nss) const { ::mongo::writeConflictRetry(_opCtx, "deleteAll", nss.ns(), [&] { - _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset); + _opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp); AutoGetCollection collRaii(_opCtx, nss, LockMode::MODE_X); if (collRaii) { @@ -2057,7 +2057,7 @@ public: firstInsert.asTimestamp()); ASSERT_OK(indexer.drainBackgroundWrites(_opCtx, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); auto indexCatalog = autoColl.getCollection()->getIndexCatalog(); @@ -2100,7 +2100,7 @@ public: setReplCoordAppliedOpTime(repl::OpTime(afterSecondInsert.asTimestamp(), presentTerm)); ASSERT_OK(indexer.drainBackgroundWrites(_opCtx, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); { @@ -2843,7 +2843,7 @@ public: ASSERT_FALSE(buildingIndex->indexBuildInterceptor()->areAllWritesApplied(_opCtx)); ASSERT_OK(indexer.drainBackgroundWrites(_opCtx, - RecoveryUnit::ReadSource::kUnset, + RecoveryUnit::ReadSource::kNoTimestamp, IndexBuildInterceptor::DrainYieldPolicy::kNoYield)); diff --git a/src/mongo/executor/SConscript b/src/mongo/executor/SConscript index 76956c8818e..b10cf01369a 100644 --- a/src/mongo/executor/SConscript +++ b/src/mongo/executor/SConscript @@ -31,6 +31,7 @@ env.Library( 'remote_command_response.cpp', ], LIBDEPS=[ + '$BUILD_DIR/mongo/db/api_parameters', '$BUILD_DIR/mongo/rpc/metadata', 
'$BUILD_DIR/mongo/util/net/network', ] diff --git a/src/mongo/executor/remote_command_request.cpp b/src/mongo/executor/remote_command_request.cpp index 875da25ef9f..4c35525e6a9 100644 --- a/src/mongo/executor/remote_command_request.cpp +++ b/src/mongo/executor/remote_command_request.cpp @@ -34,6 +34,7 @@ #include <fmt/format.h> #include "mongo/bson/simple_bsonobj_comparator.h" +#include "mongo/db/api_parameters.h" #include "mongo/db/operation_context.h" #include "mongo/db/query/query_request.h" #include "mongo/platform/atomic_word.h" @@ -86,6 +87,12 @@ RemoteCommandRequestBase::RemoteCommandRequestBase(RequestId requestId, cmdObj = cmdObj.addField(BSON("clientOperationKey" << operationKey.get()).firstElement()); } + if (opCtx && APIParameters::get(opCtx).getParamsPassed()) { + BSONObjBuilder bob(std::move(cmdObj)); + APIParameters::get(opCtx).appendInfo(&bob); + cmdObj = bob.obj(); + } + _updateTimeoutFromOpCtxDeadline(opCtx); } diff --git a/src/mongo/s/catalog_cache.cpp b/src/mongo/s/catalog_cache.cpp index 19846e62b48..d9c2500f2d3 100644 --- a/src/mongo/s/catalog_cache.cpp +++ b/src/mongo/s/catalog_cache.cpp @@ -55,6 +55,7 @@ #include "mongo/util/timer.h" namespace mongo { + const OperationContext::Decoration<bool> operationShouldBlockBehindCatalogCacheRefresh = OperationContext::declareDecoration<bool>(); @@ -68,81 +69,8 @@ namespace { const int kMaxInconsistentRoutingInfoRefreshAttempts = 3; const int kDatabaseCacheSize = 10000; -/** - * Returns whether two shard versions have a matching epoch. - */ -bool shardVersionsHaveMatchingEpoch(boost::optional<ChunkVersion> wanted, - const ChunkVersion& received) { - return wanted && wanted->epoch() == received.epoch(); -}; - -/** - * Given an (optional) initial routing table and a set of changed chunks returned by the catalog - * cache loader, produces a new routing table with the changes applied. - * - * If the collection is no longer sharded returns nullptr. 
If the epoch has changed, expects that - * the 'collectionChunksList' contains the full contents of the chunks collection for that namespace - * so that the routing table can be built from scratch. - * - * Throws ConflictingOperationInProgress if the chunk metadata was found to be inconsistent (not - * containing all the necessary chunks, contains overlaps or chunks' epoch values are not the same - * as that of the collection). Since this situation may be transient, due to the collection being - * dropped or having its shard key refined concurrently, the caller must retry the reload up to some - * configurable number of attempts. - */ -std::shared_ptr<RoutingTableHistory> refreshCollectionRoutingInfo( - OperationContext* opCtx, - const NamespaceString& nss, - std::shared_ptr<RoutingTableHistory> existingRoutingInfo, - StatusWith<CatalogCacheLoader::CollectionAndChangedChunks> swCollectionAndChangedChunks) { - if (swCollectionAndChangedChunks == ErrorCodes::NamespaceNotFound) { - return nullptr; - } - const auto collectionAndChunks = uassertStatusOK(std::move(swCollectionAndChangedChunks)); - - auto chunkManager = [&] { - // If we have routing info already and it's for the same collection epoch, we're updating. - // Otherwise, we're making a whole new routing table. 
- if (existingRoutingInfo && - existingRoutingInfo->getVersion().epoch() == collectionAndChunks.epoch) { - if (collectionAndChunks.changedChunks.size() == 1 && - collectionAndChunks.changedChunks[0].getVersion() == - existingRoutingInfo->getVersion()) - return existingRoutingInfo; - - return std::make_shared<RoutingTableHistory>( - existingRoutingInfo->makeUpdated(std::move(collectionAndChunks.reshardingFields), - collectionAndChunks.changedChunks)); - } - - auto defaultCollator = [&]() -> std::unique_ptr<CollatorInterface> { - if (!collectionAndChunks.defaultCollation.isEmpty()) { - // The collation should have been validated upon collection creation - return uassertStatusOK(CollatorFactoryInterface::get(opCtx->getServiceContext()) - ->makeFromBSON(collectionAndChunks.defaultCollation)); - } - return nullptr; - }(); - - return std::make_shared<RoutingTableHistory>( - RoutingTableHistory::makeNew(nss, - collectionAndChunks.uuid, - KeyPattern(collectionAndChunks.shardKeyPattern), - std::move(defaultCollator), - collectionAndChunks.shardKeyIsUnique, - collectionAndChunks.epoch, - std::move(collectionAndChunks.reshardingFields), - collectionAndChunks.changedChunks)); - }(); - - std::set<ShardId> shardIds; - chunkManager->getAllShardIds(&shardIds); - for (const auto& shardId : shardIds) { - uassertStatusOK(Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardId)); - } - return chunkManager; -} +const int kCollectionCacheSize = 10000; } // namespace @@ -155,7 +83,8 @@ CatalogCache::CatalogCache(ServiceContext* const service, CatalogCacheLoader& ca options.maxThreads = 6; return options; }())), - _databaseCache(service, *_executor, _cacheLoader) { + _databaseCache(service, *_executor, _cacheLoader), + _collectionCache(service, *_executor, _cacheLoader) { _executor->startup(); } @@ -190,111 +119,89 @@ StatusWith<CachedDatabaseInfo> CatalogCache::getDatabase(OperationContext* opCtx } } -StatusWith<ChunkManager> CatalogCache::getCollectionRoutingInfo(OperationContext* 
opCtx, - const NamespaceString& nss) { - return _getCollectionRoutingInfo(opCtx, nss).statusWithInfo; -} - -CatalogCache::RefreshResult CatalogCache::_getCollectionRoutingInfoWithForcedRefresh( - OperationContext* opCtx, const NamespaceString& nss) { - setOperationShouldBlockBehindCatalogCacheRefresh(opCtx, true); - _createOrGetCollectionEntryAndMarkAsNeedsRefresh(nss); - return _getCollectionRoutingInfo(opCtx, nss); -} - -CatalogCache::RefreshResult CatalogCache::_getCollectionRoutingInfo(OperationContext* opCtx, - const NamespaceString& nss) { - return _getCollectionRoutingInfoAt(opCtx, nss, boost::none); -} - - -StatusWith<ChunkManager> CatalogCache::getCollectionRoutingInfoAt(OperationContext* opCtx, - const NamespaceString& nss, - Timestamp atClusterTime) { - return _getCollectionRoutingInfoAt(opCtx, nss, atClusterTime).statusWithInfo; -} - -CatalogCache::RefreshResult CatalogCache::_getCollectionRoutingInfoAt( +StatusWith<ChunkManager> CatalogCache::_getCollectionRoutingInfoAt( OperationContext* opCtx, const NamespaceString& nss, boost::optional<Timestamp> atClusterTime) { - invariant(!opCtx->lockState() || !opCtx->lockState()->isLocked(), - "Do not hold a lock while refreshing the catalog cache. Doing so would potentially " - "hold the lock during a network call, and can lead to a deadlock as described in " - "SERVER-37398."); - // This default value can cause a single unnecessary extra refresh if this thread did do the - // refresh but the refresh failed, or if the database or collection was not found, but only if - // the caller is getCollectionRoutingInfoWithRefresh with the parameter - // forceRefreshFromThisThread set to true - RefreshAction refreshActionTaken(RefreshAction::kDidNotPerformRefresh); - while (true) { + invariant( + !opCtx->lockState() || !opCtx->lockState()->isLocked(), + "Do not hold a lock while refreshing the catalog cache. 
Doing so would potentially hold " + "the lock during a network call, and can lead to a deadlock as described in SERVER-37398."); + + try { const auto swDbInfo = getDatabase(opCtx, nss.db()); + if (!swDbInfo.isOK()) { if (swDbInfo == ErrorCodes::NamespaceNotFound) { LOGV2_FOR_CATALOG_REFRESH( - 4947102, + 4947103, 2, "Invalidating cached collection entry because its database has been dropped", "namespace"_attr = nss); - purgeCollection(nss); + invalidateCollectionEntry_LINEARIZABLE(nss); } - return {swDbInfo.getStatus(), refreshActionTaken}; + return swDbInfo.getStatus(); } const auto dbInfo = std::move(swDbInfo.getValue()); - stdx::unique_lock<Latch> ul(_mutex); - - auto collEntry = _createOrGetCollectionEntry(ul, nss); + const auto cacheConsistency = gEnableFinerGrainedCatalogCacheRefresh && + !operationShouldBlockBehindCatalogCacheRefresh(opCtx) + ? CacheCausalConsistency::kLatestCached + : CacheCausalConsistency::kLatestKnown; - if (collEntry->needsRefresh && - (!gEnableFinerGrainedCatalogCacheRefresh || collEntry->epochHasChanged || - operationShouldBlockBehindCatalogCacheRefresh(opCtx))) { + auto collEntryFuture = _collectionCache.acquireAsync(nss, cacheConsistency); - operationBlockedBehindCatalogCacheRefresh(opCtx) = true; + // If the entry is in the cache return inmediately. 
+ if (collEntryFuture.isReady()) { + setOperationShouldBlockBehindCatalogCacheRefresh(opCtx, false); + return ChunkManager(dbInfo.primaryId(), + dbInfo.databaseVersion(), + collEntryFuture.get(opCtx), + atClusterTime); + } - auto refreshNotification = collEntry->refreshCompletionNotification; - if (!refreshNotification) { - refreshNotification = (collEntry->refreshCompletionNotification = - std::make_shared<Notification<Status>>()); - _scheduleCollectionRefresh(ul, opCtx->getServiceContext(), collEntry, nss, 1); - refreshActionTaken = RefreshAction::kPerformedRefresh; - } + operationBlockedBehindCatalogCacheRefresh(opCtx) = true; - // Wait on the notification outside of the mutex - ul.unlock(); + size_t acquireTries = 0; + Timer t; - auto refreshStatus = [&]() { - Timer t; - ON_BLOCK_EXIT([&] { _stats.totalRefreshWaitTimeMicros.addAndFetch(t.micros()); }); + while (true) { + try { + auto collEntry = collEntryFuture.get(opCtx); + _stats.totalRefreshWaitTimeMicros.addAndFetch(t.micros()); - try { - const Milliseconds kReportingInterval{250}; - while (!refreshNotification->waitFor(opCtx, kReportingInterval)) { - _stats.totalRefreshWaitTimeMicros.addAndFetch(t.micros()); - t.reset(); - } + setOperationShouldBlockBehindCatalogCacheRefresh(opCtx, false); - return refreshNotification->get(opCtx); - } catch (const DBException& ex) { + return ChunkManager(dbInfo.primaryId(), + dbInfo.databaseVersion(), + std::move(collEntry), + atClusterTime); + } catch (ExceptionFor<ErrorCodes::ConflictingOperationInProgress>& ex) { + _stats.totalRefreshWaitTimeMicros.addAndFetch(t.micros()); + acquireTries++; + if (acquireTries == kMaxInconsistentRoutingInfoRefreshAttempts) { return ex.toStatus(); } - }(); - - if (!refreshStatus.isOK()) { - return {refreshStatus, refreshActionTaken}; } - // Once the refresh is complete, loop around to get the latest value - continue; + collEntryFuture = _collectionCache.acquireAsync(nss, cacheConsistency); + t.reset(); } - - return 
{ChunkManager(dbInfo.primaryId(), - dbInfo.databaseVersion(), - collEntry->routingInfo, - atClusterTime), - refreshActionTaken}; + } catch (const DBException& ex) { + return ex.toStatus(); } } +StatusWith<ChunkManager> CatalogCache::getCollectionRoutingInfo(OperationContext* opCtx, + const NamespaceString& nss) { + return _getCollectionRoutingInfoAt(opCtx, nss, boost::none); +} + +StatusWith<ChunkManager> CatalogCache::getCollectionRoutingInfoAt(OperationContext* opCtx, + const NamespaceString& nss, + Timestamp atClusterTime) { + return _getCollectionRoutingInfoAt(opCtx, nss, atClusterTime); +} + StatusWith<CachedDatabaseInfo> CatalogCache::getDatabaseWithRefresh(OperationContext* opCtx, StringData dbName) { // TODO SERVER-49724: Make ReadThroughCache support StringData keys @@ -303,32 +210,20 @@ StatusWith<CachedDatabaseInfo> CatalogCache::getDatabaseWithRefresh(OperationCon } StatusWith<ChunkManager> CatalogCache::getCollectionRoutingInfoWithRefresh( - OperationContext* opCtx, const NamespaceString& nss, bool forceRefreshFromThisThread) { - auto refreshResult = _getCollectionRoutingInfoWithForcedRefresh(opCtx, nss); - // We want to ensure that we don't join an in-progress refresh because that - // could violate causal consistency for this client. We don't need to actually perform the - // refresh ourselves but we do need the refresh to begin *after* this function is - // called, so calling it twice is enough regardless of what happens the - // second time. See SERVER-33954 for reasoning. 
- if (forceRefreshFromThisThread && - refreshResult.actionTaken == RefreshAction::kDidNotPerformRefresh) { - refreshResult = _getCollectionRoutingInfoWithForcedRefresh(opCtx, nss); - } - return refreshResult.statusWithInfo; + OperationContext* opCtx, const NamespaceString& nss) { + _collectionCache.invalidate(nss); + setOperationShouldBlockBehindCatalogCacheRefresh(opCtx, true); + return getCollectionRoutingInfo(opCtx, nss); } StatusWith<ChunkManager> CatalogCache::getShardedCollectionRoutingInfoWithRefresh( OperationContext* opCtx, const NamespaceString& nss) { - auto swRoutingInfo = _getCollectionRoutingInfoWithForcedRefresh(opCtx, nss).statusWithInfo; - if (!swRoutingInfo.isOK()) - return swRoutingInfo; - - auto cri(std::move(swRoutingInfo.getValue())); - if (!cri.isSharded()) + auto routingInfoStatus = getCollectionRoutingInfoWithRefresh(opCtx, nss); + if (routingInfoStatus.isOK() && !routingInfoStatus.getValue().isSharded()) { return {ErrorCodes::NamespaceNotSharded, str::stream() << "Collection " << nss.ns() << " is not sharded."}; - - return cri; + } + return routingInfoStatus; } void CatalogCache::onStaleDatabaseVersion(const StringData dbName, @@ -350,48 +245,49 @@ void CatalogCache::setOperationShouldBlockBehindCatalogCacheRefresh(OperationCon if (gEnableFinerGrainedCatalogCacheRefresh) { operationShouldBlockBehindCatalogCacheRefresh(opCtx) = shouldBlock; } -}; +} void CatalogCache::invalidateShardOrEntireCollectionEntryForShardedCollection( - OperationContext* opCtx, const NamespaceString& nss, - boost::optional<ChunkVersion> wantedVersion, - const ChunkVersion& receivedVersion, - ShardId shardId) { - if (shardVersionsHaveMatchingEpoch(wantedVersion, receivedVersion)) { - _createOrGetCollectionEntryAndMarkShardStale(nss, shardId); - } else { - _createOrGetCollectionEntryAndMarkEpochStale(nss); + const boost::optional<ChunkVersion>& wantedVersion, + const ShardId& shardId) { + _stats.countStaleConfigErrors.addAndFetch(1); + + auto collectionEntry = 
_collectionCache.peekLatestCached(nss); + if (collectionEntry && collectionEntry->optRt) { + collectionEntry->optRt->setShardStale(shardId); } -}; -void CatalogCache::onEpochChange(const NamespaceString& nss) { - _createOrGetCollectionEntryAndMarkEpochStale(nss); -}; + if (wantedVersion) { + _collectionCache.advanceTimeInStore( + nss, ComparableChunkVersion::makeComparableChunkVersion(*wantedVersion)); + } else { + _collectionCache.advanceTimeInStore( + nss, ComparableChunkVersion::makeComparableChunkVersionForForcedRefresh()); + } +} void CatalogCache::checkEpochOrThrow(const NamespaceString& nss, - ChunkVersion targetCollectionVersion, - const ShardId& shardId) const { - stdx::lock_guard<Latch> lg(_mutex); - const auto itDb = _collectionsByDb.find(nss.db()); + const ChunkVersion& targetCollectionVersion, + const ShardId& shardId) { uassert(StaleConfigInfo(nss, targetCollectionVersion, boost::none, shardId), str::stream() << "could not act as router for " << nss.ns() << ", no entry for database " << nss.db(), - itDb != _collectionsByDb.end()); + _databaseCache.peekLatestCached(nss.db().toString())); - auto itColl = itDb->second.find(nss.ns()); + auto collectionValueHandle = _collectionCache.peekLatestCached(nss); uassert(StaleConfigInfo(nss, targetCollectionVersion, boost::none, shardId), str::stream() << "could not act as router for " << nss.ns() << ", no entry for collection.", - itColl != itDb->second.end()); + collectionValueHandle); uassert(StaleConfigInfo(nss, targetCollectionVersion, boost::none, shardId), str::stream() << "could not act as router for " << nss.ns() << ", wanted " << targetCollectionVersion.toString() << ", but found the collection was unsharded", - itColl->second->routingInfo); + collectionValueHandle->optRt); - auto foundVersion = itColl->second->routingInfo->getVersion(); + auto foundVersion = collectionValueHandle->optRt->getVersion(); uassert(StaleConfigInfo(nss, targetCollectionVersion, foundVersion, shardId), str::stream() << "could 
not act as router for " << nss.ns() << ", wanted " << targetCollectionVersion.toString() << ", but found " @@ -399,11 +295,6 @@ void CatalogCache::checkEpochOrThrow(const NamespaceString& nss, foundVersion.epoch() == targetCollectionVersion.epoch()); } -void CatalogCache::invalidateShardForShardedCollection(const NamespaceString& nss, - const ShardId& staleShardId) { - _createOrGetCollectionEntryAndMarkShardStale(nss, staleShardId); -} - void CatalogCache::invalidateEntriesThatReferenceShard(const ShardId& shardId) { LOGV2_DEBUG(4997600, 1, @@ -413,32 +304,24 @@ void CatalogCache::invalidateEntriesThatReferenceShard(const ShardId& shardId) { _databaseCache.invalidateCachedValueIf( [&](const DatabaseType& dbt) { return dbt.getPrimary() == shardId; }); - stdx::lock_guard<Latch> lg(_mutex); - // Invalidate collections which contain data on this shard. - for (const auto& [db, collInfoMap] : _collectionsByDb) { - for (const auto& [collNs, collRoutingInfoEntry] : collInfoMap) { - if (!collRoutingInfoEntry->needsRefresh && collRoutingInfoEntry->routingInfo) { - // The set of shards on which this collection contains chunks. 
- std::set<ShardId> shardsOwningDataForCollection; - collRoutingInfoEntry->routingInfo->getAllShardIds(&shardsOwningDataForCollection); - - if (shardsOwningDataForCollection.find(shardId) != - shardsOwningDataForCollection.end()) { - LOGV2_DEBUG(22647, - 3, - "Invalidating cached collection {namespace} that has data " - "on shard {shardId}", - "Invalidating cached collection", - "namespace"_attr = collNs, - "shardId"_attr = shardId); - - collRoutingInfoEntry->needsRefresh = true; - collRoutingInfoEntry->routingInfo->setShardStale(shardId); - } - } - } - } + _collectionCache.invalidateCachedValueIf([&](const OptionalRoutingTableHistory& ort) { + if (!ort.optRt) + return false; + const auto& rt = *ort.optRt; + + std::set<ShardId> shardIds; + rt.getAllShardIds(&shardIds); + + LOGV2_DEBUG(22647, + 3, + "Invalidating cached collection {namespace} that has data " + "on shard {shardId}", + "Invalidating cached collection", + "namespace"_attr = rt.nss(), + "shardId"_attr = shardId); + return shardIds.find(shardId) != shardIds.end(); + }); LOGV2(22648, "Finished invalidating databases and collections with data on shard: {shardId}", @@ -446,46 +329,28 @@ void CatalogCache::invalidateEntriesThatReferenceShard(const ShardId& shardId) { "shardId"_attr = shardId); } -void CatalogCache::purgeCollection(const NamespaceString& nss) { - stdx::lock_guard<Latch> lg(_mutex); - - auto itDb = _collectionsByDb.find(nss.db()); - if (itDb == _collectionsByDb.end()) { - return; - } - - itDb->second.erase(nss.ns()); -} - void CatalogCache::purgeDatabase(StringData dbName) { _databaseCache.invalidate(dbName.toString()); - stdx::lock_guard<Latch> lg(_mutex); - _collectionsByDb.erase(dbName); + _collectionCache.invalidateKeyIf( + [&](const NamespaceString& nss) { return nss.db() == dbName; }); } void CatalogCache::purgeAllDatabases() { _databaseCache.invalidateAll(); - stdx::lock_guard<Latch> lg(_mutex); - _collectionsByDb.clear(); + _collectionCache.invalidateAll(); } void 
CatalogCache::report(BSONObjBuilder* builder) const { BSONObjBuilder cacheStatsBuilder(builder->subobjStart("catalogCache")); - size_t numDatabaseEntries; - size_t numCollectionEntries{0}; - { - numDatabaseEntries = _databaseCache.getCacheInfo().size(); - stdx::lock_guard<Latch> ul(_mutex); - for (const auto& entry : _collectionsByDb) { - numCollectionEntries += entry.second.size(); - } - } + const size_t numDatabaseEntries = _databaseCache.getCacheInfo().size(); + const size_t numCollectionEntries = _collectionCache.getCacheInfo().size(); cacheStatsBuilder.append("numDatabaseEntries", static_cast<long long>(numDatabaseEntries)); cacheStatsBuilder.append("numCollectionEntries", static_cast<long long>(numCollectionEntries)); _stats.report(&cacheStatsBuilder); + _collectionCache.reportStats(&cacheStatsBuilder); } void CatalogCache::checkAndRecordOperationBlockedByRefresh(OperationContext* opCtx, @@ -519,188 +384,8 @@ void CatalogCache::checkAndRecordOperationBlockedByRefresh(OperationContext* opC } } -void CatalogCache::_scheduleCollectionRefresh(WithLock lk, - ServiceContext* service, - std::shared_ptr<CollectionRoutingInfoEntry> collEntry, - NamespaceString const& nss, - int refreshAttempt) { - const auto existingRoutingInfo = collEntry->routingInfo; - - // If we have an existing chunk manager, the refresh is considered "incremental", regardless of - // how many chunks are in the differential - const bool isIncremental(existingRoutingInfo); - - if (isIncremental) { - _stats.numActiveIncrementalRefreshes.addAndFetch(1); - _stats.countIncrementalRefreshesStarted.addAndFetch(1); - } else { - _stats.numActiveFullRefreshes.addAndFetch(1); - _stats.countFullRefreshesStarted.addAndFetch(1); - } - - // Invoked when one iteration of getChunksSince has completed, whether with success or error - const auto onRefreshCompleted = [this, t = Timer(), nss, isIncremental, existingRoutingInfo]( - const Status& status, - RoutingTableHistory* routingInfoAfterRefresh) { - if 
(isIncremental) { - _stats.numActiveIncrementalRefreshes.subtractAndFetch(1); - } else { - _stats.numActiveFullRefreshes.subtractAndFetch(1); - } - - if (!status.isOK()) { - _stats.countFailedRefreshes.addAndFetch(1); - - LOGV2_OPTIONS(24103, - {logv2::LogComponent::kShardingCatalogRefresh}, - "Error refreshing cached collection {namespace}; Took {duration} and " - "failed due to {error}", - "Error refreshing cached collection", - "namespace"_attr = nss, - "duration"_attr = Milliseconds(t.millis()), - "error"_attr = redact(status)); - } else if (routingInfoAfterRefresh) { - const int logLevel = - (!existingRoutingInfo || - (existingRoutingInfo && - routingInfoAfterRefresh->getVersion() != existingRoutingInfo->getVersion())) - ? 0 - : 1; - LOGV2_FOR_CATALOG_REFRESH( - 24104, - logLevel, - "Refreshed cached collection {namespace} to version {newVersion} from version " - "{oldVersion}. Took {duration}", - "Refreshed cached collection", - "namespace"_attr = nss, - "newVersion"_attr = routingInfoAfterRefresh->getVersion(), - "oldVersion"_attr = - (existingRoutingInfo - ? 
(" from version " + existingRoutingInfo->getVersion().toString()) - : ""), - "duration"_attr = Milliseconds(t.millis())); - } else { - LOGV2_OPTIONS(24105, - {logv2::LogComponent::kShardingCatalogRefresh}, - "Collection {namespace} was found to be unsharded after refresh that " - "took {duration}", - "Collection has found to be unsharded after refresh", - "namespace"_attr = nss, - "duration"_attr = Milliseconds(t.millis())); - } - }; - - // Invoked if getChunksSince resulted in error or threw an exception - const auto onRefreshFailed = - [ this, service, collEntry, nss, refreshAttempt, - onRefreshCompleted ](WithLock lk, const Status& status) noexcept { - onRefreshCompleted(status, nullptr); - - // It is possible that the metadata is being changed concurrently, so retry the - // refresh again - if (status == ErrorCodes::ConflictingOperationInProgress && - refreshAttempt < kMaxInconsistentRoutingInfoRefreshAttempts) { - _scheduleCollectionRefresh(lk, service, collEntry, nss, refreshAttempt + 1); - } else { - // Leave needsRefresh to true so that any subsequent get attempts will kick off - // another round of refresh - collEntry->refreshCompletionNotification->set(status); - collEntry->refreshCompletionNotification = nullptr; - } - }; - - const auto refreshCallback = - [ this, service, collEntry, nss, existingRoutingInfo, onRefreshFailed, onRefreshCompleted ]( - StatusWith<CatalogCacheLoader::CollectionAndChangedChunks> swCollAndChunks) noexcept { - - ThreadClient tc("CatalogCache::collectionRefresh", service); - auto opCtx = tc->makeOperationContext(); - - std::shared_ptr<RoutingTableHistory> newRoutingInfo; - try { - newRoutingInfo = refreshCollectionRoutingInfo( - opCtx.get(), nss, std::move(existingRoutingInfo), std::move(swCollAndChunks)); - - onRefreshCompleted(Status::OK(), newRoutingInfo.get()); - } catch (const DBException& ex) { - stdx::lock_guard<Latch> lg(_mutex); - onRefreshFailed(lg, ex.toStatus()); - return; - } - - stdx::lock_guard<Latch> lg(_mutex); 
- - collEntry->epochHasChanged = false; - collEntry->needsRefresh = false; - collEntry->refreshCompletionNotification->set(Status::OK()); - collEntry->refreshCompletionNotification = nullptr; - - setOperationShouldBlockBehindCatalogCacheRefresh(opCtx.get(), false); - - // TODO(SERVER-49876): remove clang-tidy NOLINT comments. - if (existingRoutingInfo && newRoutingInfo && // NOLINT(bugprone-use-after-move) - existingRoutingInfo->getVersion() == // NOLINT(bugprone-use-after-move) - newRoutingInfo->getVersion()) { // NOLINT(bugprone-use-after-move) - // If the routingInfo hasn't changed, we need to manually reset stale shards. - newRoutingInfo->setAllShardsRefreshed(); - } - - collEntry->routingInfo = std::move(newRoutingInfo); - }; - - const ChunkVersion startingCollectionVersion = - (existingRoutingInfo ? existingRoutingInfo->getVersion() : ChunkVersion::UNSHARDED()); - - LOGV2_FOR_CATALOG_REFRESH( - 24106, - 1, - "Refreshing cached collection {namespace} with version {currentCollectionVersion}", - "namespace"_attr = nss, - "currentCollectionVersion"_attr = startingCollectionVersion); - - _cacheLoader.getChunksSince(nss, startingCollectionVersion) - .thenRunOn(_executor) - .getAsync(refreshCallback); - - // The routing info for this collection shouldn't change, as other threads may try to use the - // CatalogCache while we are waiting for the refresh to complete. 
- invariant(collEntry->routingInfo.get() == existingRoutingInfo.get()); -} - -void CatalogCache::_createOrGetCollectionEntryAndMarkEpochStale(const NamespaceString& nss) { - stdx::lock_guard<Latch> lg(_mutex); - auto collRoutingInfoEntry = _createOrGetCollectionEntry(lg, nss); - collRoutingInfoEntry->needsRefresh = true; - collRoutingInfoEntry->epochHasChanged = true; -} - -void CatalogCache::_createOrGetCollectionEntryAndMarkShardStale(const NamespaceString& nss, - const ShardId& staleShardId) { - stdx::lock_guard<Latch> lg(_mutex); - auto collRoutingInfoEntry = _createOrGetCollectionEntry(lg, nss); - collRoutingInfoEntry->needsRefresh = true; - if (collRoutingInfoEntry->routingInfo) { - collRoutingInfoEntry->routingInfo->setShardStale(staleShardId); - } -} - -void CatalogCache::_createOrGetCollectionEntryAndMarkAsNeedsRefresh(const NamespaceString& nss) { - stdx::lock_guard<Latch> lg(_mutex); - auto collRoutingInfoEntry = _createOrGetCollectionEntry(lg, nss); - collRoutingInfoEntry->needsRefresh = true; -} - -std::shared_ptr<CatalogCache::CollectionRoutingInfoEntry> CatalogCache::_createOrGetCollectionEntry( - WithLock wl, const NamespaceString& nss) { - auto& collectionsForDb = _collectionsByDb[nss.db()]; - if (!collectionsForDb.contains(nss.ns())) { - // TODO SERVER-46199: ensure collections cache size is capped - // currently no routine except for dropDatabase is removing cached collection entries and - // the cache for a specific DB can grow indefinitely. 
- collectionsForDb[nss.ns()] = std::make_shared<CollectionRoutingInfoEntry>(); - } - - return collectionsForDb[nss.ns()]; +void CatalogCache::invalidateCollectionEntry_LINEARIZABLE(const NamespaceString& nss) { + _collectionCache.invalidate(nss); } void CatalogCache::Stats::report(BSONObjBuilder* builder) const { @@ -708,14 +393,6 @@ void CatalogCache::Stats::report(BSONObjBuilder* builder) const { builder->append("totalRefreshWaitTimeMicros", totalRefreshWaitTimeMicros.load()); - builder->append("numActiveIncrementalRefreshes", numActiveIncrementalRefreshes.load()); - builder->append("countIncrementalRefreshesStarted", countIncrementalRefreshesStarted.load()); - - builder->append("numActiveFullRefreshes", numActiveFullRefreshes.load()); - builder->append("countFullRefreshesStarted", countFullRefreshesStarted.load()); - - builder->append("countFailedRefreshes", countFailedRefreshes.load()); - if (isMongos()) { BSONObjBuilder operationsBlockedByRefreshBuilder( builder->subobjStart("operationsBlockedByRefresh")); @@ -756,7 +433,6 @@ CatalogCache::DatabaseCache::LookupResult CatalogCache::DatabaseCache::_lookupDa OperationContext* opCtx, const std::string& dbName, const ComparableDatabaseVersion& previousDbVersion) { - // TODO (SERVER-34164): Track and increment stats for database refreshes LOGV2_FOR_CATALOG_REFRESH(24102, 2, "Refreshing cached database entry", "db"_attr = dbName); @@ -788,73 +464,199 @@ CatalogCache::DatabaseCache::LookupResult CatalogCache::DatabaseCache::_lookupDa } } -AtomicWord<uint64_t> ComparableDatabaseVersion::_localSequenceNumSource{1ULL}; +CatalogCache::CollectionCache::CollectionCache(ServiceContext* service, + ThreadPoolInterface& threadPool, + CatalogCacheLoader& catalogCacheLoader) + : ReadThroughCache(_mutex, + service, + threadPool, + [this](OperationContext* opCtx, + const NamespaceString& nss, + const ValueHandle& collectionHistory, + const ComparableChunkVersion& previousChunkVersion) { + return _lookupCollection( + opCtx, nss, 
collectionHistory, previousChunkVersion); + }, + kCollectionCacheSize), + _catalogCacheLoader(catalogCacheLoader) {} -ComparableDatabaseVersion ComparableDatabaseVersion::makeComparableDatabaseVersion( - const DatabaseVersion& version) { - return ComparableDatabaseVersion(version, _localSequenceNumSource.fetchAndAdd(1)); +void CatalogCache::CollectionCache::reportStats(BSONObjBuilder* builder) const { + _stats.report(builder); } -const DatabaseVersion& ComparableDatabaseVersion::getVersion() const { - return _dbVersion; +void CatalogCache::CollectionCache::_updateRefreshesStats(const bool isIncremental, + const bool add) { + if (add) { + if (isIncremental) { + _stats.numActiveIncrementalRefreshes.addAndFetch(1); + _stats.countIncrementalRefreshesStarted.addAndFetch(1); + } else { + _stats.numActiveFullRefreshes.addAndFetch(1); + _stats.countFullRefreshesStarted.addAndFetch(1); + } + } else { + if (isIncremental) { + _stats.numActiveIncrementalRefreshes.subtractAndFetch(1); + } else { + _stats.numActiveFullRefreshes.subtractAndFetch(1); + } + } } -uint64_t ComparableDatabaseVersion::getLocalSequenceNum() const { - return _localSequenceNum; -} +void CatalogCache::CollectionCache::Stats::report(BSONObjBuilder* builder) const { + builder->append("numActiveIncrementalRefreshes", numActiveIncrementalRefreshes.load()); + builder->append("countIncrementalRefreshesStarted", countIncrementalRefreshesStarted.load()); -BSONObj ComparableDatabaseVersion::toBSON() const { - BSONObjBuilder builder; - _dbVersion.getUuid().appendToBuilder(&builder, "uuid"); - builder.append("lastMod", _dbVersion.getLastMod()); - builder.append("localSequenceNum", std::to_string(_localSequenceNum)); - return builder.obj(); -} + builder->append("numActiveFullRefreshes", numActiveFullRefreshes.load()); + builder->append("countFullRefreshesStarted", countFullRefreshesStarted.load()); -std::string ComparableDatabaseVersion::toString() const { - return toBSON().toString(); + 
builder->append("countFailedRefreshes", countFailedRefreshes.load()); } +CatalogCache::CollectionCache::LookupResult CatalogCache::CollectionCache::_lookupCollection( + OperationContext* opCtx, + const NamespaceString& nss, + const RoutingTableHistoryValueHandle& existingHistory, + const ComparableChunkVersion& previousVersion) { + const bool isIncremental(existingHistory && existingHistory->optRt); + _updateRefreshesStats(isIncremental, true); -CachedDatabaseInfo::CachedDatabaseInfo(DatabaseType dbt, std::shared_ptr<Shard> primaryShard) - : _dbt(std::move(dbt)), _primaryShard(std::move(primaryShard)) {} + Timer t{}; + try { + auto lookupVersion = + isIncremental ? existingHistory->optRt->getVersion() : ChunkVersion::UNSHARDED(); -const ShardId& CachedDatabaseInfo::primaryId() const { - return _dbt.getPrimary(); + LOGV2_FOR_CATALOG_REFRESH(4619900, + 1, + "Refreshing cached collection", + "namespace"_attr = nss, + "currentVersion"_attr = previousVersion); + + auto collectionAndChunks = _catalogCacheLoader.getChunksSince(nss, lookupVersion).get(); + + auto newRoutingHistory = [&] { + // If we have routing info already and it's for the same collection epoch, we're + // updating. Otherwise, we're making a whole new routing table. 
+ if (isIncremental && + existingHistory->optRt->getVersion().epoch() == collectionAndChunks.epoch) { + return existingHistory->optRt->makeUpdated(collectionAndChunks.reshardingFields, + collectionAndChunks.changedChunks); + } + + auto defaultCollator = [&]() -> std::unique_ptr<CollatorInterface> { + if (!collectionAndChunks.defaultCollation.isEmpty()) { + // The collation should have been validated upon collection creation + return uassertStatusOK( + CollatorFactoryInterface::get(opCtx->getServiceContext()) + ->makeFromBSON(collectionAndChunks.defaultCollation)); + } + return nullptr; + }(); + + return RoutingTableHistory::makeNew(nss, + collectionAndChunks.uuid, + KeyPattern(collectionAndChunks.shardKeyPattern), + std::move(defaultCollator), + collectionAndChunks.shardKeyIsUnique, + collectionAndChunks.epoch, + std::move(collectionAndChunks.reshardingFields), + collectionAndChunks.changedChunks); + }(); + + newRoutingHistory.setAllShardsRefreshed(); + + // Check that the shards all match with what is on the config server + std::set<ShardId> shardIds; + newRoutingHistory.getAllShardIds(&shardIds); + for (const auto& shardId : shardIds) { + uassertStatusOK(Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardId)); + } + + const auto newVersion = + ComparableChunkVersion::makeComparableChunkVersion(newRoutingHistory.getVersion()); + + LOGV2_FOR_CATALOG_REFRESH(4619901, + isIncremental || newVersion != previousVersion ? 
0 : 1, + "Refreshed cached collection", + "namespace"_attr = nss, + "newVersion"_attr = newVersion, + "oldVersion"_attr = previousVersion, + "duration"_attr = Milliseconds(t.millis())); + _updateRefreshesStats(isIncremental, false); + + return LookupResult(OptionalRoutingTableHistory(std::move(newRoutingHistory)), newVersion); + } catch (const DBException& ex) { + _stats.countFailedRefreshes.addAndFetch(1); + _updateRefreshesStats(isIncremental, false); + + if (ex.code() == ErrorCodes::NamespaceNotFound) { + LOGV2_FOR_CATALOG_REFRESH(4619902, + 0, + "Collection has found to be unsharded after refresh", + "namespace"_attr = nss, + "duration"_attr = Milliseconds(t.millis())); + + return LookupResult( + OptionalRoutingTableHistory(), + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::UNSHARDED())); + } + + LOGV2_FOR_CATALOG_REFRESH(4619903, + 0, + "Error refreshing cached collection", + "namespace"_attr = nss, + "duration"_attr = Milliseconds(t.millis()), + "error"_attr = redact(ex)); + + throw; + } } -bool CachedDatabaseInfo::shardingEnabled() const { - return _dbt.getSharded(); +AtomicWord<uint64_t> ComparableDatabaseVersion::_uuidDisambiguatingSequenceNumSource{1ULL}; + +ComparableDatabaseVersion ComparableDatabaseVersion::makeComparableDatabaseVersion( + const DatabaseVersion& version) { + return ComparableDatabaseVersion(version, _uuidDisambiguatingSequenceNumSource.fetchAndAdd(1)); } -DatabaseVersion CachedDatabaseInfo::databaseVersion() const { - return _dbt.getVersion(); +std::string ComparableDatabaseVersion::toString() const { + return str::stream() << (_dbVersion ? 
_dbVersion->toBSON().toString() : "NONE") << "|" + << _uuidDisambiguatingSequenceNum; } -AtomicWord<uint64_t> ComparableChunkVersion::_localSequenceNumSource{1ULL}; +bool ComparableDatabaseVersion::operator==(const ComparableDatabaseVersion& other) const { + if (!_dbVersion && !other._dbVersion) + return true; // Default constructed value + if (_dbVersion.is_initialized() != other._dbVersion.is_initialized()) + return false; // One side is default constructed value -ComparableChunkVersion ComparableChunkVersion::makeComparableChunkVersion( - const ChunkVersion& version) { - return ComparableChunkVersion(version, _localSequenceNumSource.fetchAndAdd(1)); + return sameUuid(other) && (_dbVersion->getLastMod() == other._dbVersion->getLastMod()); } -const ChunkVersion& ComparableChunkVersion::getVersion() const { - return _chunkVersion; +bool ComparableDatabaseVersion::operator<(const ComparableDatabaseVersion& other) const { + if (!_dbVersion && !other._dbVersion) + return false; // Default constructed value + + if (_dbVersion && other._dbVersion && sameUuid(other)) { + return _dbVersion->getLastMod() < other._dbVersion->getLastMod(); + } else { + return _uuidDisambiguatingSequenceNum < other._uuidDisambiguatingSequenceNum; + } } -uint64_t ComparableChunkVersion::getLocalSequenceNum() const { - return _localSequenceNum; +CachedDatabaseInfo::CachedDatabaseInfo(DatabaseType dbt, std::shared_ptr<Shard> primaryShard) + : _dbt(std::move(dbt)), _primaryShard(std::move(primaryShard)) {} + +const ShardId& CachedDatabaseInfo::primaryId() const { + return _dbt.getPrimary(); } -BSONObj ComparableChunkVersion::toBSON() const { - BSONObjBuilder builder; - _chunkVersion.appendToCommand(&builder); - builder.append("localSequenceNum", std::to_string(_localSequenceNum)); - return builder.obj(); +bool CachedDatabaseInfo::shardingEnabled() const { + return _dbt.getSharded(); } -std::string ComparableChunkVersion::toString() const { - return toBSON().toString(); +DatabaseVersion 
CachedDatabaseInfo::databaseVersion() const { + return _dbt.getVersion(); } } // namespace mongo diff --git a/src/mongo/s/catalog_cache.h b/src/mongo/s/catalog_cache.h index a957189183a..796b9e10136 100644 --- a/src/mongo/s/catalog_cache.h +++ b/src/mongo/s/catalog_cache.h @@ -45,8 +45,6 @@ namespace mongo { class BSONObjBuilder; -class CachedDatabaseInfo; -class OperationContext; static constexpr int kMaxNumStaleVersionRetries = 10; @@ -64,21 +62,21 @@ extern const OperationContext::Decoration<bool> operationShouldBlockBehindCatalo * in fact is impossible to compare two different DatabaseVersion that have different UUIDs. * * This class wrap a DatabaseVersion object to make it always comparable by timestamping it with a - * node-local sequence number (_dbVersionLocalSequence). + * node-local sequence number (_uuidDisambiguatingSequenceNum). * * This class class should go away once a cluster-wide comparable DatabaseVersion will be * implemented. */ class ComparableDatabaseVersion { public: - /* - * Create a ComparableDatabaseVersion that wraps the given DatabaseVersion. - * Each object created through this method will have a local sequence number grater then the + /** + * Creates a ComparableDatabaseVersion that wraps the given DatabaseVersion. + * Each object created through this method will have a local sequence number greater than the * previously created ones. */ static ComparableDatabaseVersion makeComparableDatabaseVersion(const DatabaseVersion& version); - /* + /** * Empty constructor needed by the ReadThroughCache. 
* * Instances created through this constructor will be always less then the ones created through @@ -86,39 +84,28 @@ public: */ ComparableDatabaseVersion() = default; - const DatabaseVersion& getVersion() const; - - uint64_t getLocalSequenceNum() const; - - BSONObj toBSON() const; + const DatabaseVersion& getVersion() const { + return *_dbVersion; + } std::string toString() const; - // Rerturns true if the two versions have the same UUID bool sameUuid(const ComparableDatabaseVersion& other) const { - return _dbVersion.getUuid() == other._dbVersion.getUuid(); + return _dbVersion->getUuid() == other._dbVersion->getUuid(); } - bool operator==(const ComparableDatabaseVersion& other) const { - return sameUuid(other) && (_dbVersion.getLastMod() == other._dbVersion.getLastMod()); - } + bool operator==(const ComparableDatabaseVersion& other) const; bool operator!=(const ComparableDatabaseVersion& other) const { return !(*this == other); } - /* - * In the case the two compared instances have different UUIDs the most recently created one - * will be grater, otherwise the comparision will be driven by the lastMod field of the - * underlying DatabaseVersion. + /** + * In case the two compared instances have different UUIDs, the most recently created one will + * be greater, otherwise the comparison will be driven by the lastMod field of the underlying + * DatabaseVersion. 
*/ - bool operator<(const ComparableDatabaseVersion& other) const { - if (sameUuid(other)) { - return _dbVersion.getLastMod() < other._dbVersion.getLastMod(); - } else { - return _localSequenceNum < other._localSequenceNum; - } - } + bool operator<(const ComparableDatabaseVersion& other) const; bool operator>(const ComparableDatabaseVersion& other) const { return other < *this; @@ -133,92 +120,18 @@ public: } private: - static AtomicWord<uint64_t> _localSequenceNumSource; + static AtomicWord<uint64_t> _uuidDisambiguatingSequenceNumSource; + + ComparableDatabaseVersion(const DatabaseVersion& version, + uint64_t uuidDisambiguatingSequenceNum) + : _dbVersion(version), _uuidDisambiguatingSequenceNum(uuidDisambiguatingSequenceNum) {} - ComparableDatabaseVersion(const DatabaseVersion& version, uint64_t localSequenceNum) - : _dbVersion(version), _localSequenceNum(localSequenceNum) {} + boost::optional<DatabaseVersion> _dbVersion; - DatabaseVersion _dbVersion; // Locally incremented sequence number that allows to compare two database versions with // different UUIDs. Each new comparableDatabaseVersion will have a greater sequence number then // the ones created before. - uint64_t _localSequenceNum{0}; -}; - -/** - * Constructed to be used exclusively by the CatalogCache as a vector clock (Time) to drive - * CollectionCache's lookups. - * - * The ChunkVersion class contains an non comparable epoch, which makes impossible to compare two - * ChunkVersions when their epochs's differ. - * - * This class wraps a ChunkVersion object with a node-local sequence number (_localSequenceNum) that - * allows the comparision. - * - * This class should go away once a cluster-wide comparable ChunkVersion is implemented. 
- */ -class ComparableChunkVersion { -public: - static ComparableChunkVersion makeComparableChunkVersion(const ChunkVersion& version); - - ComparableChunkVersion() = default; - - const ChunkVersion& getVersion() const; - - uint64_t getLocalSequenceNum() const; - - BSONObj toBSON() const; - - std::string toString() const; - - bool sameEpoch(const ComparableChunkVersion& other) const { - return _chunkVersion.epoch() == other._chunkVersion.epoch(); - } - - bool operator==(const ComparableChunkVersion& other) const { - return sameEpoch(other) && - (_chunkVersion.majorVersion() == other._chunkVersion.majorVersion() && - _chunkVersion.minorVersion() == other._chunkVersion.minorVersion()); - } - - bool operator!=(const ComparableChunkVersion& other) const { - return !(*this == other); - } - - bool operator<(const ComparableChunkVersion& other) const { - if (sameEpoch(other)) { - return _chunkVersion.majorVersion() < other._chunkVersion.majorVersion() || - (_chunkVersion.majorVersion() == other._chunkVersion.majorVersion() && - _chunkVersion.minorVersion() < other._chunkVersion.minorVersion()); - } else { - return _localSequenceNum < other._localSequenceNum; - } - } - - bool operator>(const ComparableChunkVersion& other) const { - return other < *this; - } - - bool operator<=(const ComparableChunkVersion& other) const { - return !(*this > other); - } - - bool operator>=(const ComparableChunkVersion& other) const { - return !(*this < other); - } - -private: - static AtomicWord<uint64_t> _localSequenceNumSource; - - ComparableChunkVersion(const ChunkVersion& version, uint64_t localSequenceNum) - : _chunkVersion(version), _localSequenceNum(localSequenceNum) {} - - ChunkVersion _chunkVersion; - - // Locally incremented sequence number that allows to compare two colection versions with - // different epochs. Each new comparableChunkVersion will have a greater sequence number than - // the ones created before. 
- uint64_t _localSequenceNum{0}; + uint64_t _uuidDisambiguatingSequenceNum{0}; }; /** @@ -298,21 +211,9 @@ public: /** * Same as getCollectionRoutingInfo above, but in addition causes the namespace to be refreshed. - * - * When forceRefreshFromThisThread is false, it's possible for this call to - * join an ongoing refresh from another thread forceRefreshFromThisThread. - * forceRefreshFromThisThread checks whether it joined another thread and - * then forces it to try again, which is necessary in cases where calls to - * getCollectionRoutingInfoWithRefresh must be causally consistent - * - * TODO: Remove this parameter in favor of using collection creation time + - * collection version to decide when a refresh is necessary and provide - * proper causal consistency */ - StatusWith<ChunkManager> getCollectionRoutingInfoWithRefresh( - OperationContext* opCtx, - const NamespaceString& nss, - bool forceRefreshFromThisThread = false); + StatusWith<ChunkManager> getCollectionRoutingInfoWithRefresh(OperationContext* opCtx, + const NamespaceString& nss); /** * Same as getCollectionRoutingInfoWithRefresh above, but in addition returns a @@ -333,11 +234,6 @@ public: const boost::optional<DatabaseVersion>& wantedVersion); /** - * Gets whether this operation should block behind a catalog cache refresh. - */ - static bool getOperationShouldBlockBehindCatalogCacheRefresh(OperationContext* opCtx); - - /** * Sets whether this operation should block behind a catalog cache refresh. */ static void setOperationShouldBlockBehindCatalogCacheRefresh(OperationContext* opCtx, @@ -349,18 +245,9 @@ public: * requests to block on an upcoming catalog cache refresh. 
*/ void invalidateShardOrEntireCollectionEntryForShardedCollection( - OperationContext* opCtx, const NamespaceString& nss, - boost::optional<ChunkVersion> wantedVersion, - const ChunkVersion& receivedVersion, - ShardId shardId); - - /** - * Non-blocking method that marks the current collection entry for the namespace as needing - * refresh due to an epoch change. Will cause all further targetting attempts for this - * namespace to block on a catalog cache refresh. - */ - void onEpochChange(const NamespaceString& nss); + const boost::optional<ChunkVersion>& wantedVersion, + const ShardId& shardId); /** * Throws a StaleConfigException if this catalog cache does not have an entry for the given @@ -370,16 +257,8 @@ public: * version to throw a StaleConfigException. */ void checkEpochOrThrow(const NamespaceString& nss, - ChunkVersion targetCollectionVersion, - const ShardId& shardId) const; - - /** - * Non-blocking method, which invalidates the shard for the routing table for the specified - * namespace. If that shard is targetted in the future, getCollectionRoutingInfo will wait on a - * refresh. - */ - void invalidateShardForShardedCollection(const NamespaceString& nss, - const ShardId& staleShardId); + const ChunkVersion& targetCollectionVersion, + const ShardId& shardId); /** * Non-blocking method, which invalidates all namespaces which contain data on the specified @@ -388,12 +267,6 @@ public: void invalidateEntriesThatReferenceShard(const ShardId& shardId); /** - * Non-blocking method, which removes the entire specified collection from the cache (resulting - * in full refresh on subsequent access) - */ - void purgeCollection(const NamespaceString& nss); - - /** * Non-blocking method, which removes the entire specified database (including its collections) * from the cache. 
*/ @@ -416,35 +289,17 @@ public: */ void checkAndRecordOperationBlockedByRefresh(OperationContext* opCtx, mongo::LogicalOp opType); + /** + * Non-blocking method that marks the current collection entry for the namespace as needing + * refresh. Will cause all further targetting attempts to block on a catalog cache refresh, + * even if they do not require causal consistency. + */ + void invalidateCollectionEntry_LINEARIZABLE(const NamespaceString& nss); + private: // Make the cache entries friends so they can access the private classes below friend class CachedDatabaseInfo; - /** - * Cache entry describing a collection. - */ - struct CollectionRoutingInfoEntry { - CollectionRoutingInfoEntry() = default; - // Disable copy (and move) semantics - CollectionRoutingInfoEntry(const CollectionRoutingInfoEntry&) = delete; - CollectionRoutingInfoEntry& operator=(const CollectionRoutingInfoEntry&) = delete; - - // Specifies whether this cache entry needs a refresh (in which case routingInfo should not - // be relied on) or it doesn't, in which case there should be a non-null routingInfo. - bool needsRefresh{true}; - - // Specifies whether the namespace has had an epoch change, which indicates that every - // shard should block on an upcoming refresh. - bool epochHasChanged{true}; - - // Contains a notification to be waited on for the refresh to complete (only available if - // needsRefresh is true) - std::shared_ptr<Notification<Status>> refreshCompletionNotification; - - // Contains the cached routing information (only available if needsRefresh is false) - std::shared_ptr<RoutingTableHistory> routingInfo; - }; - class DatabaseCache : public ReadThroughCache<std::string, DatabaseType, ComparableDatabaseVersion> { public: @@ -461,88 +316,54 @@ private: Mutex _mutex = MONGO_MAKE_LATCH("DatabaseCache::_mutex"); }; - /** - * Non-blocking call which schedules an asynchronous refresh for the specified namespace. The - * namespace must be in the 'needRefresh' state. 
- */ - void _scheduleCollectionRefresh(WithLock, - ServiceContext* service, - std::shared_ptr<CollectionRoutingInfoEntry> collEntry, - NamespaceString const& nss, - int refreshAttempt); + class CollectionCache : public RoutingTableHistoryCache { + public: + CollectionCache(ServiceContext* service, + ThreadPoolInterface& threadPool, + CatalogCacheLoader& catalogCacheLoader); - /** - * Marks a collection entry as needing refresh. Will create the collection entry if one does - * not exist. Also marks the epoch as changed, which will cause all further targetting requests - * against this namespace to block upon a catalog cache refresh. - */ - void _createOrGetCollectionEntryAndMarkEpochStale(const NamespaceString& nss); + void reportStats(BSONObjBuilder* builder) const; - /** - * Marks a collection entry as needing refresh. Will create the collection entry if one does - * not exist. Will mark the given shard ID as stale, which will cause all further targetting - * requests for the given shard for this namespace to block upon a catalog cache refresh. - */ - void _createOrGetCollectionEntryAndMarkShardStale(const NamespaceString& nss, - const ShardId& shardId); + private: + LookupResult _lookupCollection(OperationContext* opCtx, + const NamespaceString& nss, + const ValueHandle& collectionHistory, + const ComparableChunkVersion& previousChunkVersion); - /** - * Marks a collection entry as needing refresh. Will create the collection entry if one does - * not exist. - */ - void _createOrGetCollectionEntryAndMarkAsNeedsRefresh(const NamespaceString& nss); + CatalogCacheLoader& _catalogCacheLoader; + Mutex _mutex = MONGO_MAKE_LATCH("CollectionCache::_mutex"); - /** - * Retrieves the collection entry for the given namespace, creating the entry if one does not - * already exist. 
- */ - std::shared_ptr<CollectionRoutingInfoEntry> _createOrGetCollectionEntry( - WithLock wl, const NamespaceString& nss); + struct Stats { + // Tracks how many incremental refreshes are waiting to complete currently + AtomicWord<long long> numActiveIncrementalRefreshes{0}; - /** - * Used as a flag to indicate whether or not this thread performed its own - * refresh for certain helper functions - * - * kPerformedRefresh is used only when the calling thread performed the - * refresh *itself* - * - * kDidNotPerformRefresh is used either when there was an error or when - * this thread joined an ongoing refresh - */ - enum class RefreshAction { - kPerformedRefresh, - kDidNotPerformRefresh, - }; + // Cumulative, always-increasing counter of how many incremental refreshes have been + // kicked off + AtomicWord<long long> countIncrementalRefreshesStarted{0}; - /** - * Return type for helper functions performing refreshes so that they can - * indicate both status and whether or not this thread performed its own - * refresh - */ - struct RefreshResult { - // Status containing result of refresh - StatusWith<ChunkManager> statusWithInfo; - RefreshAction actionTaken; - }; + // Tracks how many full refreshes are waiting to complete currently + AtomicWord<long long> numActiveFullRefreshes{0}; - /** - * Retrieves the collection routing info for this namespace after blocking on a catalog cache - * refresh. - */ - CatalogCache::RefreshResult _getCollectionRoutingInfoWithForcedRefresh( - OperationContext* opctx, const NamespaceString& nss); + // Cumulative, always-increasing counter of how many full refreshes have been kicked off + AtomicWord<long long> countFullRefreshesStarted{0}; - /** - * Helper function used when we need the refresh action taken (e.g. 
when we - * want to force refresh) - */ - CatalogCache::RefreshResult _getCollectionRoutingInfo(OperationContext* opCtx, - const NamespaceString& nss); + // Cumulative, always-increasing counter of how many full or incremental refreshes + // failed for whatever reason + AtomicWord<long long> countFailedRefreshes{0}; - CatalogCache::RefreshResult _getCollectionRoutingInfoAt( - OperationContext* opCtx, - const NamespaceString& nss, - boost::optional<Timestamp> atClusterTime); + /** + * Reports the accumulated statistics for serverStatus. + */ + void report(BSONObjBuilder* builder) const; + + } _stats; + + void _updateRefreshesStats(const bool isIncremental, const bool add); + }; + + StatusWith<ChunkManager> _getCollectionRoutingInfoAt(OperationContext* opCtx, + const NamespaceString& nss, + boost::optional<Timestamp> atClusterTime); // Interface from which chunks will be retrieved CatalogCacheLoader& _cacheLoader; @@ -557,23 +378,6 @@ private: // combined AtomicWord<long long> totalRefreshWaitTimeMicros{0}; - // Tracks how many incremental refreshes are waiting to complete currently - AtomicWord<long long> numActiveIncrementalRefreshes{0}; - - // Cumulative, always-increasing counter of how many incremental refreshes have been kicked - // off - AtomicWord<long long> countIncrementalRefreshesStarted{0}; - - // Tracks how many full refreshes are waiting to complete currently - AtomicWord<long long> numActiveFullRefreshes{0}; - - // Cumulative, always-increasing counter of how many full refreshes have been kicked off - AtomicWord<long long> countFullRefreshesStarted{0}; - - // Cumulative, always-increasing counter of how many full or incremental refreshes failed - // for whatever reason - AtomicWord<long long> countFailedRefreshes{0}; - // Cumulative, always-increasing counter of how many operations have been blocked by a // catalog cache refresh. Broken down by operation type to match the operations tracked // by the OpCounters class. 
@@ -595,15 +399,9 @@ private: std::shared_ptr<ThreadPool> _executor; - DatabaseCache _databaseCache; - // Mutex to serialize access to the collection cache - mutable Mutex _mutex = MONGO_MAKE_LATCH("CatalogCache::_mutex"); - // Map from full collection name to the routing info for that collection, grouped by database - using CollectionInfoMap = StringMap<std::shared_ptr<CollectionRoutingInfoEntry>>; - using CollectionsByDbMap = StringMap<CollectionInfoMap>; - CollectionsByDbMap _collectionsByDb; + CollectionCache _collectionCache; }; } // namespace mongo diff --git a/src/mongo/s/catalog_cache_refresh_test.cpp b/src/mongo/s/catalog_cache_refresh_test.cpp index 70b56845eb1..1e21135a15b 100644 --- a/src/mongo/s/catalog_cache_refresh_test.cpp +++ b/src/mongo/s/catalog_cache_refresh_test.cpp @@ -440,7 +440,7 @@ TEST_F(CatalogCacheRefreshTest, IncrementalLoadMissingChunkWithLowestVersion) { ASSERT_EQ(1, initialRoutingInfo.numChunks()); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); const auto incompleteChunks = [&]() { ChunkVersion version(1, 0, epoch); @@ -497,7 +497,7 @@ TEST_F(CatalogCacheRefreshTest, IncrementalLoadMissingChunkWithHighestVersion) { ASSERT_EQ(1, initialRoutingInfo.numChunks()); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); const auto incompleteChunks = [&]() { ChunkVersion version(1, 0, epoch); @@ -551,7 +551,7 @@ TEST_F(CatalogCacheRefreshTest, ChunkEpochChangeDuringIncrementalLoad) { auto initialRoutingInfo(makeChunkManager(kNss, shardKeyPattern, nullptr, true, {})); ASSERT_EQ(1, initialRoutingInfo.numChunks()); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); ChunkVersion version = initialRoutingInfo.getVersion(); @@ -598,7 +598,7 @@ TEST_F(CatalogCacheRefreshTest, ChunkEpochChangeDuringIncrementalLoadRecoveryAft setupNShards(2); - auto future 
= scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); ChunkVersion oldVersion = initialRoutingInfo.getVersion(); const OID newEpoch = OID::gen(); @@ -683,7 +683,7 @@ TEST_F(CatalogCacheRefreshTest, IncrementalLoadAfterCollectionEpochChange) { setupNShards(2); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); ChunkVersion newVersion(1, 0, OID::gen()); @@ -730,7 +730,7 @@ TEST_F(CatalogCacheRefreshTest, IncrementalLoadAfterSplit) { ChunkVersion version = initialRoutingInfo.getVersion(); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); expectGetCollection(version.epoch(), shardKeyPattern); @@ -776,7 +776,7 @@ TEST_F(CatalogCacheRefreshTest, IncrementalLoadAfterMoveWithReshardingFieldsAdde ChunkVersion version = initialRoutingInfo.getVersion(); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); ChunkVersion expectedDestShardVersion; @@ -824,7 +824,7 @@ TEST_F(CatalogCacheRefreshTest, IncrementalLoadAfterMoveLastChunkWithReshardingF ChunkVersion version = initialRoutingInfo.getVersion(); - auto future = scheduleRoutingInfoForcedRefresh(kNss); + auto future = scheduleRoutingInfoIncrementalRefresh(kNss); // The collection type won't have resharding fields this time. 
expectGetCollection(version.epoch(), shardKeyPattern); diff --git a/src/mongo/s/catalog_cache_test.cpp b/src/mongo/s/catalog_cache_test.cpp index fce177bdd4f..8fdb461aca3 100644 --- a/src/mongo/s/catalog_cache_test.cpp +++ b/src/mongo/s/catalog_cache_test.cpp @@ -35,6 +35,7 @@ #include "mongo/s/catalog_cache.h" #include "mongo/s/catalog_cache_loader_mock.h" #include "mongo/s/sharding_router_test_fixture.h" +#include "mongo/s/stale_exception.h" namespace mongo { namespace { @@ -72,7 +73,54 @@ protected: _catalogCacheLoader->setDatabaseRefreshReturnValue(kErrorStatus); } + void loadCollection(const ChunkVersion& version) { + const auto coll = makeCollectionType(version); + _catalogCacheLoader->setCollectionRefreshReturnValue(coll); + _catalogCacheLoader->setChunkRefreshReturnValue(makeChunks(version)); + + const auto swChunkManager = + _catalogCache->getCollectionRoutingInfo(operationContext(), coll.getNs()); + ASSERT_OK(swChunkManager.getStatus()); + + // Reset the loader return values to avoid false positive results + _catalogCacheLoader->setCollectionRefreshReturnValue(kErrorStatus); + _catalogCacheLoader->setChunkRefreshReturnValue(kErrorStatus); + } + + void loadUnshardedCollection(const NamespaceString& nss) { + _catalogCacheLoader->setCollectionRefreshReturnValue( + Status(ErrorCodes::NamespaceNotFound, "collection not found")); + + const auto swChunkManager = + _catalogCache->getCollectionRoutingInfo(operationContext(), nss); + ASSERT_OK(swChunkManager.getStatus()); + + // Reset the loader return value to avoid false positive results + _catalogCacheLoader->setCollectionRefreshReturnValue(kErrorStatus); + } + + std::vector<ChunkType> makeChunks(ChunkVersion version) { + ChunkType chunk(kNss, + {kShardKeyPattern.getKeyPattern().globalMin(), + kShardKeyPattern.getKeyPattern().globalMax()}, + version, + {"0"}); + chunk.setName(OID::gen()); + return {chunk}; + } + + CollectionType makeCollectionType(const ChunkVersion& collVersion) { + CollectionType coll; + 
coll.setNs(kNss); + coll.setEpoch(collVersion.epoch()); + coll.setKeyPattern(kShardKeyPattern.getKeyPattern()); + coll.setUnique(false); + return coll; + } + const NamespaceString kNss{"catalgoCacheTestDB.foo"}; + const std::string kPattern{"_id"}; + const ShardKeyPattern kShardKeyPattern{BSON(kPattern << 1)}; const int kDummyPort{12345}; const HostAndPort kConfigHostAndPort{"DummyConfig", kDummyPort}; const std::vector<ShardId> kShards{{"0"}, {"1"}}; @@ -129,5 +177,86 @@ TEST_F(CatalogCacheTest, InvalidateSingleDbOnShardRemoval) { ASSERT_EQ(cachedDb.primaryId(), kShards[1]); } +TEST_F(CatalogCacheTest, CheckEpochNoDatabase) { + const auto collVersion = ChunkVersion(1, 0, OID::gen()); + ASSERT_THROWS_WITH_CHECK(_catalogCache->checkEpochOrThrow(kNss, collVersion, kShards[0]), + StaleConfigException, + [&](const StaleConfigException& ex) { + const auto staleInfo = ex.extraInfo<StaleConfigInfo>(); + ASSERT(staleInfo); + ASSERT_EQ(staleInfo->getNss(), kNss); + ASSERT_EQ(staleInfo->getVersionReceived(), collVersion); + ASSERT_EQ(staleInfo->getShardId(), kShards[0]); + ASSERT(staleInfo->getVersionWanted() == boost::none); + }); +} + +TEST_F(CatalogCacheTest, CheckEpochNoCollection) { + const auto dbVersion = DatabaseVersion(); + const auto collVersion = ChunkVersion(1, 0, OID::gen()); + + loadDatabases({DatabaseType(kNss.db().toString(), kShards[0], true, dbVersion)}); + ASSERT_THROWS_WITH_CHECK(_catalogCache->checkEpochOrThrow(kNss, collVersion, kShards[0]), + StaleConfigException, + [&](const StaleConfigException& ex) { + const auto staleInfo = ex.extraInfo<StaleConfigInfo>(); + ASSERT(staleInfo); + ASSERT_EQ(staleInfo->getNss(), kNss); + ASSERT_EQ(staleInfo->getVersionReceived(), collVersion); + ASSERT_EQ(staleInfo->getShardId(), kShards[0]); + ASSERT(staleInfo->getVersionWanted() == boost::none); + }); +} + +TEST_F(CatalogCacheTest, CheckEpochUnshardedCollection) { + const auto dbVersion = DatabaseVersion(); + const auto collVersion = ChunkVersion(1, 0, OID::gen()); 
+ + loadDatabases({DatabaseType(kNss.db().toString(), kShards[0], true, dbVersion)}); + loadUnshardedCollection(kNss); + ASSERT_THROWS_WITH_CHECK(_catalogCache->checkEpochOrThrow(kNss, collVersion, kShards[0]), + StaleConfigException, + [&](const StaleConfigException& ex) { + const auto staleInfo = ex.extraInfo<StaleConfigInfo>(); + ASSERT(staleInfo); + ASSERT_EQ(staleInfo->getNss(), kNss); + ASSERT_EQ(staleInfo->getVersionReceived(), collVersion); + ASSERT_EQ(staleInfo->getShardId(), kShards[0]); + ASSERT(staleInfo->getVersionWanted() == boost::none); + }); +} + +TEST_F(CatalogCacheTest, CheckEpochWithMismatch) { + const auto dbVersion = DatabaseVersion(); + const auto wantedCollVersion = ChunkVersion(1, 0, OID::gen()); + const auto receivedCollVersion = ChunkVersion(1, 0, OID::gen()); + + loadDatabases({DatabaseType(kNss.db().toString(), kShards[0], true, dbVersion)}); + loadCollection(wantedCollVersion); + + ASSERT_THROWS_WITH_CHECK( + _catalogCache->checkEpochOrThrow(kNss, receivedCollVersion, kShards[0]), + StaleConfigException, + [&](const StaleConfigException& ex) { + const auto staleInfo = ex.extraInfo<StaleConfigInfo>(); + ASSERT(staleInfo); + ASSERT_EQ(staleInfo->getNss(), kNss); + ASSERT_EQ(staleInfo->getVersionReceived(), receivedCollVersion); + ASSERT(staleInfo->getVersionWanted() != boost::none); + ASSERT_EQ(*(staleInfo->getVersionWanted()), wantedCollVersion); + ASSERT_EQ(staleInfo->getShardId(), kShards[0]); + }); +} + +TEST_F(CatalogCacheTest, CheckEpochWithMatch) { + const auto dbVersion = DatabaseVersion(); + const auto collVersion = ChunkVersion(1, 0, OID::gen()); + + loadDatabases({DatabaseType(kNss.db().toString(), kShards[0], true, dbVersion)}); + loadCollection(collVersion); + + _catalogCache->checkEpochOrThrow(kNss, collVersion, kShards[0]); +} + } // namespace } // namespace mongo diff --git a/src/mongo/s/catalog_cache_test_fixture.cpp b/src/mongo/s/catalog_cache_test_fixture.cpp index 71e02e67fac..4f59eeaef8a 100644 --- 
a/src/mongo/s/catalog_cache_test_fixture.cpp +++ b/src/mongo/s/catalog_cache_test_fixture.cpp @@ -81,6 +81,26 @@ CatalogCacheTestFixture::scheduleRoutingInfoUnforcedRefresh(const NamespaceStrin }); } +executor::NetworkTestEnv::FutureHandle<boost::optional<ChunkManager>> +CatalogCacheTestFixture::scheduleRoutingInfoIncrementalRefresh(const NamespaceString& nss) { + auto catalogCache = Grid::get(getServiceContext())->catalogCache(); + const auto cm = + uassertStatusOK(catalogCache->getCollectionRoutingInfo(operationContext(), nss)); + ASSERT(cm.isSharded()); + + // Simulates the shard wanting a higher version than the one sent by the router. + catalogCache->invalidateShardOrEntireCollectionEntryForShardedCollection( + nss, boost::none, cm.dbPrimary()); + + return launchAsync([this, nss] { + auto client = getServiceContext()->makeClient("Test"); + auto const catalogCache = Grid::get(getServiceContext())->catalogCache(); + + return boost::make_optional( + uassertStatusOK(catalogCache->getCollectionRoutingInfo(operationContext(), nss))); + }); +} + std::vector<ShardType> CatalogCacheTestFixture::setupNShards(int numShards) { std::vector<ShardType> shards; for (int i = 0; i < numShards; i++) { diff --git a/src/mongo/s/catalog_cache_test_fixture.h b/src/mongo/s/catalog_cache_test_fixture.h index fb5238a2ba9..3d58f6a8557 100644 --- a/src/mongo/s/catalog_cache_test_fixture.h +++ b/src/mongo/s/catalog_cache_test_fixture.h @@ -84,6 +84,17 @@ protected: scheduleRoutingInfoUnforcedRefresh(const NamespaceString& nss); /** + * Advance the time in the cache for 'kNss' and schedules a thread to make an incremental + * refresh. + * + * NOTE: The returned value is always set. The reason to use optional is a deficiency of + * std::future with the MSVC STL library, which requires the templated type to be default + * constructible. 
+ */ + executor::NetworkTestEnv::FutureHandle<boost::optional<ChunkManager>> + scheduleRoutingInfoIncrementalRefresh(const NamespaceString& nss); + + /** * Ensures that there are 'numShards' available in the shard registry. The shard ids are * generated as "0", "1", etc. * diff --git a/src/mongo/s/chunk_manager.cpp b/src/mongo/s/chunk_manager.cpp index 5713855e01f..9ded562066c 100644 --- a/src/mongo/s/chunk_manager.cpp +++ b/src/mongo/s/chunk_manager.cpp @@ -336,22 +336,23 @@ void RoutingTableHistory::setAllShardsRefreshed() { } Chunk ChunkManager::findIntersectingChunk(const BSONObj& shardKey, const BSONObj& collation) const { - const bool hasSimpleCollation = (collation.isEmpty() && !_rt->getDefaultCollator()) || + const bool hasSimpleCollation = (collation.isEmpty() && !_rt->optRt->getDefaultCollator()) || SimpleBSONObjComparator::kInstance.evaluate(collation == CollationSpec::kSimpleSpec); if (!hasSimpleCollation) { for (BSONElement elt : shardKey) { uassert(ErrorCodes::ShardKeyNotFound, str::stream() << "Cannot target single shard due to collation of key " - << elt.fieldNameStringData() << " for namespace " << _rt->nss(), + << elt.fieldNameStringData() << " for namespace " + << _rt->optRt->nss(), !CollationIndexKey::isCollatableType(elt.type())); } } - auto chunkInfo = _rt->findIntersectingChunk(shardKey); + auto chunkInfo = _rt->optRt->findIntersectingChunk(shardKey); uassert(ErrorCodes::ShardKeyNotFound, str::stream() << "Cannot target single shard using key " << shardKey - << " for namespace " << _rt->nss(), + << " for namespace " << _rt->optRt->nss(), chunkInfo && chunkInfo->containsKey(shardKey)); return Chunk(*chunkInfo, _clusterTime); @@ -361,7 +362,7 @@ bool ChunkManager::keyBelongsToShard(const BSONObj& shardKey, const ShardId& sha if (shardKey.isEmpty()) return false; - auto chunkInfo = _rt->findIntersectingChunk(shardKey); + auto chunkInfo = _rt->optRt->findIntersectingChunk(shardKey); if (!chunkInfo) return false; @@ -374,7 +375,7 @@ void 
ChunkManager::getShardIdsForQuery(boost::intrusive_ptr<ExpressionContext> e const BSONObj& query, const BSONObj& collation, std::set<ShardId>* shardIds) const { - auto qr = std::make_unique<QueryRequest>(_rt->nss()); + auto qr = std::make_unique<QueryRequest>(_rt->optRt->nss()); qr->setFilter(query); if (auto uuid = getUUID()) @@ -382,8 +383,8 @@ void ChunkManager::getShardIdsForQuery(boost::intrusive_ptr<ExpressionContext> e if (!collation.isEmpty()) { qr->setCollation(collation); - } else if (_rt->getDefaultCollator()) { - auto defaultCollator = _rt->getDefaultCollator(); + } else if (_rt->optRt->getDefaultCollator()) { + auto defaultCollator = _rt->optRt->getDefaultCollator(); qr->setCollation(defaultCollator->getSpec().toBSON()); expCtx->setCollator(defaultCollator->clone()); } @@ -396,7 +397,7 @@ void ChunkManager::getShardIdsForQuery(boost::intrusive_ptr<ExpressionContext> e MatchExpressionParser::kAllowAllSpecialFeatures)); // Fast path for targeting equalities on the shard key. - auto shardKeyToFind = _rt->getShardKeyPattern().extractShardKeyFromQuery(*cq); + auto shardKeyToFind = _rt->optRt->getShardKeyPattern().extractShardKeyFromQuery(*cq); if (!shardKeyToFind.isEmpty()) { try { auto chunk = findIntersectingChunk(shardKeyToFind, collation); @@ -413,14 +414,14 @@ void ChunkManager::getShardIdsForQuery(boost::intrusive_ptr<ExpressionContext> e // Query { a : { $gte : 1, $lt : 2 }, // b : { $gte : 3, $lt : 4 } } // => Bounds { a : [1, 2), b : [3, 4) } - IndexBounds bounds = getIndexBoundsForQuery(_rt->getShardKeyPattern().toBSON(), *cq); + IndexBounds bounds = getIndexBoundsForQuery(_rt->optRt->getShardKeyPattern().toBSON(), *cq); // Transforms bounds for each shard key field into full shard key ranges // for example : // Key { a : 1, b : 1 } // Bounds { a : [1, 2), b : [3, 4) } // => Ranges { a : 1, b : 3 } => { a : 2, b : 4 } - BoundList ranges = _rt->getShardKeyPattern().flattenBounds(bounds); + BoundList ranges = 
_rt->optRt->getShardKeyPattern().flattenBounds(bounds); for (BoundList::const_iterator it = ranges.begin(); it != ranges.end(); ++it) { getShardIdsForRange(it->first /*min*/, it->second /*max*/, shardIds); @@ -430,7 +431,7 @@ void ChunkManager::getShardIdsForQuery(boost::intrusive_ptr<ExpressionContext> e // because _shardVersions contains shards with chunks and is built based on the last // refresh. Therefore, it is possible for _shardVersions to have fewer entries if a shard // no longer owns chunks when it used to at _clusterTime. - if (!_clusterTime && shardIds->size() == _rt->_shardVersions.size()) { + if (!_clusterTime && shardIds->size() == _rt->optRt->_shardVersions.size()) { break; } } @@ -439,7 +440,7 @@ void ChunkManager::getShardIdsForQuery(boost::intrusive_ptr<ExpressionContext> e // For now, we satisfy that assumption by adding a shard with no matches rather than returning // an empty set of shards. if (shardIds->empty()) { - _rt->forEachChunk([&](const std::shared_ptr<ChunkInfo>& chunkInfo) { + _rt->optRt->forEachChunk([&](const std::shared_ptr<ChunkInfo>& chunkInfo) { shardIds->insert(chunkInfo->getShardIdAt(_clusterTime)); return false; }); @@ -459,7 +460,7 @@ void ChunkManager::getShardIdsForRange(const BSONObj& min, return; } - _rt->forEachOverlappingChunk(min, max, true, [&](auto& chunkInfo) { + _rt->optRt->forEachOverlappingChunk(min, max, true, [&](auto& chunkInfo) { shardIds->insert(chunkInfo->getShardIdAt(_clusterTime)); // No need to iterate through the rest of the ranges, because we already know we need to use @@ -467,7 +468,7 @@ void ChunkManager::getShardIdsForRange(const BSONObj& min, // because _shardVersions contains shards with chunks and is built based on the last // refresh. Therefore, it is possible for _shardVersions to have fewer entries if a shard // no longer owns chunks when it used to at _clusterTime. 
- if (!_clusterTime && shardIds->size() == _rt->_shardVersions.size()) { + if (!_clusterTime && shardIds->size() == _rt->optRt->_shardVersions.size()) { return false; } @@ -478,14 +479,15 @@ void ChunkManager::getShardIdsForRange(const BSONObj& min, bool ChunkManager::rangeOverlapsShard(const ChunkRange& range, const ShardId& shardId) const { bool overlapFound = false; - _rt->forEachOverlappingChunk(range.getMin(), range.getMax(), false, [&](auto& chunkInfo) { - if (chunkInfo->getShardIdAt(_clusterTime) == shardId) { - overlapFound = true; - return false; - } + _rt->optRt->forEachOverlappingChunk( + range.getMin(), range.getMax(), false, [&](auto& chunkInfo) { + if (chunkInfo->getShardIdAt(_clusterTime) == shardId) { + overlapFound = true; + return false; + } - return true; - }); + return true; + }); return overlapFound; } @@ -494,7 +496,7 @@ boost::optional<Chunk> ChunkManager::getNextChunkOnShard(const BSONObj& shardKey const ShardId& shardId) const { boost::optional<Chunk> chunk; - _rt->forEachChunk( + _rt->optRt->forEachChunk( [&](auto& chunkInfo) { if (chunkInfo->getShardIdAt(_clusterTime) == shardId) { chunk.emplace(*chunkInfo, _clusterTime); @@ -654,7 +656,7 @@ ChunkManager ChunkManager::makeAtTime(const ChunkManager& cm, Timestamp clusterT } std::string ChunkManager::toString() const { - return _rt ? _rt->toString() : "UNSHARDED"; + return _rt->optRt ? _rt->optRt->toString() : "UNSHARDED"; } bool RoutingTableHistory::compatibleWith(const RoutingTableHistory& other, @@ -733,7 +735,7 @@ RoutingTableHistory RoutingTableHistory::makeUpdated( auto changedChunkInfos = flatten(changedChunks); auto chunkMap = _chunkMap.createMerged(changedChunkInfos); - // If at least one diff was applied, the collection's version must have advanced + // Only update the same collection. 
invariant(getVersion().epoch() == chunkMap.getVersion().epoch()); return RoutingTableHistory(_nss, @@ -745,4 +747,60 @@ RoutingTableHistory RoutingTableHistory::makeUpdated( std::move(chunkMap)); } +AtomicWord<uint64_t> ComparableChunkVersion::_epochDisambiguatingSequenceNumSource{1ULL}; +AtomicWord<uint64_t> ComparableChunkVersion::_forcedRefreshSequenceNumSource{1ULL}; + +ComparableChunkVersion ComparableChunkVersion::makeComparableChunkVersion( + const ChunkVersion& version) { + return ComparableChunkVersion(_forcedRefreshSequenceNumSource.load(), + version, + _epochDisambiguatingSequenceNumSource.fetchAndAdd(1)); +} + +ComparableChunkVersion ComparableChunkVersion::makeComparableChunkVersionForForcedRefresh() { + return ComparableChunkVersion(_forcedRefreshSequenceNumSource.addAndFetch(2) - 1, + boost::none, + _epochDisambiguatingSequenceNumSource.fetchAndAdd(1)); +} + +std::string ComparableChunkVersion::toString() const { + return str::stream() << _forcedRefreshSequenceNum << "|" + << (_chunkVersion ? 
_chunkVersion->toString() : "NONE") << "|" + << _epochDisambiguatingSequenceNum; +} + +bool ComparableChunkVersion::operator==(const ComparableChunkVersion& other) const { + if (_forcedRefreshSequenceNum == other._forcedRefreshSequenceNum) { + if (_forcedRefreshSequenceNum == 0) + return true; // Default constructed value + + if (sameEpoch(other)) { + if (_chunkVersion->majorVersion() == 0 && other._chunkVersion->majorVersion() == 0) { + return _chunkVersion->epoch() == OID(); + } + return _chunkVersion->majorVersion() == other._chunkVersion->majorVersion() && + _chunkVersion->minorVersion() == other._chunkVersion->minorVersion(); + } + } + return false; +} + +bool ComparableChunkVersion::operator<(const ComparableChunkVersion& other) const { + if (_forcedRefreshSequenceNum < other._forcedRefreshSequenceNum) + return true; + if (_forcedRefreshSequenceNum > other._forcedRefreshSequenceNum) + return false; + if (_forcedRefreshSequenceNum == 0) + return false; // Default constructed value + + if (sameEpoch(other) && other._chunkVersion->epoch() != OID() && + _chunkVersion->majorVersion() != 0 && other._chunkVersion->majorVersion() != 0) { + return _chunkVersion->majorVersion() < other._chunkVersion->majorVersion() || + (_chunkVersion->majorVersion() == other._chunkVersion->majorVersion() && + _chunkVersion->minorVersion() < other._chunkVersion->minorVersion()); + } else { + return _epochDisambiguatingSequenceNum < other._epochDisambiguatingSequenceNum; + } +} + } // namespace mongo diff --git a/src/mongo/s/chunk_manager.h b/src/mongo/s/chunk_manager.h index 7f25a810a4a..e694a94c201 100644 --- a/src/mongo/s/chunk_manager.h +++ b/src/mongo/s/chunk_manager.h @@ -43,6 +43,7 @@ #include "mongo/s/shard_key_pattern.h" #include "mongo/stdx/unordered_map.h" #include "mongo/util/concurrency/ticketholder.h" +#include "mongo/util/read_through_cache.h" namespace mongo { @@ -324,13 +325,128 @@ private: }; /** + * Constructed to be used exclusively by the CatalogCache as a vector 
clock (Time) to drive + * CollectionCache's lookups. + * + * The ChunkVersion class contains a non comparable epoch, which makes impossible to compare two + * ChunkVersions when their epochs's differ. + * + * This class wraps a ChunkVersion object with a node-local sequence number + * (_epochDisambiguatingSequenceNum) that allows the comparision. + * + * This class should go away once a cluster-wide comparable ChunkVersion is implemented. + */ +class ComparableChunkVersion { +public: + /** + * Creates a ComparableChunkVersion that wraps the given ChunkVersion. + * Each object created through this method will have a local sequence number greater than the + * previously created ones. + */ + static ComparableChunkVersion makeComparableChunkVersion(const ChunkVersion& version); + + /** + * Creates a ComparableChunkVersion object, which will artificially be greater than any that + * were previously created by `makeComparableChunkVersion`. Used as means to cause the + * collections cache to attempt a refresh in situations where causal consistency cannot be + * inferred. + */ + static ComparableChunkVersion makeComparableChunkVersionForForcedRefresh(); + + /** + * Empty constructor needed by the ReadThroughCache. + * + * Instances created through this constructor will be always less then the ones created through + * the two static constructors, but they do not carry any meaningful value and can only be used + * for comparison purposes. 
+ */ + ComparableChunkVersion() = default; + + const ChunkVersion& getVersion() const { + return *_chunkVersion; + } + + std::string toString() const; + + bool sameEpoch(const ComparableChunkVersion& other) const { + return _chunkVersion->epoch() == other._chunkVersion->epoch(); + } + + bool operator==(const ComparableChunkVersion& other) const; + + bool operator!=(const ComparableChunkVersion& other) const { + return !(*this == other); + } + + /** + * In case the two compared instances have different epochs, the most recently created one will + * be greater, otherwise the comparision will be driven by the major/minor versions of the + * underlying ChunkVersion. + */ + bool operator<(const ComparableChunkVersion& other) const; + + bool operator>(const ComparableChunkVersion& other) const { + return other < *this; + } + + bool operator<=(const ComparableChunkVersion& other) const { + return !(*this > other); + } + + bool operator>=(const ComparableChunkVersion& other) const { + return !(*this < other); + } + +private: + static AtomicWord<uint64_t> _epochDisambiguatingSequenceNumSource; + static AtomicWord<uint64_t> _forcedRefreshSequenceNumSource; + + ComparableChunkVersion(uint64_t forcedRefreshSequenceNum, + boost::optional<ChunkVersion> version, + uint64_t epochDisambiguatingSequenceNum) + : _forcedRefreshSequenceNum(forcedRefreshSequenceNum), + _chunkVersion(std::move(version)), + _epochDisambiguatingSequenceNum(epochDisambiguatingSequenceNum) {} + + uint64_t _forcedRefreshSequenceNum{0}; + + boost::optional<ChunkVersion> _chunkVersion; + + // Locally incremented sequence number that allows to compare two colection versions with + // different epochs. Each new comparableChunkVersion will have a greater sequence number than + // the ones created before. + uint64_t _epochDisambiguatingSequenceNum{0}; +}; + +/** + * This intermediate structure is necessary to be able to store UNSHARDED collections in the routing + * table history cache below. 
The reason is that currently the RoutingTableHistory class only + * supports sharded collections (i.e., collections which have entries in config.collections and + * config.chunks). + */ +struct OptionalRoutingTableHistory { + // UNSHARDED collection constructor + OptionalRoutingTableHistory() = default; + + // SHARDED collection constructor + OptionalRoutingTableHistory(RoutingTableHistory&& rt) : optRt(std::move(rt)) {} + + // If boost::none, the collection is UNSHARDED, otherwise it is SHARDED + boost::optional<RoutingTableHistory> optRt; +}; + +using RoutingTableHistoryCache = + ReadThroughCache<NamespaceString, OptionalRoutingTableHistory, ComparableChunkVersion>; +using RoutingTableHistoryValueHandle = RoutingTableHistoryCache::ValueHandle; + +/** * Wrapper around a RoutingTableHistory, which pins it to a particular point in time. */ class ChunkManager { public: ChunkManager(ShardId dbPrimary, DatabaseVersion dbVersion, - std::shared_ptr<RoutingTableHistory> rt, + RoutingTableHistoryValueHandle rt, boost::optional<Timestamp> clusterTime) : _dbPrimary(std::move(dbPrimary)), _dbVersion(std::move(dbVersion)), @@ -340,7 +456,7 @@ public: // Methods supported on both sharded and unsharded collections bool isSharded() const { - return bool(_rt); + return bool(_rt->optRt); } const ShardId& dbPrimary() const { @@ -352,7 +468,7 @@ public: } int numChunks() const { - return _rt ? _rt->numChunks() : 1; + return _rt->optRt ? 
_rt->optRt->numChunks() : 1; } std::string toString() const; @@ -360,32 +476,32 @@ public: // Methods only supported on sharded collections (caller must check isSharded()) const ShardKeyPattern& getShardKeyPattern() const { - return _rt->getShardKeyPattern(); + return _rt->optRt->getShardKeyPattern(); } const CollatorInterface* getDefaultCollator() const { - return _rt->getDefaultCollator(); + return _rt->optRt->getDefaultCollator(); } bool isUnique() const { - return _rt->isUnique(); + return _rt->optRt->isUnique(); } ChunkVersion getVersion() const { - return _rt->getVersion(); + return _rt->optRt->getVersion(); } ChunkVersion getVersion(const ShardId& shardId) const { - return _rt->getVersion(shardId); + return _rt->optRt->getVersion(shardId); } ChunkVersion getVersionForLogging(const ShardId& shardId) const { - return _rt->getVersionForLogging(shardId); + return _rt->optRt->getVersionForLogging(shardId); } template <typename Callable> void forEachChunk(Callable&& handler) const { - _rt->forEachChunk( + _rt->optRt->forEachChunk( [this, handler = std::forward<Callable>(handler)](const auto& chunkInfo) mutable { if (!handler(Chunk{*chunkInfo, _clusterTime})) return false; @@ -461,14 +577,14 @@ public: * Returns the ids of all shards on which the collection has any chunks. 
*/ void getAllShardIds(std::set<ShardId>* all) const { - _rt->getAllShardIds(all); + _rt->optRt->getAllShardIds(all); } /** * Returns the number of shards on which the collection has any chunks */ int getNShardsOwningChunks() const { - return _rt->getNShardsOwningChunks(); + return _rt->optRt->getNShardsOwningChunks(); } // Transforms query into bounds for each field in the shard key @@ -500,30 +616,30 @@ public: * Returns true if, for this shard, the chunks are identical in both chunk managers */ bool compatibleWith(const ChunkManager& other, const ShardId& shard) const { - return _rt->compatibleWith(*other._rt, shard); + return _rt->optRt->compatibleWith(*other._rt->optRt, shard); } bool uuidMatches(UUID uuid) const { - return _rt->uuidMatches(uuid); + return _rt->optRt->uuidMatches(uuid); } boost::optional<UUID> getUUID() const { - return _rt->getUUID(); + return _rt->optRt->getUUID(); } const boost::optional<TypeCollectionReshardingFields>& getReshardingFields() const { - return _rt->getReshardingFields(); + return _rt->optRt->getReshardingFields(); } const RoutingTableHistory& getRoutingTableHistory_ForTest() const { - return *_rt; + return *_rt->optRt; } private: ShardId _dbPrimary; DatabaseVersion _dbVersion; - std::shared_ptr<RoutingTableHistory> _rt; + RoutingTableHistoryValueHandle _rt; boost::optional<Timestamp> _clusterTime; }; diff --git a/src/mongo/s/chunk_manager_refresh_bm.cpp b/src/mongo/s/chunk_manager_refresh_bm.cpp index a3feba2de1e..bd9b133301c 100644 --- a/src/mongo/s/chunk_manager_refresh_bm.cpp +++ b/src/mongo/s/chunk_manager_refresh_bm.cpp @@ -43,8 +43,10 @@ namespace { const NamespaceString kNss("test", "foo"); -std::shared_ptr<RoutingTableHistory> makeStandaloneRoutingTableHistory(RoutingTableHistory rt) { - return std::make_shared<RoutingTableHistory>(std::move(rt)); +RoutingTableHistoryValueHandle makeStandaloneRoutingTableHistory(RoutingTableHistory rt) { + const auto version = rt.getVersion(); + return RoutingTableHistoryValueHandle( 
+ std::move(rt), ComparableChunkVersion::makeComparableChunkVersion(version)); } ChunkRange getRangeForChunk(int i, int nChunks) { @@ -69,6 +71,7 @@ CollectionMetadata makeChunkManagerWithShardSelector(int nShards, std::vector<ChunkType> chunks; chunks.reserve(nChunks); + for (uint32_t i = 0; i < nChunks; ++i) { chunks.emplace_back(kNss, getRangeForChunk(i, nChunks), @@ -144,13 +147,13 @@ auto BM_FullBuildOfChunkManager(benchmark::State& state, ShardSelectorFn selectS const uint32_t nChunks = state.range(1); const auto collEpoch = OID::gen(); - const auto collName = NamespaceString("test.foo"); const auto shardKeyPattern = KeyPattern(BSON("_id" << 1)); std::vector<ChunkType> chunks; chunks.reserve(nChunks); + for (uint32_t i = 0; i < nChunks; ++i) { - chunks.emplace_back(collName, + chunks.emplace_back(kNss, getRangeForChunk(i, nChunks), ChunkVersion{i + 1, 0, collEpoch}, selectShard(i, nShards, nChunks)); @@ -158,7 +161,7 @@ auto BM_FullBuildOfChunkManager(benchmark::State& state, ShardSelectorFn selectS for (auto keepRunning : state) { auto rt = RoutingTableHistory::makeNew( - collName, UUID::gen(), shardKeyPattern, nullptr, true, collEpoch, boost::none, chunks); + kNss, UUID::gen(), shardKeyPattern, nullptr, true, collEpoch, boost::none, chunks); benchmark::DoNotOptimize( CollectionMetadata(ChunkManager(ShardId("shard0"), DatabaseVersion(UUID::gen(), 1), diff --git a/src/mongo/s/commands/SConscript b/src/mongo/s/commands/SConscript index 780d4d4bc9a..8fc761a2e0b 100644 --- a/src/mongo/s/commands/SConscript +++ b/src/mongo/s/commands/SConscript @@ -124,6 +124,7 @@ env.Library( '$BUILD_DIR/mongo/db/commands/test_commands_enabled', '$BUILD_DIR/mongo/db/commands/write_commands_common', '$BUILD_DIR/mongo/db/ftdc/ftdc_server', + '$BUILD_DIR/mongo/db/initialize_api_parameters', '$BUILD_DIR/mongo/db/logical_session_cache_impl', '$BUILD_DIR/mongo/db/pipeline/aggregation', '$BUILD_DIR/mongo/db/query/command_request_response', diff --git 
a/src/mongo/s/commands/cluster_drop_cmd.cpp b/src/mongo/s/commands/cluster_drop_cmd.cpp index a69e3292597..f727489ccc0 100644 --- a/src/mongo/s/commands/cluster_drop_cmd.cpp +++ b/src/mongo/s/commands/cluster_drop_cmd.cpp @@ -88,7 +88,9 @@ public: // Invalidate the routing table cache entry for this collection so that we reload it the // next time it is accessed, even if sending the command to the config server fails due // to e.g. a NetworkError. - ON_BLOCK_EXIT([opCtx, nss] { Grid::get(opCtx)->catalogCache()->onEpochChange(nss); }); + ON_BLOCK_EXIT([opCtx, nss] { + Grid::get(opCtx)->catalogCache()->invalidateCollectionEntry_LINEARIZABLE(nss); + }); auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard(); auto cmdResponse = uassertStatusOK(configShard->runCommandWithFixedRetryAttempts( diff --git a/src/mongo/s/commands/cluster_merge_chunks_cmd.cpp b/src/mongo/s/commands/cluster_merge_chunks_cmd.cpp index b4157bee9d9..531aa1ab41e 100644 --- a/src/mongo/s/commands/cluster_merge_chunks_cmd.cpp +++ b/src/mongo/s/commands/cluster_merge_chunks_cmd.cpp @@ -174,8 +174,10 @@ public: Shard::RetryPolicy::kNotIdempotent)); uassertStatusOK(response.commandStatus); - Grid::get(opCtx)->catalogCache()->invalidateShardForShardedCollection( - nss, firstChunk.getShardId()); + Grid::get(opCtx) + ->catalogCache() + ->invalidateShardOrEntireCollectionEntryForShardedCollection( + nss, boost::none, firstChunk.getShardId()); CommandHelpers::filterCommandReplyForPassthrough(response.response, &result); return true; diff --git a/src/mongo/s/commands/cluster_move_chunk_cmd.cpp b/src/mongo/s/commands/cluster_move_chunk_cmd.cpp index 01cdb91234e..f6e2d27c80f 100644 --- a/src/mongo/s/commands/cluster_move_chunk_cmd.cpp +++ b/src/mongo/s/commands/cluster_move_chunk_cmd.cpp @@ -198,9 +198,14 @@ public: cmdObj["waitForDelete"].trueValue(), forceJumbo)); - Grid::get(opCtx)->catalogCache()->invalidateShardForShardedCollection(nss, - chunk->getShardId()); - 
Grid::get(opCtx)->catalogCache()->invalidateShardForShardedCollection(nss, to->getId()); + Grid::get(opCtx) + ->catalogCache() + ->invalidateShardOrEntireCollectionEntryForShardedCollection( + nss, boost::none, chunk->getShardId()); + Grid::get(opCtx) + ->catalogCache() + ->invalidateShardOrEntireCollectionEntryForShardedCollection( + nss, boost::none, to->getId()); result.append("millis", t.millis()); return true; diff --git a/src/mongo/s/commands/cluster_shard_collection_cmd.cpp b/src/mongo/s/commands/cluster_shard_collection_cmd.cpp index d27fd037d30..d4c4d7901ad 100644 --- a/src/mongo/s/commands/cluster_shard_collection_cmd.cpp +++ b/src/mongo/s/commands/cluster_shard_collection_cmd.cpp @@ -105,7 +105,9 @@ public: // Invalidate the routing table cache entry for this collection so that we reload the // collection the next time it's accessed, even if we receive a failure, e.g. NetworkError. - ON_BLOCK_EXIT([opCtx, nss] { Grid::get(opCtx)->catalogCache()->onEpochChange(nss); }); + ON_BLOCK_EXIT([opCtx, nss] { + Grid::get(opCtx)->catalogCache()->invalidateCollectionEntry_LINEARIZABLE(nss); + }); auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard(); auto cmdResponse = uassertStatusOK(configShard->runCommandWithFixedRetryAttempts( diff --git a/src/mongo/s/commands/cluster_split_cmd.cpp b/src/mongo/s/commands/cluster_split_cmd.cpp index 19d33b3f10b..5532fac1daf 100644 --- a/src/mongo/s/commands/cluster_split_cmd.cpp +++ b/src/mongo/s/commands/cluster_split_cmd.cpp @@ -270,8 +270,10 @@ public: ChunkRange(chunk->getMin(), chunk->getMax()), {splitPoint})); - Grid::get(opCtx)->catalogCache()->invalidateShardForShardedCollection(nss, - chunk->getShardId()); + Grid::get(opCtx) + ->catalogCache() + ->invalidateShardOrEntireCollectionEntryForShardedCollection( + nss, boost::none, chunk->getShardId()); return true; } diff --git a/src/mongo/s/commands/flush_router_config_cmd.cpp b/src/mongo/s/commands/flush_router_config_cmd.cpp index bcc61a82a0a..d27b65a2c4d 
100644 --- a/src/mongo/s/commands/flush_router_config_cmd.cpp +++ b/src/mongo/s/commands/flush_router_config_cmd.cpp @@ -102,7 +102,7 @@ public: "Routing metadata flushed for collection {namespace}", "Routing metadata flushed for collection", "namespace"_attr = nss); - catalogCache->purgeCollection(nss); + catalogCache->invalidateCollectionEntry_LINEARIZABLE(nss); } } diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp index 644c10e6bcb..f83b490d0ef 100644 --- a/src/mongo/s/commands/strategy.cpp +++ b/src/mongo/s/commands/strategy.cpp @@ -722,16 +722,12 @@ void runCommand(OperationContext* opCtx, auto catalogCache = Grid::get(opCtx)->catalogCache(); if (auto staleInfo = ex.extraInfo<StaleConfigInfo>()) { catalogCache->invalidateShardOrEntireCollectionEntryForShardedCollection( - opCtx, - staleNs, - staleInfo->getVersionWanted(), - staleInfo->getVersionReceived(), - staleInfo->getShardId()); + staleNs, staleInfo->getVersionWanted(), staleInfo->getShardId()); } else { // If we don't have the stale config info and therefore don't know the shard's // id, we have to force all further targetting requests for the namespace to // block on a refresh. - catalogCache->onEpochChange(staleNs); + catalogCache->invalidateCollectionEntry_LINEARIZABLE(staleNs); } @@ -1301,16 +1297,12 @@ void Strategy::explainFind(OperationContext* opCtx, Grid::get(opCtx) ->catalogCache() ->invalidateShardOrEntireCollectionEntryForShardedCollection( - opCtx, - staleNs, - staleInfo->getVersionWanted(), - staleInfo->getVersionReceived(), - staleInfo->getShardId()); + staleNs, staleInfo->getVersionWanted(), staleInfo->getShardId()); } else { // If we don't have the stale config info and therefore don't know the shard's id, // we have to force all further targetting requests for the namespace to block on // a refresh. 
- Grid::get(opCtx)->catalogCache()->onEpochChange(staleNs); + Grid::get(opCtx)->catalogCache()->invalidateCollectionEntry_LINEARIZABLE(staleNs); } if (canRetry) { diff --git a/src/mongo/s/comparable_chunk_version_test.cpp b/src/mongo/s/comparable_chunk_version_test.cpp index 941d9bad080..8c1fa71fce2 100644 --- a/src/mongo/s/comparable_chunk_version_test.cpp +++ b/src/mongo/s/comparable_chunk_version_test.cpp @@ -29,8 +29,7 @@ #include "mongo/platform/basic.h" -#include "mongo/s/catalog_cache.h" -#include "mongo/s/chunk_version.h" +#include "mongo/s/chunk_manager.h" #include "mongo/unittest/unittest.h" namespace mongo { @@ -95,9 +94,15 @@ TEST(ComparableChunkVersionTest, VersionLessSameEpoch) { ASSERT_FALSE(version2 > version3); } +TEST(ComparableChunkVersionTest, DefaultConstructedVersionsAreEqual) { + const ComparableChunkVersion defaultVersion1{}, defaultVersion2{}; + ASSERT(defaultVersion1 == defaultVersion2); + ASSERT_FALSE(defaultVersion1 < defaultVersion2); + ASSERT_FALSE(defaultVersion1 > defaultVersion2); +} + TEST(ComparableChunkVersionTest, DefaultConstructedVersionIsAlwaysLess) { const ComparableChunkVersion defaultVersion{}; - ASSERT_EQ(defaultVersion.getLocalSequenceNum(), 0); const auto version1 = ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, OID::gen())); ASSERT(defaultVersion != version1); @@ -105,5 +110,127 @@ TEST(ComparableChunkVersionTest, DefaultConstructedVersionIsAlwaysLess) { ASSERT_FALSE(defaultVersion > version1); } +TEST(ComparableChunkVersionTest, DefaultConstructedVersionIsAlwaysLessThanUnsharded) { + const ComparableChunkVersion defaultVersion{}; + const auto version1 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::UNSHARDED()); + ASSERT(defaultVersion != version1); + ASSERT(defaultVersion < version1); + ASSERT_FALSE(defaultVersion > version1); +} + +TEST(ComparableChunkVersionTest, DefaultConstructedVersionIsAlwaysLessThanDropped) { + const ComparableChunkVersion defaultVersion{}; + const 
auto version1 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::DROPPED()); + ASSERT(defaultVersion != version1); + ASSERT(defaultVersion < version1); + ASSERT_FALSE(defaultVersion > version1); +} + +TEST(ComparableChunkVersionTest, UnshardedAndDroppedAreEqual) { + const auto version1 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::UNSHARDED()); + const auto version2 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::DROPPED()); + const auto version3 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::UNSHARDED()); + const auto version4 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::DROPPED()); + ASSERT(version1 == version2); + ASSERT(version1 == version3); + ASSERT(version2 == version4); +} + +TEST(ComparableChunkVersionTest, NoChunksAreDifferent) { + const auto oid = OID::gen(); + const auto version1 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, oid)); + const auto version2 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, oid)); + ASSERT(version1 != version2); + ASSERT(version1 < version2); + ASSERT_FALSE(version1 > version2); +} + +TEST(ComparableChunkVersionTest, NoChunksCompareBySequenceNum) { + const auto oid = OID::gen(); + const auto version1 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(1, 0, oid)); + const auto noChunkSV1 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, oid)); + + ASSERT(version1 != noChunkSV1); + ASSERT(noChunkSV1 > version1); + + const auto noChunkSV2 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, oid)); + + ASSERT(noChunkSV1 != noChunkSV2); + ASSERT_FALSE(noChunkSV1 > noChunkSV2); + ASSERT(noChunkSV2 > noChunkSV1); + + const auto version2 = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(2, 0, oid)); + + ASSERT(version2 != noChunkSV2); + ASSERT(version2 > noChunkSV2); +} + 
+TEST(ComparableChunkVersionTest, NoChunksGreaterThanUnshardedBySequenceNum) { + const auto unsharded = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::UNSHARDED()); + const auto noChunkSV = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, OID::gen())); + + ASSERT(noChunkSV != unsharded); + ASSERT(noChunkSV > unsharded); +} + +TEST(ComparableChunkVersionTest, UnshardedGreaterThanNoChunksBySequenceNum) { + const auto noChunkSV = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, OID::gen())); + const auto unsharded = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion::UNSHARDED()); + + ASSERT(noChunkSV != unsharded); + ASSERT(unsharded > noChunkSV); +} + +TEST(ComparableChunkVersionTest, NoChunksGreaterThanDefault) { + const auto noChunkSV = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(0, 0, OID::gen())); + const ComparableChunkVersion defaultVersion{}; + + ASSERT(noChunkSV != defaultVersion); + ASSERT(noChunkSV > defaultVersion); +} + +TEST(ComparableChunkVersionTest, ForcedRefreshSequenceNumber) { + auto oid = OID::gen(); + const ComparableChunkVersion defaultVersionBeforeForce; + const auto versionBeforeForce = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(100, 0, oid)); + + const auto forcedRefreshVersion = + ComparableChunkVersion::makeComparableChunkVersionForForcedRefresh(); + + const auto versionAfterForce = + ComparableChunkVersion::makeComparableChunkVersion(ChunkVersion(100, 0, oid)); + const ComparableChunkVersion defaultVersionAfterForce; + + ASSERT(defaultVersionBeforeForce != forcedRefreshVersion); + ASSERT(defaultVersionBeforeForce < forcedRefreshVersion); + + ASSERT(versionBeforeForce != forcedRefreshVersion); + ASSERT(versionBeforeForce < forcedRefreshVersion); + + ASSERT(versionAfterForce != forcedRefreshVersion); + ASSERT(versionAfterForce > forcedRefreshVersion); + + ASSERT(defaultVersionAfterForce != forcedRefreshVersion); + 
ASSERT(defaultVersionAfterForce < forcedRefreshVersion); +} + } // namespace } // namespace mongo diff --git a/src/mongo/s/comparable_database_version_test.cpp b/src/mongo/s/comparable_database_version_test.cpp index 3b2486a5ebd..d4201d56564 100644 --- a/src/mongo/s/comparable_database_version_test.cpp +++ b/src/mongo/s/comparable_database_version_test.cpp @@ -82,9 +82,15 @@ TEST(ComparableDatabaseVersionTest, VersionLessSameUuid) { ASSERT_FALSE(version1 > version2); } +TEST(ComparableDatabaseVersionTest, DefaultConstructedVersionsAreEqual) { + const ComparableDatabaseVersion defaultVersion1{}, defaultVersion2{}; + ASSERT(defaultVersion1 == defaultVersion2); + ASSERT_FALSE(defaultVersion1 < defaultVersion2); + ASSERT_FALSE(defaultVersion1 > defaultVersion2); +} + TEST(ComparableDatabaseVersionTest, DefaultConstructedVersionIsAlwaysLess) { const ComparableDatabaseVersion defaultVersion{}; - ASSERT_EQ(defaultVersion.getLocalSequenceNum(), 0); const auto version1 = ComparableDatabaseVersion::makeComparableDatabaseVersion(DatabaseVersion(UUID::gen(), 0)); ASSERT(defaultVersion != version1); diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp index 2ad05010afb..98aec3332ec 100644 --- a/src/mongo/s/query/async_results_merger.cpp +++ b/src/mongo/s/query/async_results_merger.cpp @@ -462,8 +462,11 @@ Status AsyncResultsMerger::_askForNextBatch(WithLock, size_t remoteIndex) { cmdObj = newCmdBob.obj(); } + // Never pass API parameters with getMore. 
+ IgnoreAPIParametersBlock ignoreApiParametersBlock(_opCtx); executor::RemoteCommandRequest request( remote.getTargetHost(), remote.cursorNss.db().toString(), cmdObj, _opCtx); + ignoreApiParametersBlock.release(); auto callbackStatus = _executor->scheduleRemoteCommand(request, [this, remoteIndex](auto const& cbData) { diff --git a/src/mongo/s/query/cluster_client_cursor.h b/src/mongo/s/query/cluster_client_cursor.h index 44aae05e34d..87e3271e692 100644 --- a/src/mongo/s/query/cluster_client_cursor.h +++ b/src/mongo/s/query/cluster_client_cursor.h @@ -32,8 +32,8 @@ #include <boost/optional.hpp> #include "mongo/client/read_preference.h" +#include "mongo/db/api_parameters.h" #include "mongo/db/auth/user_name.h" -#include "mongo/db/initialize_api_parameters.h" #include "mongo/db/jsobj.h" #include "mongo/db/logical_session_id.h" #include "mongo/s/query/cluster_client_cursor_params.h" diff --git a/src/mongo/s/query/cluster_client_cursor_params.h b/src/mongo/s/query/cluster_client_cursor_params.h index d8bb0ae8da0..b0fae249884 100644 --- a/src/mongo/s/query/cluster_client_cursor_params.h +++ b/src/mongo/s/query/cluster_client_cursor_params.h @@ -36,10 +36,10 @@ #include "mongo/bson/bsonobj.h" #include "mongo/client/read_preference.h" +#include "mongo/db/api_parameters.h" #include "mongo/db/auth/privilege.h" #include "mongo/db/auth/user_name.h" #include "mongo/db/cursor_id.h" -#include "mongo/db/initialize_api_parameters.h" #include "mongo/db/namespace_string.h" #include "mongo/db/pipeline/pipeline.h" #include "mongo/db/query/cursor_response.h" diff --git a/src/mongo/s/query/cluster_find.cpp b/src/mongo/s/query/cluster_find.cpp index 3996e01c326..57925b873ed 100644 --- a/src/mongo/s/query/cluster_find.cpp +++ b/src/mongo/s/query/cluster_find.cpp @@ -504,18 +504,18 @@ CursorId ClusterFind::runQuery(OperationContext* opCtx, // Re-target and re-send the initial find command to the shards until we have established the // shard version. 
for (size_t retries = 1; retries <= kMaxRetries; ++retries) { - auto routingInfoStatus = getCollectionRoutingInfoForTxnCmd(opCtx, query.nss()); - if (routingInfoStatus == ErrorCodes::NamespaceNotFound) { + auto swCM = getCollectionRoutingInfoForTxnCmd(opCtx, query.nss()); + if (swCM == ErrorCodes::NamespaceNotFound) { // If the database doesn't exist, we successfully return an empty result set without // creating a cursor. return CursorId(0); } - auto routingInfo = uassertStatusOK(routingInfoStatus); + const auto cm = uassertStatusOK(std::move(swCM)); try { return runQueryWithoutRetrying( - opCtx, query, readPref, routingInfo, results, partialResultsReturned); + opCtx, query, readPref, cm, results, partialResultsReturned); } catch (ExceptionFor<ErrorCodes::StaleDbVersion>& ex) { if (retries >= kMaxRetries) { // Check if there are no retries remaining, so the last received error can be @@ -577,13 +577,9 @@ CursorId ClusterFind::runQuery(OperationContext* opCtx, if (ex.code() != ErrorCodes::ShardInvalidatedForTargeting) { if (auto staleInfo = ex.extraInfo<StaleConfigInfo>()) { catalogCache->invalidateShardOrEntireCollectionEntryForShardedCollection( - opCtx, - query.nss(), - staleInfo->getVersionWanted(), - staleInfo->getVersionReceived(), - staleInfo->getShardId()); + query.nss(), staleInfo->getVersionWanted(), staleInfo->getShardId()); } else { - catalogCache->onEpochChange(query.nss()); + catalogCache->invalidateCollectionEntry_LINEARIZABLE(query.nss()); } } @@ -776,6 +772,7 @@ StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* opCtx, StatusWith<ClusterQueryResult> next = Status{ErrorCodes::InternalError, "uninitialized cluster query result"}; try { + IgnoreAPIParametersBlock ignoreApiParametersBlock(opCtx); next = pinnedCursor.getValue()->next(context); } catch (const ExceptionFor<ErrorCodes::CloseChangeStream>&) { // This exception is thrown when a $changeStream stage encounters an event diff --git 
a/src/mongo/s/request_types/set_shard_version_request.h b/src/mongo/s/request_types/set_shard_version_request.h index bfd7385ffae..44cacff0415 100644 --- a/src/mongo/s/request_types/set_shard_version_request.h +++ b/src/mongo/s/request_types/set_shard_version_request.h @@ -98,6 +98,7 @@ private: SetShardVersionRequest(); bool _isAuthoritative{false}; + // TODO (SERVER-50812) remove this flag that isn't used anymore bool _forceRefresh{false}; boost::optional<NamespaceString> _nss; diff --git a/src/mongo/s/sessions_collection_sharded.cpp b/src/mongo/s/sessions_collection_sharded.cpp index 060c1158dbd..22915bd2c0a 100644 --- a/src/mongo/s/sessions_collection_sharded.cpp +++ b/src/mongo/s/sessions_collection_sharded.cpp @@ -123,8 +123,6 @@ void SessionsCollectionSharded::checkSessionsCollectionExists(OperationContext* const auto cm = uassertStatusOK( Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh( opCtx, NamespaceString::kLogicalSessionsNamespace)); - - uassert(ErrorCodes::NamespaceNotFound, "config.system.sessions does not exist", cm.isSharded()); } void SessionsCollectionSharded::refreshSessions(OperationContext* opCtx, diff --git a/src/mongo/s/sharding_test_fixture_common.cpp b/src/mongo/s/sharding_test_fixture_common.cpp index 95dd505687b..2ac936d3977 100644 --- a/src/mongo/s/sharding_test_fixture_common.cpp +++ b/src/mongo/s/sharding_test_fixture_common.cpp @@ -47,9 +47,11 @@ ShardingTestFixtureCommon::ShardingTestFixtureCommon() { ShardingTestFixtureCommon::~ShardingTestFixtureCommon() = default; -std::shared_ptr<RoutingTableHistory> ShardingTestFixtureCommon::makeStandaloneRoutingTableHistory( +RoutingTableHistoryValueHandle ShardingTestFixtureCommon::makeStandaloneRoutingTableHistory( RoutingTableHistory rt) { - return std::make_shared<RoutingTableHistory>(std::move(rt)); + const auto version = rt.getVersion(); + return RoutingTableHistoryValueHandle( + std::move(rt), ComparableChunkVersion::makeComparableChunkVersion(version)); } 
void ShardingTestFixtureCommon::onCommand(NetworkTestEnv::OnCommandFunction func) { diff --git a/src/mongo/s/sharding_test_fixture_common.h b/src/mongo/s/sharding_test_fixture_common.h index 0ecbbb30695..52377d7fbc5 100644 --- a/src/mongo/s/sharding_test_fixture_common.h +++ b/src/mongo/s/sharding_test_fixture_common.h @@ -55,8 +55,7 @@ public: * which can be used to pass to ChunkManager for tests, which specifically target the behaviour * of the ChunkManager. */ - static std::shared_ptr<RoutingTableHistory> makeStandaloneRoutingTableHistory( - RoutingTableHistory rt); + static RoutingTableHistoryValueHandle makeStandaloneRoutingTableHistory(RoutingTableHistory rt); protected: ShardingTestFixtureCommon(); diff --git a/src/mongo/s/transaction_router.cpp b/src/mongo/s/transaction_router.cpp index b7b26698e78..c269d734365 100644 --- a/src/mongo/s/transaction_router.cpp +++ b/src/mongo/s/transaction_router.cpp @@ -125,7 +125,6 @@ BSONObj appendReadConcernForTxn(BSONObj cmd, } BSONObjBuilder appendFieldsForStartTransaction(BSONObj cmd, - APIParameters apiParameters, repl::ReadConcernArgs readConcernArgs, boost::optional<LogicalTime> atClusterTime, bool doAppendStartTransaction) { @@ -134,8 +133,6 @@ BSONObjBuilder appendFieldsForStartTransaction(BSONObj cmd, appendReadConcernForTxn(std::move(cmd), readConcernArgs, atClusterTime); BSONObjBuilder bob(std::move(cmdWithReadConcern)); - - apiParameters.appendInfo(&bob); if (doAppendStartTransaction) { bob.append(OperationSessionInfoFromClient::kStartTransactionFieldName, true); } @@ -433,7 +430,6 @@ BSONObj TransactionRouter::Participant::attachTxnFieldsIfNeeded( BSONObjBuilder newCmd = mustStartTransaction ? 
appendFieldsForStartTransaction(std::move(cmd), - sharedOptions.apiParameters, sharedOptions.readConcernArgs, sharedOptions.atClusterTime, !hasStartTxn) @@ -1203,6 +1199,8 @@ BSONObj TransactionRouter::Router::abortTransaction(OperationContext* opCtx) { "txnNumber"_attr = o().txnNumber, "numParticipantShards"_attr = o().participants.size()); + // Omit API parameters from abortTransaction. + IgnoreAPIParametersBlock ignoreApiParametersBlock(opCtx); const auto responses = gatherResponses(opCtx, NamespaceString::kAdminDb, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, diff --git a/src/mongo/s/transaction_router.h b/src/mongo/s/transaction_router.h index 25ce17831fe..3d6be675077 100644 --- a/src/mongo/s/transaction_router.h +++ b/src/mongo/s/transaction_router.h @@ -31,8 +31,8 @@ #include <boost/optional.hpp> +#include "mongo/db/api_parameters.h" #include "mongo/db/commands/txn_cmds_gen.h" -#include "mongo/db/initialize_api_parameters.h" #include "mongo/db/logical_session_id.h" #include "mongo/db/operation_context.h" #include "mongo/db/repl/read_concern_args.h" diff --git a/src/mongo/s/transaction_router_test.cpp b/src/mongo/s/transaction_router_test.cpp index eb827201e84..a507d3e4f3f 100644 --- a/src/mongo/s/transaction_router_test.cpp +++ b/src/mongo/s/transaction_router_test.cpp @@ -316,16 +316,9 @@ TEST_F(TransactionRouterTestWithDefaultSession, CannotContiueTxnWithoutStarting) ErrorCodes::NoSuchTransaction); } -TEST_F(TransactionRouterTestWithDefaultSession, - NewParticipantMustAttachTxnAndReadConcernAndAPIParams) { +TEST_F(TransactionRouterTestWithDefaultSession, NewParticipantMustAttachTxnAndReadConcern) { TxnNumber txnNum{3}; - APIParameters apiParameters = APIParameters(); - apiParameters.setAPIVersion("1"); - apiParameters.setAPIStrict(false); - apiParameters.setAPIDeprecationErrors(false); - APIParameters::get(operationContext()) = apiParameters; - auto txnRouter = TransactionRouter::get(operationContext()); txnRouter.beginOrContinueTxn( 
operationContext(), txnNum, TransactionRouter::TransactionActions::kStart); @@ -337,9 +330,6 @@ TEST_F(TransactionRouterTestWithDefaultSession, << BSON("level" << "snapshot" << "atClusterTime" << kInMemoryLogicalTime.asTimestamp()) - << "apiVersion" - << "1" - << "apiStrict" << false << "apiDeprecationErrors" << false << "startTransaction" << true << "coordinator" << true << "autocommit" << false << "txnNumber" << txnNum); @@ -369,9 +359,6 @@ TEST_F(TransactionRouterTestWithDefaultSession, << BSON("level" << "snapshot" << "atClusterTime" << kInMemoryLogicalTime.asTimestamp()) - << "apiVersion" - << "1" - << "apiStrict" << false << "apiDeprecationErrors" << false << "startTransaction" << true << "autocommit" << false << "txnNumber" << txnNum); @@ -735,40 +722,6 @@ TEST_F(TransactionRouterTestWithDefaultSession, AttachTxnValidatesReadConcernIfA } } -TEST_F(TransactionRouterTestWithDefaultSession, AttachTxnAttachesAPIParameters) { - APIParameters apiParams = APIParameters(); - apiParams.setAPIVersion("2"); - apiParams.setAPIStrict(true); - apiParams.setAPIDeprecationErrors(true); - - APIParameters::get(operationContext()) = apiParams; - - TxnNumber txnNum{3}; - auto txnRouter = TransactionRouter::get(operationContext()); - txnRouter.beginOrContinueTxn( - operationContext(), txnNum, TransactionRouter::TransactionActions::kStart); - txnRouter.setDefaultAtClusterTime(operationContext()); - - { - auto newCmd = txnRouter.attachTxnFieldsIfNeeded(operationContext(), - shard1, - BSON("insert" - << "test")); - ASSERT_BSONOBJ_EQ(BSON("insert" - << "test" - << "readConcern" - << BSON("level" - << "snapshot" - << "atClusterTime" << kInMemoryLogicalTime.asTimestamp()) - << "apiVersion" - << "2" - << "apiStrict" << true << "apiDeprecationErrors" << true - << "startTransaction" << true << "coordinator" << true - << "autocommit" << false << "txnNumber" << txnNum), - newCmd); - } -} - TEST_F(TransactionRouterTestWithDefaultSession, CannotSpecifyAPIParametersAfterFirstStatement) { 
APIParameters apiParameters = APIParameters(); apiParameters.setAPIVersion("1"); @@ -787,40 +740,6 @@ TEST_F(TransactionRouterTestWithDefaultSession, CannotSpecifyAPIParametersAfterF 4937701); } -TEST_F(TransactionRouterTestWithDefaultSession, PassesThroughAPIParametersToParticipants) { - APIParameters apiParams = APIParameters(); - apiParams.setAPIVersion("2"); - apiParams.setAPIStrict(true); - apiParams.setAPIDeprecationErrors(true); - - APIParameters::get(operationContext()) = apiParams; - - TxnNumber txnNum{3}; - - auto txnRouter = TransactionRouter::get(operationContext()); - txnRouter.beginOrContinueTxn( - operationContext(), txnNum, TransactionRouter::TransactionActions::kStart); - txnRouter.setDefaultAtClusterTime(operationContext()); - - BSONObj expectedNewObj = BSON("insert" - << "test" - << "readConcern" - << BSON("level" - << "snapshot" - << "atClusterTime" << kInMemoryLogicalTime.asTimestamp()) - << "apiVersion" - << "2" - << "apiStrict" << true << "apiDeprecationErrors" << true - << "startTransaction" << true << "coordinator" << true - << "autocommit" << false << "txnNumber" << txnNum); - - auto newCmd = txnRouter.attachTxnFieldsIfNeeded(operationContext(), - shard1, - BSON("insert" - << "test")); - ASSERT_BSONOBJ_EQ(expectedNewObj, newCmd); -} - TEST_F(TransactionRouterTestWithDefaultSession, CannotSpecifyReadConcernAfterFirstStatement) { TxnNumber txnNum{3}; @@ -3294,6 +3213,43 @@ TEST_F(TransactionRouterMetricsTest, LogsTransactionsOverSlowMSThreshold) { assertPrintedExactlyOneSlowLogLine(); } +TEST_F(TransactionRouterMetricsTest, LogsTransactionsWithAPIParameters) { + const auto originalSlowMS = serverGlobalParams.slowMS; + const auto originalSampleRate = serverGlobalParams.sampleRate; + + serverGlobalParams.slowMS = 100; + serverGlobalParams.sampleRate = 1; + + // Reset the global parameters to their original values after this test exits. 
+ ON_BLOCK_EXIT([originalSlowMS, originalSampleRate] { + serverGlobalParams.slowMS = originalSlowMS; + serverGlobalParams.sampleRate = originalSampleRate; + }); + + APIParameters::get(operationContext()).setAPIVersion("1"); + APIParameters::get(operationContext()).setAPIStrict(true); + APIParameters::get(operationContext()).setAPIDeprecationErrors(false); + beginTxnWithDefaultTxnNumber(); + tickSource()->advance(Milliseconds(101)); + runCommit(kDummyOkRes); + assertPrintedExactlyOneSlowLogLine(); + + int nFound = 0; + for (auto&& bson : getCapturedBSONFormatLogMessages()) { + if (bson["id"].Int() != 51805) { + continue; + } + + auto parameters = bson["attr"]["parameters"]; + ASSERT_EQUALS(parameters["apiVersion"].String(), "1"); + ASSERT_EQUALS(parameters["apiStrict"].Bool(), true); + ASSERT_EQUALS(parameters["apiDeprecationErrors"].Bool(), false); + ++nFound; + } + + ASSERT_EQUALS(nFound, 1); +} + TEST_F(TransactionRouterMetricsTest, DoesNotLogTransactionsWithSampleRateZero) { const auto originalSlowMS = serverGlobalParams.slowMS; const auto originalSampleRate = serverGlobalParams.sampleRate; diff --git a/src/mongo/s/write_ops/chunk_manager_targeter.cpp b/src/mongo/s/write_ops/chunk_manager_targeter.cpp index f7189efdfe9..6794dabc3ca 100644 --- a/src/mongo/s/write_ops/chunk_manager_targeter.cpp +++ b/src/mongo/s/write_ops/chunk_manager_targeter.cpp @@ -791,7 +791,7 @@ int ChunkManagerTargeter::getNShardsOwningChunks() const { void ChunkManagerTargeter::_refreshShardVersionNow(OperationContext* opCtx) { uassertStatusOK( - Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(opCtx, _nss, true)); + Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(opCtx, _nss)); _init(opCtx); } diff --git a/src/mongo/scripting/engine.cpp b/src/mongo/scripting/engine.cpp index d941e9834af..42ccc5f7154 100644 --- a/src/mongo/scripting/engine.cpp +++ b/src/mongo/scripting/engine.cpp @@ -249,6 +249,7 @@ void Scope::loadStored(OperationContext* opCtx, 
bool ignoreNotConnected) { v.type() != BSONType::CodeWScope); if (MONGO_unlikely(mr_killop_test_fp.shouldFail())) { + LOGV2(5062200, "Pausing mr_killop_test_fp for system.js entry", "entryName"_attr = n); /* This thread sleep makes the interrupts in the test come in at a time * where the js misses the interrupt and throw an exception instead of diff --git a/src/mongo/shell/collection.js b/src/mongo/shell/collection.js index 20522653bc2..bf5aa3a2653 100644 --- a/src/mongo/shell/collection.js +++ b/src/mongo/shell/collection.js @@ -1322,9 +1322,7 @@ DBCollection.prototype.getSlaveOk = function() { return this.getSecondaryOk(); }; -DBCollection.prototype.setSecondaryOk = function(value) { - if (value === undefined) - value = true; +DBCollection.prototype.setSecondaryOk = function(value = true) { this._secondaryOk = value; }; diff --git a/src/mongo/shell/db.js b/src/mongo/shell/db.js index 73fdb9c25e2..16c109e9cb4 100644 --- a/src/mongo/shell/db.js +++ b/src/mongo/shell/db.js @@ -1091,28 +1091,17 @@ DB.prototype.printSecondaryReplicationInfo = function() { return null; } - function g(x) { - assert(x, "how could this be null (printSecondaryReplicationInfo gx)"); - print("source: " + x.host); - if (x.syncedTo) { - var st = new Date(DB.tsToSeconds(x.syncedTo) * 1000); - getReplLag(st); - } else { - print("\tdoing initial sync"); - } - } - - function r(x) { - assert(x, "how could this be null (printSecondaryReplicationInfo rx)"); - if (x.state == 1 || x.state == 7) { // ignore primaries (1) and arbiters (7) + function printNodeReplicationInfo(node) { + assert(node); + if (node.state === 1 || node.state === 7) { // ignore primaries (1) and arbiters (7) return; } - print("source: " + x.name); - if (x.optime) { - getReplLag(x.optimeDate); + print("source: " + node.name); + if (node.optime && node.health != 0) { + getReplLag(node.optimeDate); } else { - print("\tno replication info, yet. State: " + x.stateStr); + print("\tno replication info, yet. 
State: " + node.stateStr); } } @@ -1136,7 +1125,7 @@ DB.prototype.printSecondaryReplicationInfo = function() { } for (i in status.members) { - r(status.members[i]); + printNodeReplicationInfo(status.members[i]); } } }; @@ -1255,7 +1244,7 @@ DB.autocomplete = function(obj) { return ret; }; -DB.prototype.setSlaveOk = function(value) { +DB.prototype.setSlaveOk = function(value = true) { print( "WARNING: setSlaveOk() is deprecated and may be removed in the next major release. Please use setSecondaryOk() instead."); this.setSecondaryOk(value); @@ -1267,9 +1256,7 @@ DB.prototype.getSlaveOk = function() { return this.getSecondaryOk(); }; -DB.prototype.setSecondaryOk = function(value) { - if (value == undefined) - value = true; +DB.prototype.setSecondaryOk = function(value = true) { this._secondaryOk = value; }; diff --git a/src/mongo/shell/mongo.js b/src/mongo/shell/mongo.js index 23a2cf775f1..5beecf5646d 100644 --- a/src/mongo/shell/mongo.js +++ b/src/mongo/shell/mongo.js @@ -39,9 +39,7 @@ Mongo.prototype.getSlaveOk = function() { return this.getSecondaryOk(); }; -Mongo.prototype.setSecondaryOk = function(value) { - if (value == undefined) - value = true; +Mongo.prototype.setSecondaryOk = function(value = true) { this.secondaryOk = value; }; diff --git a/src/mongo/transport/service_executor_fixed.cpp b/src/mongo/transport/service_executor_fixed.cpp index bdf75660dce..f48a9d7a170 100644 --- a/src/mongo/transport/service_executor_fixed.cpp +++ b/src/mongo/transport/service_executor_fixed.cpp @@ -64,7 +64,9 @@ ServiceExecutorFixed::ServiceExecutorFixed(ThreadPool::Options options) _options.onCreateThread = [this, onCreate = std::move(_options.onCreateThread)](const std::string& name) mutable { _executorContext = std::make_unique<ExecutorThreadContext>(this->weak_from_this()); - onCreate(name); + if (onCreate) { + onCreate(name); + } }; _threadPool = std::make_unique<ThreadPool>(_options); } diff --git a/src/mongo/util/concurrency/thread_pool.cpp 
b/src/mongo/util/concurrency/thread_pool.cpp index 680d397946f..0e8eda183b4 100644 --- a/src/mongo/util/concurrency/thread_pool.cpp +++ b/src/mongo/util/concurrency/thread_pool.cpp @@ -33,23 +33,37 @@ #include "mongo/util/concurrency/thread_pool.h" +#include <deque> +#include <fmt/format.h> +#include <list> +#include <sstream> +#include <vector> + #include "mongo/base/status.h" #include "mongo/logv2/log.h" #include "mongo/platform/atomic_word.h" +#include "mongo/platform/mutex.h" +#include "mongo/stdx/condition_variable.h" #include "mongo/util/assert_util.h" #include "mongo/util/concurrency/idle_thread_block.h" #include "mongo/util/concurrency/thread_name.h" -#include "mongo/util/str.h" - -#include <sstream> +#include "mongo/util/hierarchical_acquisition.h" namespace mongo { namespace { +using namespace fmt::literals; + // Counter used to assign unique names to otherwise-unnamed thread pools. AtomicWord<int> nextUnnamedThreadPoolId{1}; +std::string threadIdToString(stdx::thread::id id) { + std::ostringstream oss; + oss << id; + return oss.str(); +} + /** * Sets defaults and checks bounds limits on "options", and returns it. 
* @@ -57,10 +71,10 @@ AtomicWord<int> nextUnnamedThreadPoolId{1}; */ ThreadPool::Options cleanUpOptions(ThreadPool::Options&& options) { if (options.poolName.empty()) { - options.poolName = str::stream() << "ThreadPool" << nextUnnamedThreadPoolId.fetchAndAdd(1); + options.poolName = "ThreadPool{}"_format(nextUnnamedThreadPoolId.fetchAndAdd(1)); } if (options.threadNamePrefix.empty()) { - options.threadNamePrefix = str::stream() << options.poolName << '-'; + options.threadNamePrefix = "{}-"_format(options.poolName); } if (options.maxThreads < 1) { LOGV2_FATAL(28702, @@ -85,28 +99,144 @@ ThreadPool::Options cleanUpOptions(ThreadPool::Options&& options) { } // namespace -ThreadPool::Options::Options(const ThreadPool::Limits& limits) - : minThreads(limits.minThreads), - maxThreads(limits.maxThreads), - maxIdleThreadAge(limits.maxIdleThreadAge) {} -ThreadPool::ThreadPool(Options options) : _options(cleanUpOptions(std::move(options))) {} +// Public functions forwarded from ThreadPool. +class ThreadPool::Impl { +public: + explicit Impl(Options options); + ~Impl(); + void startup(); + void shutdown(); + void join(); + void schedule(Task task); + void waitForIdle(); + Stats getStats() const; + +private: + /** + * Representation of the stage of life of a thread pool. + * + * A pool starts out in the preStart state, and ends life in the shutdownComplete state. Work + * may only be scheduled in the preStart and running states. Threads may only be started in the + * running state. In shutdownComplete, there are no remaining threads or pending tasks to + * execute. + * + * Diagram of legal transitions: + * + * preStart -> running -> joinRequired -> joining -> shutdownComplete + * \ ^ + * \_____________/ + */ + enum LifecycleState { preStart, running, joinRequired, joining, shutdownComplete }; + + /** The thread body for worker threads. 
*/ + void _workerThreadBody(const std::string& threadName) noexcept; + + /** + * Starts a worker thread, unless _options.maxThreads threads are already running or + * _state is not running. + */ + void _startWorkerThread_inlock(); + + /** + * This is the run loop of a worker thread, invoked by _workerThreadBody. + */ + void _consumeTasks(); + + /** + * Implementation of shutdown once _mutex is locked. + */ + void _shutdown_inlock(); + + /** + * Implementation of join once _mutex is owned by "lk". + */ + void _join_inlock(stdx::unique_lock<Latch>* lk); + + /** + * Runs the remaining tasks on a new thread as part of the join process, blocking until + * complete. Caller must not hold the mutex! + */ + void _drainPendingTasks(); + + /** + * Executes one task from _pendingTasks. "lk" must own _mutex, and _pendingTasks must have at + * least one entry. + */ + void _doOneTask(stdx::unique_lock<Latch>* lk) noexcept; + + /** + * Changes the lifecycle state (_state) of the pool and wakes up any threads waiting for a state + * change. Has no effect if _state == newState. + */ + void _setState_inlock(LifecycleState newState); + + /** + * Waits for all remaining retired threads to join. + * If a thread's _workerThreadBody() were ever to attempt to reacquire + * ThreadPool::_mutex after that thread had been added to _retiredThreads, + * it could cause a deadlock. + */ + void _joinRetired_inlock(); + + // These are the options with which the pool was configured at construction time. + const Options _options; + + // Mutex guarding all non-const member variables. + mutable Mutex _mutex = MONGO_MAKE_LATCH(HierarchicalAcquisitionLevel(0), "ThreadPool::_mutex"); + + // This variable represents the lifecycle state of the pool. + // + // Work may only be scheduled in states preStart and running, and only executes in states + // running and shuttingDown. 
+ LifecycleState _state = preStart; + + // Condition signaled to indicate that there is work in the _pendingTasks queue, or + // that the system is shutting down. + stdx::condition_variable _workAvailable; + + // Condition signaled to indicate that there is no work in the _pendingTasks queue. + stdx::condition_variable _poolIsIdle; + + // Condition variable signaled whenever _state changes. + stdx::condition_variable _stateChange; + + // Queue of yet-to-be-executed tasks. + std::deque<Task> _pendingTasks; + + // List of threads serving as the worker pool. + std::list<stdx::thread> _threads; + + // List of threads that are retired and pending join + std::list<stdx::thread> _retiredThreads; + + // Count of idle threads. + size_t _numIdleThreads = 0; + + // Id counter for assigning thread names + size_t _nextThreadId = 0; + + // The last time that _pendingTasks.size() grew to be at least _threads.size(). + Date_t _lastFullUtilizationDate; +}; + +ThreadPool::Impl::Impl(Options options) : _options(cleanUpOptions(std::move(options))) {} -ThreadPool::~ThreadPool() { +ThreadPool::Impl::~Impl() { stdx::unique_lock<Latch> lk(_mutex); _shutdown_inlock(); - if (shutdownComplete != _state) { + if (_state != shutdownComplete) { _join_inlock(&lk); } - if (shutdownComplete != _state) { + if (_state != shutdownComplete) { LOGV2_FATAL(28704, "Failed to shutdown pool during destruction"); } invariant(_threads.empty()); invariant(_pendingTasks.empty()); } -void ThreadPool::startup() { +void ThreadPool::Impl::startup() { stdx::lock_guard<Latch> lk(_mutex); if (_state != preStart) { LOGV2_FATAL(28698, @@ -116,19 +246,18 @@ void ThreadPool::startup() { } _setState_inlock(running); invariant(_threads.empty()); - const size_t numToStart = - std::min(_options.maxThreads, std::max(_options.minThreads, _pendingTasks.size())); + size_t numToStart = std::clamp(_pendingTasks.size(), _options.minThreads, _options.maxThreads); for (size_t i = 0; i < numToStart; ++i) { _startWorkerThread_inlock(); 
} } -void ThreadPool::shutdown() { +void ThreadPool::Impl::shutdown() { stdx::lock_guard<Latch> lk(_mutex); _shutdown_inlock(); } -void ThreadPool::_shutdown_inlock() { +void ThreadPool::Impl::_shutdown_inlock() { switch (_state) { case preStart: case running: @@ -143,38 +272,30 @@ void ThreadPool::_shutdown_inlock() { MONGO_UNREACHABLE; } -void ThreadPool::join() { +void ThreadPool::Impl::join() { stdx::unique_lock<Latch> lk(_mutex); _join_inlock(&lk); } -void ThreadPool::_joinRetired_inlock() { +void ThreadPool::Impl::_joinRetired_inlock() { while (!_retiredThreads.empty()) { auto& t = _retiredThreads.front(); t.join(); - _options.onJoinRetiredThread(t); + if (_options.onJoinRetiredThread) + _options.onJoinRetiredThread(t); _retiredThreads.pop_front(); } } -void ThreadPool::_join_inlock(stdx::unique_lock<Latch>* lk) { - _stateChange.wait(*lk, [this] { - switch (_state) { - case preStart: - return false; - case running: - return false; - case joinRequired: - return true; - case joining: - case shutdownComplete: - LOGV2_FATAL(28700, - "Attempted to join pool {poolName} more than once", - "Attempted to join pool more than once", - "poolName"_attr = _options.poolName); - } - MONGO_UNREACHABLE; - }); +void ThreadPool::Impl::_join_inlock(stdx::unique_lock<Latch>* lk) { + _stateChange.wait(*lk, [this] { return _state != preStart && _state != running; }); + if (_state != joinRequired) { + LOGV2_FATAL(28700, + "Attempted to join pool {poolName} more than once", + "Attempted to join pool more than once", + "poolName"_attr = _options.poolName); + } + _setState_inlock(joining); ++_numIdleThreads; if (!_pendingTasks.empty()) { @@ -184,8 +305,7 @@ void ThreadPool::_join_inlock(stdx::unique_lock<Latch>* lk) { } --_numIdleThreads; _joinRetired_inlock(); - ThreadList threadsToJoin; - swap(threadsToJoin, _threads); + auto threadsToJoin = std::exchange(_threads, {}); lk->unlock(); for (auto& t : threadsToJoin) { t.join(); @@ -195,14 +315,14 @@ void 
ThreadPool::_join_inlock(stdx::unique_lock<Latch>* lk) { _setState_inlock(shutdownComplete); } -void ThreadPool::_drainPendingTasks() { +void ThreadPool::Impl::_drainPendingTasks() { // Tasks cannot be run inline because they can create OperationContexts and the join() caller // may already have one associated with the thread. stdx::thread cleanThread = stdx::thread([&] { - const std::string threadName = str::stream() - << _options.threadNamePrefix << _nextThreadId++; + const std::string threadName = "{}{}"_format(_options.threadNamePrefix, _nextThreadId++); setThreadName(threadName); - _options.onCreateThread(threadName); + if (_options.onCreateThread) + _options.onCreateThread(threadName); stdx::unique_lock<Latch> lock(_mutex); while (!_pendingTasks.empty()) { _doOneTask(&lock); @@ -211,16 +331,16 @@ void ThreadPool::_drainPendingTasks() { cleanThread.join(); } -void ThreadPool::schedule(Task task) { +void ThreadPool::Impl::schedule(Task task) { stdx::unique_lock<Latch> lk(_mutex); switch (_state) { case joinRequired: case joining: case shutdownComplete: { - auto status = Status(ErrorCodes::ShutdownInProgress, - str::stream() << "Shutdown of thread pool " << _options.poolName - << " in progress"); + auto status = + Status(ErrorCodes::ShutdownInProgress, + "Shutdown of thread pool {} in progress"_format(_options.poolName)); lk.unlock(); task(status); @@ -246,15 +366,14 @@ void ThreadPool::schedule(Task task) { _workAvailable.notify_one(); } -void ThreadPool::waitForIdle() { +void ThreadPool::Impl::waitForIdle() { stdx::unique_lock<Latch> lk(_mutex); - // If there are any pending tasks, or non-idle threads, the pool is not idle. - while (!_pendingTasks.empty() || _numIdleThreads < _threads.size()) { - _poolIsIdle.wait(lk); - } + // True when there are no `_pendingTasks` and all `_threads` are idle. 
+ auto isIdle = [this] { return _pendingTasks.empty() && _numIdleThreads >= _threads.size(); }; + _poolIsIdle.wait(lk, isIdle); } -ThreadPool::Stats ThreadPool::getStats() const { +ThreadPool::Stats ThreadPool::Impl::getStats() const { stdx::lock_guard<Latch> lk(_mutex); Stats result; result.options = _options; @@ -265,95 +384,91 @@ ThreadPool::Stats ThreadPool::getStats() const { return result; } -void ThreadPool::_workerThreadBody(ThreadPool* pool, const std::string& threadName) noexcept { +void ThreadPool::Impl::_workerThreadBody(const std::string& threadName) noexcept { setThreadName(threadName); - pool->_options.onCreateThread(threadName); - const auto poolName = pool->_options.poolName; + if (_options.onCreateThread) + _options.onCreateThread(threadName); LOGV2_DEBUG(23104, 1, "Starting thread {threadName} in pool {poolName}", "Starting thread", "threadName"_attr = threadName, - "poolName"_attr = poolName); - pool->_consumeTasks(); - - // At this point, another thread may have destroyed "pool", if this thread chose to detach - // itself and remove itself from pool->_threads before releasing pool->_mutex. Do not access - // member variables of "pool" from here, on. - // - // This can happen if this thread decided to retire, got descheduled after removing itself - // from _threads and calling detach(), and then the pool was deleted. When this thread resumes, - // it is no longer safe to access "pool". 
+ "poolName"_attr = _options.poolName); + _consumeTasks(); LOGV2_DEBUG(23105, 1, "Shutting down thread {threadName} in pool {poolName}", "Shutting down thread", "threadName"_attr = threadName, - "poolName"_attr = poolName); + "poolName"_attr = _options.poolName); } -void ThreadPool::_consumeTasks() { +void ThreadPool::Impl::_consumeTasks() { stdx::unique_lock<Latch> lk(_mutex); while (_state == running) { - if (_pendingTasks.empty()) { - /** - * Help with garbage collecting retired threads to: - * * Reduce the memory overhead of _retiredThreads - * * Expedite the shutdown process - */ - _joinRetired_inlock(); - - if (_threads.size() > _options.minThreads) { - // Since there are more than minThreads threads, this thread may be eligible for - // retirement. If it isn't now, it may be later, so it must put a time limit on how - // long it waits on _workAvailable. - const auto now = Date_t::now(); - const auto nextThreadRetirementDate = - _lastFullUtilizationDate + _options.maxIdleThreadAge; - if (now >= nextThreadRetirementDate) { - _lastFullUtilizationDate = now; - LOGV2_DEBUG(23106, - 1, - "Reaping this thread; next thread reaped no earlier than " - "{nextThreadRetirementDate}", - "Reaping this thread", - "nextThreadRetirementDate"_attr = - _lastFullUtilizationDate + _options.maxIdleThreadAge); - break; - } - - LOGV2_DEBUG(23107, - 3, - "Not reaping this thread because the earliest retirement date is " + if (!_pendingTasks.empty()) { + _doOneTask(&lk); + continue; + } + + // Help with garbage collecting retired threads to reduce the + // memory overhead of _retiredThreads and expedite the shutdown + // process. + _joinRetired_inlock(); + + boost::optional<Date_t> waitDeadline; + + if (_threads.size() > _options.minThreads) { + // Since there are more than minThreads threads, this thread may be eligible for + // retirement. If it isn't now, it may be later, so it must put a time limit on how + // long it waits on _workAvailable. 
+ const auto now = Date_t::now(); + const auto nextRetirement = _lastFullUtilizationDate + _options.maxIdleThreadAge; + if (now >= nextRetirement) { + _lastFullUtilizationDate = now; + LOGV2_DEBUG(23106, + 1, + "Reaping this thread; next thread reaped no earlier than " "{nextThreadRetirementDate}", - "Not reaping this thread", - "nextThreadRetirementDate"_attr = nextThreadRetirementDate); - MONGO_IDLE_THREAD_BLOCK; - _workAvailable.wait_until(lk, nextThreadRetirementDate.toSystemTimePoint()); - } else { - // Since the number of threads is not more than minThreads, this thread is not - // eligible for retirement. It is OK to sleep until _workAvailable is signaled, - // because any new threads that put the number of total threads above minThreads - // would be eligible for retirement once they had no work left to do. - LOGV2_DEBUG(23108, - 3, - "Waiting for work; the thread pool size is {numThreads}; the minimum " - "number of threads is {minThreads}", - "Waiting for work", - "numThreads"_attr = _threads.size(), - "minThreads"_attr = _options.minThreads); - MONGO_IDLE_THREAD_BLOCK; - _workAvailable.wait(lk); + "Reaping this thread", + "nextThreadRetirementDate"_attr = + _lastFullUtilizationDate + _options.maxIdleThreadAge); + break; } - continue; + + LOGV2_DEBUG(23107, + 3, + "Not reaping this thread because the earliest retirement date is " + "{nextThreadRetirementDate}", + "Not reaping this thread", + "nextThreadRetirementDate"_attr = nextRetirement); + waitDeadline = nextRetirement; + } else { + // Since the number of threads is not more than minThreads, this thread is not + // eligible for retirement. It is OK to sleep until _workAvailable is signaled, + // because any new threads that put the number of total threads above minThreads + // would be eligible for retirement once they had no work left to do. 
+ LOGV2_DEBUG(23108, + 3, + "Waiting for work; the thread pool size is {numThreads}; the minimum " + "number of threads is {minThreads}", + "Waiting for work", + "numThreads"_attr = _threads.size(), + "minThreads"_attr = _options.minThreads); } - _doOneTask(&lk); + auto wake = [&] { return _state != running || !_pendingTasks.empty(); }; + MONGO_IDLE_THREAD_BLOCK; + if (waitDeadline) { + _workAvailable.wait_until(lk, waitDeadline->toSystemTimePoint(), wake); + } else { + _workAvailable.wait(lk, wake); + } } // We still hold the lock, but this thread is retiring. If the whole pool is shutting down, this // thread lends a hand in draining the work pool and returns so it can be joined. Otherwise, it - // falls through to the detach code, below. + // falls through to the thread retirement code, below. if (_state == joinRequired || _state == joining) { // Drain the leftover pending tasks. @@ -375,29 +490,22 @@ void ThreadPool::_consumeTasks() { "expectedState"_attr = static_cast<int32_t>(running)); } - // This thread is ending because it was idle for too long. Find self in _threads, remove self - // from _threads, and add self to the list of retired threads. - for (size_t i = 0; i < _threads.size(); ++i) { - auto& t = _threads[i]; - if (t.get_id() != stdx::this_thread::get_id()) { - continue; - } - std::swap(t, _threads.back()); - _retiredThreads.push_back(std::move(_threads.back())); - _threads.pop_back(); - return; + // This thread is ending because it was idle for too long. + // Move self from _threads to _retiredThreads. 
+ auto selfId = stdx::this_thread::get_id(); + auto pos = std::find_if( + _threads.begin(), _threads.end(), [&](auto&& t) { return t.get_id() == selfId; }); + if (pos == _threads.end()) { + LOGV2_FATAL_NOTRACE(28703, + "Could not find thread with id {threadId} in pool {poolName}", + "Could not find thread", + "threadId"_attr = threadIdToString(selfId), + "poolName"_attr = _options.poolName); } - - std::ostringstream threadId; - threadId << stdx::this_thread::get_id(); - LOGV2_FATAL_NOTRACE(28703, - "Could not find thread with id {threadId} in pool {poolName}", - "Could not find thread", - "threadId"_attr = threadId.str(), - "poolName"_attr = _options.poolName); + _retiredThreads.splice(_retiredThreads.end(), _threads, pos); } -void ThreadPool::_doOneTask(stdx::unique_lock<Latch>* lk) noexcept { +void ThreadPool::Impl::_doOneTask(stdx::unique_lock<Latch>* lk) noexcept { invariant(!_pendingTasks.empty()); LOGV2_DEBUG(23109, 3, @@ -416,7 +524,7 @@ void ThreadPool::_doOneTask(stdx::unique_lock<Latch>* lk) noexcept { } } -void ThreadPool::_startWorkerThread_inlock() { +void ThreadPool::Impl::_startWorkerThread_inlock() { switch (_state) { case preStart: LOGV2_DEBUG( @@ -452,9 +560,9 @@ void ThreadPool::_startWorkerThread_inlock() { return; } invariant(_threads.size() < _options.maxThreads); - const std::string threadName = str::stream() << _options.threadNamePrefix << _nextThreadId++; + std::string threadName = "{}{}"_format(_options.threadNamePrefix, _nextThreadId++); try { - _threads.emplace_back([this, threadName] { _workerThreadBody(this, threadName); }); + _threads.emplace_back([this, threadName] { _workerThreadBody(threadName); }); ++_numIdleThreads; } catch (const std::exception& ex) { LOGV2_ERROR(23113, @@ -468,7 +576,7 @@ void ThreadPool::_startWorkerThread_inlock() { } } -void ThreadPool::_setState_inlock(const LifecycleState newState) { +void ThreadPool::Impl::_setState_inlock(const LifecycleState newState) { if (newState == _state) { return; } @@ -476,4 
+584,35 @@ void ThreadPool::_setState_inlock(const LifecycleState newState) { _stateChange.notify_all(); } +// ======================================== +// ThreadPool public functions that simply forward to the `_impl`. + +ThreadPool::ThreadPool(Options options) : _impl{std::make_unique<Impl>(std::move(options))} {} + +ThreadPool::~ThreadPool() = default; + +void ThreadPool::startup() { + _impl->startup(); +} + +void ThreadPool::shutdown() { + _impl->shutdown(); +} + +void ThreadPool::join() { + _impl->join(); +} + +void ThreadPool::schedule(Task task) { + _impl->schedule(std::move(task)); +} + +void ThreadPool::waitForIdle() { + _impl->waitForIdle(); +} + +ThreadPool::Stats ThreadPool::getStats() const { + return _impl->getStats(); +} + } // namespace mongo diff --git a/src/mongo/util/concurrency/thread_pool.h b/src/mongo/util/concurrency/thread_pool.h index a6e56f8c9bf..29acd9e09c0 100644 --- a/src/mongo/util/concurrency/thread_pool.h +++ b/src/mongo/util/concurrency/thread_pool.h @@ -29,47 +29,52 @@ #pragma once -#include <deque> #include <functional> +#include <memory> #include <string> -#include <vector> -#include "mongo/platform/mutex.h" -#include "mongo/stdx/condition_variable.h" #include "mongo/stdx/thread.h" #include "mongo/util/concurrency/thread_pool_interface.h" -#include "mongo/util/hierarchical_acquisition.h" +#include "mongo/util/duration.h" #include "mongo/util/time_support.h" namespace mongo { -class Status; - /** * A configurable thread pool, for general use. * * See the Options struct for information about how to configure an instance. */ class ThreadPool final : public ThreadPoolInterface { - ThreadPool(const ThreadPool&) = delete; - ThreadPool& operator=(const ThreadPool&) = delete; - public: - struct Limits; + /** + * Contains a subset of the fields from Options related to limiting the number of concurrent + * threads in the pool. 
Used in places where we want a way to specify limits to the size of a + * ThreadPool without overriding the other behaviors of the pool such thread names or onCreate + * behaviors. Each field of Limits maps directly to the same-named field in Options. + */ + struct Limits { + size_t minThreads = 1; + size_t maxThreads = 8; + Milliseconds maxIdleThreadAge = Seconds{30}; + }; /** * Structure used to configure an instance of ThreadPool. */ struct Options { - - Options() = default; - explicit Options(const Limits& limits); - // Set maxThreads to this if you don't want to limit the number of threads in the pool. // Note: the value used here is high enough that it will never be reached, but low enough // that it won't cause overflows if mixed with signed ints or math. static constexpr size_t kUnlimited = 1'000'000'000; + Options() = default; + + explicit Options(const Limits& limits) + : minThreads(limits.minThreads), + maxThreads(limits.maxThreads), + maxIdleThreadAge(limits.maxIdleThreadAge) {} + // Name of the thread pool. If this string is empty, the pool will be assigned a // name unique to the current process. std::string poolName; @@ -95,29 +100,15 @@ public: // a thread. Milliseconds maxIdleThreadAge = Seconds{30}; - // This function is run before each worker thread begins consuming tasks. - using OnCreateThreadFn = std::function<void(const std::string& threadName)>; - OnCreateThreadFn onCreateThread = [](const std::string&) {}; + /** If callable, called before each worker thread begins consuming tasks. */ + std::function<void(const std::string&)> onCreateThread; /** - * This function is called after joining each retired thread. + * If callable, called after joining each retired thread. * Since there could be multiple calls to this function in a single critical section, * avoid complex logic in the callback. 
*/ - using OnJoinRetiredThreadFn = std::function<void(const stdx::thread&)>; - OnJoinRetiredThreadFn onJoinRetiredThread = [](const stdx::thread&) {}; - }; - - /** - * Contains a subset of the fields from Options related to limiting the number of concurrent - * threads in the pool. Used in places where we want a way to specify limits to the size of a - * ThreadPool without overriding the other behaviors of the pool such thread names or onCreate - * behaviors. Each field of Limits maps directly to the same-named field in Options. - */ - struct Limits { - size_t minThreads = 1; - size_t maxThreads = 8; - Milliseconds maxIdleThreadAge = Seconds{30}; + std::function<void(const stdx::thread&)> onJoinRetiredThread; }; /** @@ -145,12 +136,18 @@ public: */ explicit ThreadPool(Options options); + ThreadPool(const ThreadPool&) = delete; + ThreadPool& operator=(const ThreadPool&) = delete; + ~ThreadPool() override; + // from OutOfLineExecutor (base of ThreadPoolInterface) + void schedule(Task task) override; + + // from ThreadPoolInterface void startup() override; void shutdown() override; void join() override; - void schedule(Task task) override; /** * Blocks the caller until there are no pending tasks on this pool. @@ -170,120 +167,8 @@ public: Stats getStats() const; private: - using TaskList = std::deque<Task>; - using ThreadList = std::vector<stdx::thread>; - using RetiredThreadList = std::list<stdx::thread>; - - /** - * Representation of the stage of life of a thread pool. - * - * A pool starts out in the preStart state, and ends life in the shutdownComplete state. Work - * may only be scheduled in the preStart and running states. Threads may only be started in the - * running state. In shutdownComplete, there are no remaining threads or pending tasks to - * execute. 
- * - * Diagram of legal transitions: - * - * preStart -> running -> joinRequired -> joining -> shutdownComplete - * \ ^ - * \_____________/ - */ - enum LifecycleState { preStart, running, joinRequired, joining, shutdownComplete }; - - /** - * This is the thread body for worker threads. It is a static member function, - * because late in its execution it is possible for the pool to have been destroyed. - * As such, it is advisable to pass the pool pointer as an explicit argument, rather - * than as the implicit "this" argument. - */ - static void _workerThreadBody(ThreadPool* pool, const std::string& threadName) noexcept; - - /** - * Starts a worker thread, unless _options.maxThreads threads are already running or - * _state is not running. - */ - void _startWorkerThread_inlock(); - - /** - * This is the run loop of a worker thread, invoked by _workerThreadBody. - */ - void _consumeTasks(); - - /** - * Implementation of shutdown once _mutex is locked. - */ - void _shutdown_inlock(); - - /** - * Implementation of join once _mutex is owned by "lk". - */ - void _join_inlock(stdx::unique_lock<Latch>* lk); - - /** - * Runs the remaining tasks on a new thread as part of the join process, blocking until - * complete. Caller must not hold the mutex! - */ - void _drainPendingTasks(); - - /** - * Executes one task from _pendingTasks. "lk" must own _mutex, and _pendingTasks must have at - * least one entry. - */ - void _doOneTask(stdx::unique_lock<Latch>* lk) noexcept; - - /** - * Changes the lifecycle state (_state) of the pool and wakes up any threads waiting for a state - * change. Has no effect if _state == newState. - */ - void _setState_inlock(LifecycleState newState); - - /** - * Waits for all remaining retired threads to join. - * If a thread's _workerThreadBody() were ever to attempt to reacquire - * ThreadPool::_mutex after that thread had been added to _retiredThreads, - * it could cause a deadlock. 
- */ - void _joinRetired_inlock(); - - // These are the options with which the pool was configured at construction time. - const Options _options; - - // Mutex guarding all non-const member variables. - mutable Mutex _mutex = MONGO_MAKE_LATCH(HierarchicalAcquisitionLevel(0), "ThreadPool::_mutex"); - - // This variable represents the lifecycle state of the pool. - // - // Work may only be scheduled in states preStart and running, and only executes in states - // running and shuttingDown. - LifecycleState _state = preStart; - - // Condition signaled to indicate that there is work in the _pendingTasks queue, or - // that the system is shutting down. - stdx::condition_variable _workAvailable; - - // Condition signaled to indicate that there is no work in the _pendingTasks queue. - stdx::condition_variable _poolIsIdle; - - // Condition variable signaled whenever _state changes. - stdx::condition_variable _stateChange; - - // Queue of yet-to-be-executed tasks. - TaskList _pendingTasks; - - // List of threads serving as the worker pool. - ThreadList _threads; - - // List of threads that are retired and pending join - RetiredThreadList _retiredThreads; - - // Count of idle threads. - size_t _numIdleThreads = 0; - - // Id counter for assigning thread names - size_t _nextThreadId = 0; - - // The last time that _pendingTasks.size() grew to be at least _threads.size(). 
- Date_t _lastFullUtilizationDate; + class Impl; + std::unique_ptr<Impl> _impl; }; } // namespace mongo diff --git a/src/mongo/util/fail_point.cpp b/src/mongo/util/fail_point.cpp index c0a28ddb3ac..e467ff2d9fb 100644 --- a/src/mongo/util/fail_point.cpp +++ b/src/mongo/util/fail_point.cpp @@ -70,7 +70,7 @@ void FailPoint::setThreadPRNGSeed(int32_t seed) { threadPrng = PseudoRandom(seed); } -FailPoint::FailPoint() = default; +FailPoint::FailPoint(std::string name) : _name(std::move(name)) {} void FailPoint::_shouldFailCloseBlock() { _fpInfo.subtractAndFetch(1); @@ -286,8 +286,8 @@ BSONObj FailPoint::toBSON() const { return builder.obj(); } -FailPointRegisterer::FailPointRegisterer(const std::string& name, FailPoint* fp) { - uassertStatusOK(globalFailPointRegistry().add(name, fp)); +FailPointRegisterer::FailPointRegisterer(FailPoint* fp) { + uassertStatusOK(globalFailPointRegistry().add(fp)); } FailPointRegistry& globalFailPointRegistry() { @@ -309,12 +309,18 @@ auto setGlobalFailPoint(const std::string& failPointName, const BSONObj& cmdObj) return timesEntered; } -FailPointEnableBlock::FailPointEnableBlock(std::string failPointName) - : FailPointEnableBlock(std::move(failPointName), {}) {} +FailPointEnableBlock::FailPointEnableBlock(StringData failPointName) + : FailPointEnableBlock(failPointName, {}) {} + +FailPointEnableBlock::FailPointEnableBlock(StringData failPointName, BSONObj data) + : FailPointEnableBlock(globalFailPointRegistry().find(failPointName), std::move(data)) {} + +FailPointEnableBlock::FailPointEnableBlock(FailPoint* failPoint) + : FailPointEnableBlock(failPoint, {}) {} + +FailPointEnableBlock::FailPointEnableBlock(FailPoint* failPoint, BSONObj data) + : _failPoint(failPoint) { -FailPointEnableBlock::FailPointEnableBlock(std::string failPointName, BSONObj data) - : _failPointName(std::move(failPointName)) { - _failPoint = globalFailPointRegistry().find(_failPointName); invariant(_failPoint != nullptr); _initialTimesEntered = 
_failPoint->setMode(FailPoint::alwaysOn, 0, std::move(data)); @@ -322,7 +328,7 @@ FailPointEnableBlock::FailPointEnableBlock(std::string failPointName, BSONObj da LOGV2_WARNING(23830, "Set failpoint {failPointName} to: {failPoint}", "Set failpoint", - "failPointName"_attr = _failPointName, + "failPointName"_attr = _failPoint->getName(), "failPoint"_attr = _failPoint->toBSON()); } @@ -331,24 +337,25 @@ FailPointEnableBlock::~FailPointEnableBlock() { LOGV2_WARNING(23831, "Set failpoint {failPointName} to: {failPoint}", "Set failpoint", - "failPointName"_attr = _failPointName, + "failPointName"_attr = _failPoint->getName(), "failPoint"_attr = _failPoint->toBSON()); } FailPointRegistry::FailPointRegistry() : _frozen(false) {} -Status FailPointRegistry::add(const std::string& name, FailPoint* failPoint) { +Status FailPointRegistry::add(FailPoint* failPoint) { if (_frozen) { return {ErrorCodes::CannotMutateObject, "Registry is already frozen"}; } - auto [pos, ok] = _fpMap.insert({name, failPoint}); + auto [pos, ok] = _fpMap.insert({failPoint->getName(), failPoint}); if (!ok) { - return {ErrorCodes::Error(51006), "Fail point already registered: {}"_format(name)}; + return {ErrorCodes::Error(51006), + "Fail point already registered: {}"_format(failPoint->getName())}; } return Status::OK(); } -FailPoint* FailPointRegistry::find(const std::string& name) const { +FailPoint* FailPointRegistry::find(StringData name) const { auto iter = _fpMap.find(name); return (iter == _fpMap.end()) ? 
nullptr : iter->second; } diff --git a/src/mongo/util/fail_point.h b/src/mongo/util/fail_point.h index 5322fad8b67..af02e9f1622 100644 --- a/src/mongo/util/fail_point.h +++ b/src/mongo/util/fail_point.h @@ -40,6 +40,7 @@ #include "mongo/stdx/unordered_map.h" #include "mongo/util/duration.h" #include "mongo/util/interruptible.h" +#include "mongo/util/string_map.h" namespace mongo { @@ -202,11 +203,15 @@ public: */ static StatusWith<ModeOptions> parseBSON(const BSONObj& obj); - FailPoint(); + explicit FailPoint(std::string name); FailPoint(const FailPoint&) = delete; FailPoint& operator=(const FailPoint&) = delete; + const std::string& getName() const { + return _name; + } + /** * Returns true if fail point is active. * @@ -423,6 +428,8 @@ private: AtomicWord<int> _timesOrPeriod{0}; BSONObj _data; + const std::string _name; + // protects _mode, _timesOrPeriod, _data mutable Mutex _modMutex = MONGO_MAKE_LATCH("FailPoint::_modMutex"); }; @@ -439,12 +446,12 @@ public: * 51006 - if the given name already exists in this registry. * CannotMutateObject - if this registry is already frozen. */ - Status add(const std::string& name, FailPoint* failPoint); + Status add(FailPoint* failPoint); /** * @return a registered FailPoint, or nullptr if it was not registered. */ - FailPoint* find(const std::string& name) const; + FailPoint* find(StringData name) const; /** * Freezes this registry from being modified. 
@@ -460,7 +467,7 @@ public: private: bool _frozen; - stdx::unordered_map<std::string, FailPoint*> _fpMap; + StringMap<FailPoint*> _fpMap; }; /** @@ -468,10 +475,15 @@ private: */ class FailPointEnableBlock { public: - explicit FailPointEnableBlock(std::string failPointName); - FailPointEnableBlock(std::string failPointName, BSONObj data); + explicit FailPointEnableBlock(StringData failPointName); + FailPointEnableBlock(StringData failPointName, BSONObj data); + explicit FailPointEnableBlock(FailPoint* failPoint); + FailPointEnableBlock(FailPoint* failPoint, BSONObj data); ~FailPointEnableBlock(); + FailPointEnableBlock(const FailPointEnableBlock&) = delete; + FailPointEnableBlock& operator=(const FailPointEnableBlock&) = delete; + // Const access to the underlying FailPoint const FailPoint* failPoint() const { return _failPoint; @@ -488,8 +500,7 @@ public: } private: - std::string _failPointName; - FailPoint* _failPoint; + FailPoint* const _failPoint; FailPoint::EntryCountT _initialTimesEntered; }; @@ -507,7 +518,7 @@ FailPoint::EntryCountT setGlobalFailPoint(const std::string& failPointName, cons */ class FailPointRegisterer { public: - FailPointRegisterer(const std::string& name, FailPoint* fp); + explicit FailPointRegisterer(FailPoint* fp); }; FailPointRegistry& globalFailPointRegistry(); @@ -518,8 +529,8 @@ FailPointRegistry& globalFailPointRegistry(); * Never use in header files, only .cpp files. 
*/ #define MONGO_FAIL_POINT_DEFINE(fp) \ - ::mongo::FailPoint fp; \ - ::mongo::FailPointRegisterer fp##failPointRegisterer(#fp, &fp); + ::mongo::FailPoint fp(#fp); \ + ::mongo::FailPointRegisterer fp##failPointRegisterer(&fp); } // namespace mongo diff --git a/src/mongo/util/fail_point_test.cpp b/src/mongo/util/fail_point_test.cpp index 26b051fb7dc..a3c346594c9 100644 --- a/src/mongo/util/fail_point_test.cpp +++ b/src/mongo/util/fail_point_test.cpp @@ -55,12 +55,12 @@ namespace stdx = mongo::stdx; namespace mongo_test { TEST(FailPoint, InitialState) { - FailPoint failPoint; + FailPoint failPoint("testFP"); ASSERT_FALSE(failPoint.shouldFail()); } TEST(FailPoint, AlwaysOn) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::alwaysOn); ASSERT(failPoint.shouldFail()); @@ -74,7 +74,7 @@ TEST(FailPoint, AlwaysOn) { } TEST(FailPoint, NTimes) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::nTimes, 4); ASSERT(failPoint.shouldFail()); ASSERT(failPoint.shouldFail()); @@ -87,14 +87,14 @@ TEST(FailPoint, NTimes) { } TEST(FailPoint, BlockOff) { - FailPoint failPoint; + FailPoint failPoint("testFP"); bool called = false; failPoint.execute([&](const BSONObj&) { called = true; }); ASSERT_FALSE(called); } TEST(FailPoint, BlockAlwaysOn) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::alwaysOn); bool called = false; @@ -104,7 +104,7 @@ TEST(FailPoint, BlockAlwaysOn) { } TEST(FailPoint, BlockNTimes) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::nTimes, 1); size_t counter = 0; @@ -116,7 +116,7 @@ TEST(FailPoint, BlockNTimes) { } TEST(FailPoint, BlockWithException) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::alwaysOn); bool threw = false; @@ -134,7 +134,7 @@ TEST(FailPoint, BlockWithException) { } TEST(FailPoint, SetGetParam) { - FailPoint failPoint; + FailPoint failPoint("testFP"); 
failPoint.setMode(FailPoint::alwaysOn, 0, BSON("x" << 20)); failPoint.execute([&](const BSONObj& data) { ASSERT_EQUALS(20, data["x"].numberInt()); }); @@ -143,12 +143,13 @@ TEST(FailPoint, SetGetParam) { class FailPointStress : public mongo::unittest::Test { public: void setUp() { - _fp.setMode(FailPoint::alwaysOn, 0, BSON("a" << 44)); + _fp = std::make_unique<FailPoint>("testFP"); + _fp->setMode(FailPoint::alwaysOn, 0, BSON("a" << 44)); } void tearDown() { // Note: This can loop indefinitely if reference counter was off - _fp.setMode(FailPoint::off, 0, BSON("a" << 66)); + _fp->setMode(FailPoint::off, 0, BSON("a" << 66)); } void startTest() { @@ -174,7 +175,7 @@ public: private: void blockTask() { while (true) { - _fp.execute([](const BSONObj& data) { + _fp->execute([](const BSONObj& data) { // Expanded ASSERT_EQUALS since the error is not being // printed out properly if (data["a"].numberInt() != 44) { @@ -196,7 +197,7 @@ private: void blockWithExceptionTask() { while (true) { try { - _fp.execute([](const BSONObj& data) { + _fp->execute([](const BSONObj& data) { if (data["a"].numberInt() != 44) { using namespace mongo::literals; LOGV2_ERROR(24130, @@ -219,7 +220,7 @@ private: void simpleTask() { while (true) { - static_cast<void>(MONGO_unlikely(_fp.shouldFail())); + static_cast<void>(MONGO_unlikely(_fp->shouldFail())); stdx::lock_guard<mongo::Latch> lk(_mutex); if (_inShutdown) break; @@ -228,10 +229,10 @@ private: void flipTask() { while (true) { - if (_fp.shouldFail()) { - _fp.setMode(FailPoint::off, 0); + if (_fp->shouldFail()) { + _fp->setMode(FailPoint::off, 0); } else { - _fp.setMode(FailPoint::alwaysOn, 0, BSON("a" << 44)); + _fp->setMode(FailPoint::alwaysOn, 0, BSON("a" << 44)); } stdx::lock_guard<mongo::Latch> lk(_mutex); @@ -240,7 +241,7 @@ private: } } - FailPoint _fp; + std::unique_ptr<FailPoint> _fp; std::vector<stdx::thread> _tasks; mongo::Mutex _mutex = MONGO_MAKE_LATCH(); @@ -249,7 +250,7 @@ private: TEST_F(FailPointStress, Basic) { startTest(); - 
mongo::sleepsecs(30); + mongo::sleepsecs(5); stopTest(); } @@ -277,7 +278,7 @@ static int64_t runParallelFailPointTest(FailPoint::Mode fpMode, const int32_t numEncountersPerThread) { ASSERT_GT(numThreads, 0); ASSERT_GT(numEncountersPerThread, 0); - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(fpMode, fpVal); std::vector<stdx::thread*> tasks; std::vector<int64_t> counts(numThreads, 0); @@ -398,7 +399,7 @@ TEST(FailPoint, parseBSONValidDataSucceeds) { ASSERT_TRUE(swTuple.isOK()); } -TEST(FailPoint, FailPointBlockBasicTest) { +TEST(FailPoint, FailPointEnableBlockBasicTest) { auto failPoint = mongo::globalFailPointRegistry().find("dummy"); ASSERT_FALSE(failPoint->shouldFail()); @@ -411,8 +412,21 @@ TEST(FailPoint, FailPointBlockBasicTest) { ASSERT_FALSE(failPoint->shouldFail()); } -TEST(FailPoint, FailPointBlockIfBasicTest) { - FailPoint failPoint; +TEST(FailPoint, FailPointEnableBlockByPointer) { + auto failPoint = mongo::globalFailPointRegistry().find("dummy"); + + ASSERT_FALSE(failPoint->shouldFail()); + + { + FailPointEnableBlock dummyFp(failPoint); + ASSERT_TRUE(failPoint->shouldFail()); + } + + ASSERT_FALSE(failPoint->shouldFail()); +} + +TEST(FailPoint, ExecuteIfBasicTest) { + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::nTimes, 1, BSON("skip" << true)); { bool hit = false; @@ -463,7 +477,7 @@ void assertFunctionInterruptable(std::function<void(Interruptible* interruptible } TEST(FailPoint, PauseWhileSetInterruptibility) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::alwaysOn); assertFunctionInterruptable( @@ -473,7 +487,7 @@ TEST(FailPoint, PauseWhileSetInterruptibility) { } TEST(FailPoint, WaitForFailPointTimeout) { - FailPoint failPoint; + FailPoint failPoint("testFP"); failPoint.setMode(FailPoint::alwaysOn); assertFunctionInterruptable([&failPoint](Interruptible* interruptible) { diff --git a/src/mongo/util/invalidating_lru_cache.h b/src/mongo/util/invalidating_lru_cache.h 
index 18b9a94c9fa..c8ead4adecc 100644 --- a/src/mongo/util/invalidating_lru_cache.h +++ b/src/mongo/util/invalidating_lru_cache.h @@ -196,9 +196,9 @@ public: */ class ValueHandle { public: - // The two constructors below are present in order to offset the fact that the cache doesn't - // support pinning items. Their only usage must be in the authorization mananager for the - // internal authentication user. + // The three constructors below are present in order to offset the fact that the cache + // doesn't support pinning items. Their only usage must be in the authorization mananager + // for the internal authentication user. explicit ValueHandle(Value&& value) : _value(std::make_shared<StoredValue>(nullptr, 0, @@ -207,6 +207,10 @@ public: CacheNotCausallyConsistent(), CacheNotCausallyConsistent())) {} + explicit ValueHandle(Value&& value, const Time& t) + : _value( + std::make_shared<StoredValue>(nullptr, 0, boost::none, std::move(value), t, t)) {} + ValueHandle() = default; operator bool() const { @@ -264,15 +268,16 @@ public: Value&& value, const Time& time = CacheNotCausallyConsistent()) { LockGuardWithPostUnlockDestructor guard(_mutex); - Time timeInStore; - _invalidate(&guard, key, _cache.find(key), &timeInStore); - if (auto evicted = _cache.add(key, - std::make_shared<StoredValue>(this, - ++_epoch, - key, - std::forward<Value>(value), - time, - std::max(time, timeInStore)))) { + Time currentTime, currentTimeInStore; + _invalidate(&guard, key, _cache.find(key), ¤tTime, ¤tTimeInStore); + if (auto evicted = + _cache.add(key, + std::make_shared<StoredValue>(this, + ++_epoch, + key, + std::forward<Value>(value), + time, + std::max(time, currentTimeInStore)))) { const auto& evictedKey = evicted->first; auto& evictedValue = evicted->second; @@ -310,15 +315,16 @@ public: Value&& value, const Time& time = CacheNotCausallyConsistent()) { LockGuardWithPostUnlockDestructor guard(_mutex); - Time timeInStore; - _invalidate(&guard, key, _cache.find(key), &timeInStore); - 
if (auto evicted = _cache.add(key, - std::make_shared<StoredValue>(this, - ++_epoch, - key, - std::forward<Value>(value), - time, - std::max(time, timeInStore)))) { + Time currentTime, currentTimeInStore; + _invalidate(&guard, key, _cache.find(key), ¤tTime, ¤tTimeInStore); + if (auto evicted = + _cache.add(key, + std::make_shared<StoredValue>(this, + ++_epoch, + key, + std::forward<Value>(value), + time, + std::max(time, currentTimeInStore)))) { const auto& evictedKey = evicted->first; auto& evictedValue = evicted->second; @@ -526,10 +532,13 @@ private: void _invalidate(LockGuardWithPostUnlockDestructor* guard, const Key& key, typename Cache::iterator it, + Time* outTime = nullptr, Time* outTimeInStore = nullptr) { if (it != _cache.end()) { auto& storedValue = it->second; storedValue->isValid.store(false); + if (outTime) + *outTime = storedValue->time; if (outTimeInStore) *outTimeInStore = storedValue->timeInStore; guard->releasePtr(std::move(storedValue)); @@ -545,6 +554,8 @@ private: // released and drops to zero if (auto evictedValue = itEvicted->second.lock()) { evictedValue->isValid.store(false); + if (outTime) + *outTime = evictedValue->time; if (outTimeInStore) *outTimeInStore = evictedValue->timeInStore; guard->releasePtr(std::move(evictedValue)); diff --git a/src/mongo/util/invalidating_lru_cache_test.cpp b/src/mongo/util/invalidating_lru_cache_test.cpp index 282a130af68..8476dfc5c9e 100644 --- a/src/mongo/util/invalidating_lru_cache_test.cpp +++ b/src/mongo/util/invalidating_lru_cache_test.cpp @@ -67,11 +67,14 @@ TEST(InvalidatingLRUCacheTest, ValueHandleOperators) { TestValueCache cache(1); cache.insertOrAssign(100, {"Test value"}); + // Test non-const operators { auto valueHandle = cache.get(100); ASSERT_EQ("Test value", valueHandle->value); ASSERT_EQ("Test value", (*valueHandle).value); } + + // Test const operators { const auto valueHandle = cache.get(100); ASSERT_EQ("Test value", valueHandle->value); @@ -473,7 +476,7 @@ void parallelTest(size_t 
cacheSize, TestFunc doTest) { } TEST(InvalidatingLRUCacheParallelTest, InsertOrAssignThenGet) { - parallelTest<TestValueCache>(1, [](auto& cache) mutable { + parallelTest<TestValueCache>(1, [](auto& cache) { const int key = 100; cache.insertOrAssign(key, TestValue{"Parallel tester value"}); @@ -501,7 +504,7 @@ TEST(InvalidatingLRUCacheParallelTest, InsertOrAssignAndGet) { } TEST(InvalidatingLRUCacheParallelTest, CacheSizeZeroInsertOrAssignAndGet) { - parallelTest<TestValueCache>(0, [](auto& cache) mutable { + parallelTest<TestValueCache>(0, [](auto& cache) { const int key = 300; auto cachedItem = cache.insertOrAssignAndGet(key, TestValue{"Parallel tester value"}); ASSERT(cachedItem); @@ -511,12 +514,18 @@ TEST(InvalidatingLRUCacheParallelTest, CacheSizeZeroInsertOrAssignAndGet) { } TEST(InvalidatingLRUCacheParallelTest, AdvanceTime) { - AtomicWord<uint64_t> counter{0}; + AtomicWord<uint64_t> counter{1}; + Mutex insertOrAssignMutex = MONGO_MAKE_LATCH("ReadThroughCacheBase::_cancelTokenMutex"); - parallelTest<TestValueCacheCausallyConsistent>(0, [&counter](auto& cache) mutable { + parallelTest<TestValueCacheCausallyConsistent>(0, [&](auto& cache) { const int key = 300; - cache.insertOrAssign( - key, TestValue{"Parallel tester value"}, Timestamp(counter.fetchAndAdd(1))); + { + // The calls to insertOrAssign must always pass strictly incrementing time + stdx::lock_guard lg(insertOrAssignMutex); + cache.insertOrAssign( + key, TestValue{"Parallel tester value"}, Timestamp(counter.fetchAndAdd(1))); + } + auto latestCached = cache.get(key, CacheCausalConsistency::kLatestCached); auto latestKnown = cache.get(key, CacheCausalConsistency::kLatestKnown); diff --git a/src/mongo/util/read_through_cache.h b/src/mongo/util/read_through_cache.h index 3d5c7bf0923..72b3e7a5771 100644 --- a/src/mongo/util/read_through_cache.h +++ b/src/mongo/util/read_through_cache.h @@ -136,10 +136,12 @@ public: */ class ValueHandle { public: - // The two constructors below are present in order to 
offset the fact that the cache doesn't - // support pinning items. Their only usage must be in the authorization mananager for the - // internal authentication user. + // The three constructors below are present in order to offset the fact that the cache + // doesn't support pinning items. Their only usage must be in the authorization mananager + // for the internal authentication user. ValueHandle(Value&& value) : _valueHandle({std::move(value), Date_t::min()}) {} + ValueHandle(Value&& value, const Time& t) + : _valueHandle({std::move(value), Date_t::min()}, t) {} ValueHandle() = default; operator bool() const { @@ -289,6 +291,16 @@ public: } /** + * Acquires the latest value from the cache, or an empty ValueHandle if the key is not present + * in the cache. + * + * Doesn't attempt to lookup, and so doesn't block. + */ + ValueHandle peekLatestCached(const Key& key) { + return {_cache.get(key, CacheCausalConsistency::kLatestCached)}; + } + + /** * Invalidates the given 'key' and immediately replaces it with a new value. 
*/ ValueHandle insertOrAssignAndGet(const Key& key, Value&& newValue, Date_t updateWallClockTime) { diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 6a630be33db..9bf8a939b70 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.6", - "commit": "bb92ab603f22ca84c24af3be7bc9194f44ff3e64" + "commit": "a68890f718f74cdc9e9961bf5b33f5b125e853dd" } diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index ac7cef167ff..daee3be92a8 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -267,7 +267,7 @@ __session_close(WT_SESSION *wt_session, const char *config) SESSION_API_CALL_PREPARE_ALLOWED(session, close, config, cfg); WT_UNUSED(cfg); - WT_ERR(__wt_session_close_internal(session)); + WT_TRET(__wt_session_close_internal(session)); session = NULL; err: diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable10.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable10.py index 4a322c61998..12a3daeedfc 100755 --- a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable10.py +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable10.py @@ -38,7 +38,7 @@ from time import sleep def timestamp_str(t): return '%x' % t -def retry_rollback(self, name, code): +def retry_rollback(self, name, txn_session, code): retry_limit = 100 retries = 0 completed = False @@ -46,7 +46,12 @@ def retry_rollback(self, name, code): while not completed and retries < retry_limit: if retries != 0: self.pr("Retrying operation for " + name) + if txn_session: + txn_session.rollback_transaction() sleep(0.1) + if txn_session: + txn_session.begin_transaction('isolation=snapshot') + self.pr("Began new transaction for " + name) try: 
code() completed = True @@ -164,13 +169,13 @@ class test_rollback_to_stable10(test_rollback_to_stable_base): # Perform several updates in parallel with checkpoint. # Rollbacks may occur when checkpoint is running, so retry as needed. self.pr("updates") - retry_rollback(self, 'update ds1, e', + retry_rollback(self, 'update ds1, e', None, lambda: self.large_updates(uri_1, value_e, ds_1, nrows, 70)) - retry_rollback(self, 'update ds2, e', + retry_rollback(self, 'update ds2, e', None, lambda: self.large_updates(uri_2, value_e, ds_2, nrows, 70)) - retry_rollback(self, 'update ds1, f', + retry_rollback(self, 'update ds1, f', None, lambda: self.large_updates(uri_1, value_f, ds_1, nrows, 80)) - retry_rollback(self, 'update ds2, f', + retry_rollback(self, 'update ds2, f', None, lambda: self.large_updates(uri_2, value_f, ds_2, nrows, 80)) finally: done.set() @@ -271,12 +276,17 @@ class test_rollback_to_stable10(test_rollback_to_stable_base): else: self.conn.set_timestamp('stable_timestamp=' + timestamp_str(50)) - # Here's the update operation we'll perform, encapsulated so we can easily retry + # Here's the update operations we'll perform, encapsulated so we can easily retry # it if we get a rollback. Rollbacks may occur when checkpoint is running. 
- def simple_update(cursor, key, value): - cursor.set_key(key) - cursor.set_value(value) - self.assertEquals(cursor.update(), 0) + def prepare_range_updates(session, cursor, ds, value, nrows, prepare_config): + self.pr("updates") + for i in range(1, nrows): + key = ds.key(i) + cursor.set_key(key) + cursor.set_value(value) + self.assertEquals(cursor.update(), 0) + self.pr("prepare") + session.prepare_transaction(prepare_config) # Create a checkpoint thread done = threading.Event() @@ -289,23 +299,19 @@ class test_rollback_to_stable10(test_rollback_to_stable_base): session_p1 = self.conn.open_session() cursor_p1 = session_p1.open_cursor(uri_1) session_p1.begin_transaction('isolation=snapshot') - self.pr("updates 1") - for i in range(1, nrows): - retry_rollback(self, 'update ds1', - lambda: simple_update(cursor_p1, ds_1.key(i), value_e)) - self.pr("prepare 1") - session_p1.prepare_transaction('prepare_timestamp=' + timestamp_str(69)) + retry_rollback(self, 'update ds1', session_p1, + lambda: prepare_range_updates( + session_p1, cursor_p1, ds_1, value_e, nrows, + 'prepare_timestamp=' + timestamp_str(69))) # Perform several updates in parallel with checkpoint. 
session_p2 = self.conn.open_session() cursor_p2 = session_p2.open_cursor(uri_2) session_p2.begin_transaction('isolation=snapshot') - self.pr("updates 2") - for i in range(1, nrows): - retry_rollback(self, 'update ds2', - lambda: simple_update(cursor_p2, ds_2.key(i), value_e)) - self.pr("prepare 2") - session_p2.prepare_transaction('prepare_timestamp=' + timestamp_str(69)) + retry_rollback(self, 'update ds2', session_p2, + lambda: prepare_range_updates( + session_p2, cursor_p2, ds_2, value_e, nrows, + 'prepare_timestamp=' + timestamp_str(69))) finally: done.set() ckpt.join() diff --git a/src/third_party/wiredtiger/test/suite/wttest.py b/src/third_party/wiredtiger/test/suite/wttest.py index a0a86731f1c..617a8326582 100755 --- a/src/third_party/wiredtiger/test/suite/wttest.py +++ b/src/third_party/wiredtiger/test/suite/wttest.py @@ -227,6 +227,7 @@ class WiredTigerTestCase(unittest.TestCase): if hasattr(self, 'scenarios'): assert(len(self.scenarios) == len(dict(self.scenarios))) unittest.TestCase.__init__(self, *args, **kwargs) + self.skipped = False if not self._globalSetup: WiredTigerTestCase.globalSetup() @@ -253,6 +254,10 @@ class WiredTigerTestCase(unittest.TestCase): def buildDirectory(self): return self._builddir + def skipTest(self, reason): + self.skipped = True + super(WiredTigerTestCase, self).skipTest(reason) + # Return the wiredtiger_open extension argument for # any needed shared library. 
def extensionsConfig(self): @@ -460,9 +465,10 @@ class WiredTigerTestCase(unittest.TestCase): for f in files: os.chmod(os.path.join(root, f), 0o666) self.pr('passed=' + str(passed)) + self.pr('skipped=' + str(self.skipped)) # Clean up unless there's a failure - if passed and not WiredTigerTestCase._preserveFiles: + if (passed and (not WiredTigerTestCase._preserveFiles)) or self.skipped: shutil.rmtree(self.testdir, ignore_errors=True) else: self.pr('preserving directory ' + self.testdir) @@ -470,7 +476,7 @@ class WiredTigerTestCase(unittest.TestCase): elapsed = time.time() - self.starttime if elapsed > 0.001 and WiredTigerTestCase._verbose >= 2: print("%s: %.2f seconds" % (str(self), elapsed)) - if not passed: + if (not passed) and (not self.skipped): print("ERROR in " + str(self)) self.pr('FAIL') self.pr('preserving directory ' + self.testdir) |