author     Esha Maharishi <esha.maharishi@mongodb.com>    2018-06-11 15:58:28 -0400
committer  Esha Maharishi <esha.maharishi@mongodb.com>    2018-06-11 16:15:23 -0400
commit     b7c49f9ba1e9684028a08ee9de0b03be51470706 (patch)
tree       6852b4395215c96f64e51e6256204860480459fd
parent     1d378efb7e7aec14404b4d50df524fc2f1a3e881 (diff)
download   mongo-b7c49f9ba1e9684028a08ee9de0b03be51470706.tar.gz
SERVER-34072 Temporarily blacklist database_versioning.js workload from non-stepdown concurrency suites as well
(cherry picked from commit f4f7f2cc15ad46f20de8566e913fb5d25cfd4778)
SERVER-34072 config.databases writes with new version field should take the global IX lock rather than the fcvLock
(cherry picked from commit 0254efea54ab78d065e7df44a2d21b4c5b62212e)
19 files changed, 335 insertions, 152 deletions
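The second commit message above moves config.databases writes from the fcvLock to the Global IX lock. The observable invariant it protects is that every config.databases entry carries a "version" field exactly when the cluster is in FCV 4.0. The following shell snippet is an illustrative sketch only (it is not part of the patch); it restates the assertion made by the new database_versioning.js workload further down, and assumes a mongo shell connected to a mongos of a sharded cluster with no setFeatureCompatibilityVersion currently in progress.

    // Sketch (not part of the patch): check that the config.databases schema matches the FCV.
    // "db" is assumed to be a connection to a mongos; assert.eq() and tojson() are mongo shell
    // helpers. Assumes no setFeatureCompatibilityVersion is in flight while this runs.
    const fcvDoc = db.getSiblingDB("admin").system.version.findOne(
        {_id: "featureCompatibilityVersion"});
    const expectVersionField = (fcvDoc.version === "4.0");
    db.getSiblingDB("config").databases.find().forEach(function(entry) {
        assert.eq(expectVersionField,
                  entry.version !== undefined,
                  "config.databases entry does not match FCV " + fcvDoc.version + ": " +
                      tojson(entry));
    });
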
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency.yml
index d8a5284ae5b..e3eba518069 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency.yml
@@ -112,6 +112,9 @@ selector:
   - jstests/concurrency/fsm_workloads/yield_and_hashed.js
   - jstests/concurrency/fsm_workloads/yield_and_sorted.js
 
+  # TODO Unblacklist (SERVER-35538).
+  - jstests/concurrency/fsm_workloads/database_versioning.js
+
 executor:
   archive:
     hooks:
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml
index d87180a16f2..5da18cfa424 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml
@@ -119,6 +119,9 @@ selector:
   - jstests/concurrency/fsm_workloads/yield_and_hashed.js
   - jstests/concurrency/fsm_workloads/yield_and_sorted.js
 
+  # TODO Unblacklist (SERVER-35538).
+  - jstests/concurrency/fsm_workloads/database_versioning.js
+
 executor:
   archive:
     hooks:
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml
index b5e67ad6a15..48843083a1f 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml
@@ -109,6 +109,9 @@ selector:
   - jstests/concurrency/fsm_workloads/yield_and_hashed.js
   - jstests/concurrency/fsm_workloads/yield_and_sorted.js
 
+  # TODO Unblacklist (SERVER-35538).
+  - jstests/concurrency/fsm_workloads/database_versioning.js
+
 executor:
   archive:
     hooks:
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml
index 2f166cd0027..ebcf7db5533 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml
@@ -116,6 +116,9 @@ selector:
   - jstests/concurrency/fsm_workloads/yield_and_hashed.js
   - jstests/concurrency/fsm_workloads/yield_and_sorted.js
 
+  # TODO Unblacklist (SERVER-35538).
+  - jstests/concurrency/fsm_workloads/database_versioning.js
+
 executor:
   archive:
     hooks:
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml
index e118de74a15..d3812362bc1 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml
@@ -173,6 +173,11 @@ selector:
   # much value in running it.
   - jstests/concurrency/fsm_workloads/drop_collection.js
 
+  # TODO (SERVER-35534) Unblacklist this workload from the concurrency stepdown suites. It fails
+  # with PooledConnectionsDropped when setFCV is run concurrently with movePrimary, which seems like
+  # it's due to a race condition in the NetworkInterfaceTL.
+  - jstests/concurrency/fsm_workloads/database_versioning.js
+
 executor:
   archive:
     hooks:
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml
index d198a78899f..9b0ac84484d 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml
@@ -178,6 +178,12 @@ selector:
   # much value in running it.
   - jstests/concurrency/fsm_workloads/drop_collection.js
 
+
+  # TODO (SERVER-35534) Unblacklist this workload from the concurrency stepdown suites. It fails
+  # with PooledConnectionsDropped when setFCV is run concurrently with movePrimary, which seems like
+  # it's due to a race condition in the NetworkInterfaceTL.
+  - jstests/concurrency/fsm_workloads/database_versioning.js
+
 executor:
   archive:
     hooks:
diff --git a/jstests/concurrency/fsm_workloads/database_versioning.js b/jstests/concurrency/fsm_workloads/database_versioning.js
new file mode 100644
index 00000000000..f7b02fc914a
--- /dev/null
+++ b/jstests/concurrency/fsm_workloads/database_versioning.js
@@ -0,0 +1,154 @@
+'use strict';
+
+/**
+ * Stress tests writes to config.databases while continuously running setFCV to ensure that the
+ * config.databases schema always matches the FCV.
+ */
+var $config = (function() {
+
+    var states = (function() {
+
+        function init(db, collName) {
+            // Dynamically load the shard names for the movePrimary thread to avoid hard-coding
+            // them.
+            this.shards = db.getSiblingDB("config").shards.find().toArray().map(shard => shard._id);
+        }
+
+        function setFCV(db, data) {
+            if (data.fcv === undefined) {
+                data.fcv = "4.0";
+            }
+
+            // First check that the current entries match the current FCV's schema.
+            const databases = db.getSiblingDB("config").databases.find().toArray();
+            for (let i in databases) {
+                const database = databases[i];
+                if (data.fcv === "3.6") {
+                    assertAlways.eq(undefined,
+                                    database.version,
+                                    "database had a version in FCV 3.6: " + tojson(database));
+                } else {
+                    assertAlways.neq(
+                        undefined,
+                        database.version,
+                        "database did not have a version in FCV 4.0: " + tojson(database));
+                }
+            }
+
+            // Then change the FCV.
+            data.fcv = (data.fcv === "4.0") ? "3.6" : "4.0";
+            assertAlways.commandWorked(db.adminCommand({setFeatureCompatibilityVersion: data.fcv}));
+        }
+
+        function createWithEnableSharding(db, data) {
+            data.dbNameCount = (data.dbNameCount === undefined) ? 0 : data.dbNameCount + 1;
+            assertAlways.commandWorked(
+                db.adminCommand({enableSharding: "createWithEnableShardingDb" + data.dbNameCount}));
+        }
+
+        function createWithInsert(db, data) {
+            data.dbNameCount = (data.dbNameCount === undefined) ? 0 : data.dbNameCount + 1;
+            assertAlways.commandWorked(
+                db.getSiblingDB("createWithInsertDb" + data.dbNameCount).foo.insert({x: 1}));
+        }
+
+        function movePrimary(db, data) {
+            // Assume an arbitrary shard is the current primary shard; if it's not, the first
+            // iteration will be a no-op.
+            if (data.primaryShard === undefined) {
+                data.primaryShard = data.shards[0];
+            }
+
+            const toShard =
+                (data.primaryShard === data.shards[0]) ? data.shards[1] : data.shards[0];
+            const res = db.adminCommand({movePrimary: "movePrimaryDb", to: toShard});
+
+            // movePrimary will correctly error if the FCV changes while it is running.
+            if (res.code === ErrorCodes.ConflictingOperationInProgress) {
+                return;
+            }
+
+            assertAlways.commandWorked(res);
+            data.primaryShard = toShard;
+        }
+
+        function state(db, collName) {
+            switch (this.tid) {
+                case 0:
+                    setFCV(db, this);
+                    break;
+                case 1:
+                    createWithEnableSharding(db, this);
+                    break;
+                case 2:
+                    createWithInsert(db, this);
+                    break;
+                case 3:
+                    movePrimary(db, this);
+                    break;
+            }
+        }
+
+        return {init: init, state: state};
+
+    })();
+
+    var transitions = {init: {state: 1}, state: {state: 1}};
+
+    function setup(db, collName, cluster) {
+        // Ensure the cluster starts in FCV 4.0.
+        assertAlways.commandWorked(db.adminCommand({setFeatureCompatibilityVersion: "4.0"}));
+
+        // Create a database for the thread doing movePrimary. Use 'enableSharding' rather than
+        // an insert to create the database (if an unsharded collection exists in a database and an
+        // initial movePrimary attempt fails after copying the unsharded collection to the new
+        // primary, retrying the movePrimary will fail).
+        assertAlways.commandWorked(db.adminCommand({enableSharding: "movePrimaryDb"}));
+    }
+
+    function teardown(db, collName, cluster) {
+        // Ensure all databases have data in them so that they show up in listDatabases (which calls
+        // listDatabases against all shards rather than reading config.databases). This guarantees
+        // they are dropped by the cleanup hook.
+        const databases = db.getSiblingDB("config").databases.find().toArray();
+        for (let i in databases) {
+            const database = databases[i];
+            assertAlways.commandWorked(db.getSiblingDB(databases[i]._id).foo.insert({x: 1}));
+        }
+
+        // If this workload is run with --repeat, mongos will already have all the database entries
+        // cached. Because of SERVER-xxx (mongos does not mark its database entry as invalid on
+        // CannotImplicitlyCreateCollection), this mongos will never realize the databases have been
+        // dropped, and so will never send the implicit createDatabase for writes in the next run
+        // (and instead will exhaust retries of CannotImplicitlyCreateCollection).
+        // As a temporary workaround, flush mongos's cache at the end of each workload.
+        assertAlways.commandWorked(db.adminCommand({flushRouterConfig: 1}));
+    }
+
+    function skip(cluster) {
+        if (!cluster.isSharded()) {
+            return {skip: true, msg: 'only runs in a sharded cluster'};
+        }
+        return {skip: false};
+    }
+
+    // This test performs sharding catalog operations (which take distributed locks) concurrently
+    // from many threads. Since a distributed lock is acquired by repeatedly attempting to grab the
+    // lock every half second for 20 seconds (a max of 40 attempts), it's possible that some thread
+    // will be starved by the other threads and fail to grab the lock after 40 attempts. To reduce
+    // the likelihood of this, we choose threadCount and iterations so that threadCount * iterations
+    // does not exceed 40.
+    // Note that this test's structure requires at least 4 threads (one per sharding catalog op).
+    // The iterations can be increased after PM-697 ("Remove all usages of distributed lock").
+    return {
+        threadCount: 4,
+        iterations: 10,
+        data: {},
+        states: states,
+        transitions: transitions,
+        setup: setup,
+        teardown: teardown,
+        skip: skip,
+    };
+
+})();
diff --git a/src/mongo/db/s/SConscript b/src/mongo/db/s/SConscript
index 66ce03e8045..2216e68abae 100644
--- a/src/mongo/db/s/SConscript
+++ b/src/mongo/db/s/SConscript
@@ -197,6 +197,7 @@ env.Library(
         '$BUILD_DIR/mongo/db/catalog/collection_options',
         '$BUILD_DIR/mongo/db/catalog_raii',
         '$BUILD_DIR/mongo/db/repl/read_concern_args',
+        '$BUILD_DIR/mongo/db/rw_concern_d',
         '$BUILD_DIR/mongo/executor/network_interface',
         '$BUILD_DIR/mongo/s/catalog/sharding_catalog_client',
         '$BUILD_DIR/mongo/s/client/sharding_client',
diff --git a/src/mongo/db/s/config/configsvr_add_shard_command.cpp b/src/mongo/db/s/config/configsvr_add_shard_command.cpp
index bf5ed552446..53a7a53b6e6 100644
--- a/src/mongo/db/s/config/configsvr_add_shard_command.cpp
+++ b/src/mongo/db/s/config/configsvr_add_shard_command.cpp
@@ -97,11 +97,6 @@ public:
                     "_configsvrAddShard can only be run on config servers");
         }
 
-        // Do not allow adding shards while a featureCompatibilityVersion upgrade or downgrade is in
-        // progress (see SERVER-31231 for details).
-        invariant(!opCtx->lockState()->isLocked());
-        Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);
-
         // Set the operation context read concern level to local for reads into the config database.
         repl::ReadConcernArgs::get(opCtx) =
             repl::ReadConcernArgs(repl::ReadConcernLevel::kLocalReadConcern);
diff --git a/src/mongo/db/s/config/configsvr_move_primary_command.cpp b/src/mongo/db/s/config/configsvr_move_primary_command.cpp
index 82b49d344af..5dbab2f5023 100644
--- a/src/mongo/db/s/config/configsvr_move_primary_command.cpp
+++ b/src/mongo/db/s/config/configsvr_move_primary_command.cpp
@@ -259,12 +259,13 @@ public:
         }
 
         {
-            // Check if the FCV has been changed under us.
-            invariant(!opCtx->lockState()->isLocked());
-            Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);
+            // Hold the Global IX lock across checking the FCV and writing the database entry.
+            // Because of the Global S lock barrier in setFCV, this ensures a concurrent setFCV
+            // will block before performing schema upgrade until we have written the entry.
+            Lock::GlobalLock lk(opCtx, MODE_IX);
 
             // If we are upgrading to (or are fully on) FCV 4.0, then fail. If we do not fail, we
-            // will potentially write an unversioned database in a schema that requires versions.
+            // will potentially change the primary shard of a database without changing its version.
             if (serverGlobalParams.featureCompatibility.getVersion() ==
                     ServerGlobalParams::FeatureCompatibility::Version::kUpgradingTo40 ||
                 serverGlobalParams.featureCompatibility.getVersion() ==
@@ -273,9 +274,14 @@
                       "committing movePrimary failed due to version mismatch");
             }
 
-            // Update the new primary in the config server metadata.
-            dbType.setPrimary(toShard->getId());
-            uassertStatusOK(catalogClient->updateDatabase(opCtx, dbname, dbType));
+            // Update the database entry with the new primary shard.
+            uassertStatusOK(catalogClient->updateConfigDocument(
+                opCtx,
+                DatabaseType::ConfigNS,
+                BSON(DatabaseType::name(dbname)),
+                BSON("$set" << BSON(DatabaseType::primary(toShard->getId().toString()))),
+                false,
+                ShardingCatalogClient::kLocalWriteConcern));
         }
 
diff --git a/src/mongo/db/s/config/sharding_catalog_manager_database_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_database_operations.cpp
index ad61fe0d787..a7371789025 100644
--- a/src/mongo/db/s/config/sharding_catalog_manager_database_operations.cpp
+++ b/src/mongo/db/s/config/sharding_catalog_manager_database_operations.cpp
@@ -37,6 +37,7 @@
 #include "mongo/db/namespace_string.h"
 #include "mongo/db/repl/repl_client_info.h"
 #include "mongo/db/server_options.h"
+#include "mongo/db/write_concern.h"
 #include "mongo/s/catalog/sharding_catalog_client_impl.h"
 #include "mongo/s/catalog/type_database.h"
 #include "mongo/s/catalog_cache.h"
@@ -111,11 +112,11 @@ DatabaseType ShardingCatalogManager::createDatabase(OperationContext* opCtx,
     const auto primaryShardId =
         uassertStatusOK(_selectShardForNewDatabase(opCtx, Grid::get(opCtx)->shardRegistry()));
 
-    // Take the fcvLock to prevent the fcv from changing from the point we decide whether to include
-    // a databaseVersion until after we finish writing the database entry. Otherwise, we may end up
-    // in fcv>3.6, but without a databaseVersion.
-    invariant(!opCtx->lockState()->isLocked());
-    Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);
+    // Hold the Global IX lock across checking the FCV and writing the new database entry.
+    // Because of the Global S lock barrier in setFCV, this ensures either the setFCV schema
+    // upgrade/downgrade will block until we have written the entry, or we will write the entry in
+    // the target FCV's schema.
+    Lock::GlobalLock lk(opCtx, MODE_IX);
 
     // If in FCV>3.6, generate a databaseVersion, including a UUID, for the new database.
     boost::optional<DatabaseVersion> dbVersion = boost::none;
@@ -132,7 +133,7 @@ DatabaseType ShardingCatalogManager::createDatabase(OperationContext* opCtx,
     log() << "Registering new database " << db << " in sharding catalog";
 
     uassertStatusOK(Grid::get(opCtx)->catalogClient()->insertConfigDocument(
-        opCtx, DatabaseType::ConfigNS, db.toBSON(), ShardingCatalogClient::kMajorityWriteConcern));
+        opCtx, DatabaseType::ConfigNS, db.toBSON(), ShardingCatalogClient::kLocalWriteConcern));
 
     return db;
 }
@@ -154,8 +155,26 @@ void ShardingCatalogManager::enableSharding(OperationContext* opCtx, const std::
     auto dbType = createDatabase(opCtx, dbName);
     dbType.setSharded(true);
 
+    // We must wait for the database entry to be majority committed, because it's possible that
+    // reading from the majority snapshot has been set on the RecoveryUnit due to an earlier read,
+    // such as overtaking a distlock or loading the ShardRegistry.
+    WriteConcernResult unusedResult;
+    uassertStatusOK(
+        waitForWriteConcern(opCtx,
+                            repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp(),
+                            WriteConcernOptions(WriteConcernOptions::kMajority,
+                                                WriteConcernOptions::SyncMode::UNSET,
+                                                Milliseconds{30000}),
+                            &unusedResult));
+
     log() << "Enabling sharding for database [" << dbName << "] in config db";
 
-    uassertStatusOK(Grid::get(opCtx)->catalogClient()->updateDatabase(opCtx, dbName, dbType));
+    uassertStatusOK(Grid::get(opCtx)->catalogClient()->updateConfigDocument(
+        opCtx,
+        DatabaseType::ConfigNS,
+        BSON(DatabaseType::name(dbName)),
+        BSON("$set" << BSON(DatabaseType::sharded(true))),
+        false,
+        ShardingCatalogClient::kLocalWriteConcern));
 }
 
 StatusWith<std::vector<std::string>> ShardingCatalogManager::getDatabasesForShard(
@@ -192,9 +211,10 @@ Status ShardingCatalogManager::commitMovePrimary(OperationContext* opCtx,
 
     auto const configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard();
 
-    // Check if the FCV has been changed under us.
-    invariant(!opCtx->lockState()->isLocked());
-    Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);
+    // Hold the Global IX lock across checking the FCV and writing the database entry. Because of
+    // the Global S lock barrier in setFCV, this ensures a concurrent setFCV will block before
+    // performing schema downgrade until we have written the entry.
+    Lock::GlobalLock lk(opCtx, MODE_IX);
 
     auto currentFCV = serverGlobalParams.featureCompatibility.getVersion();
 
@@ -250,8 +270,8 @@ Status ShardingCatalogManager::commitMovePrimary(OperationContext* opCtx,
                                                          DatabaseType::ConfigNS,
                                                          updateQueryBuilder.obj(),
                                                          newDbType.toBSON(),
-                                                         true,
-                                                         ShardingCatalogClient::kMajorityWriteConcern);
+                                                         false,
+                                                         ShardingCatalogClient::kLocalWriteConcern);
 
     if (!updateStatus.isOK()) {
         log() << "error committing movePrimary: " << dbname
diff --git a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
index e6c68ba2f04..153e9f78b71 100644
--- a/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
+++ b/src/mongo/db/s/config/sharding_catalog_manager_shard_operations.cpp
@@ -45,6 +45,7 @@
 #include "mongo/db/audit.h"
 #include "mongo/db/catalog_raii.h"
 #include "mongo/db/client.h"
+#include "mongo/db/commands/feature_compatibility_version.h"
 #include "mongo/db/commands/feature_compatibility_version_command_parser.h"
 #include "mongo/db/commands/feature_compatibility_version_parser.h"
 #include "mongo/db/namespace_string.h"
@@ -659,65 +660,84 @@ StatusWith<std::string> ShardingCatalogManager::addShard(
         return batchResponseStatus;
     }
 
+    {
+        // Hold the fcvLock across checking the FCV, sending setFCV to the new shard, and
+        // writing the entry for the new shard to config.shards. This ensures the FCV doesn't change
+        // after we send setFCV to the new shard, but before we write its entry to config.shards.
+        // (Note, we don't use a Global IX lock here, because we don't want to hold the global lock
+        // while blocking on the network).
+        invariant(!opCtx->lockState()->isLocked());
+        Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);
+
+        BSONObj setFCVCmd;
+        switch (serverGlobalParams.featureCompatibility.getVersion()) {
+            case ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo40:
+            case ServerGlobalParams::FeatureCompatibility::Version::kUpgradingTo40:
+                setFCVCmd = BSON(FeatureCompatibilityVersionCommandParser::kCommandName
+                                 << FeatureCompatibilityVersionParser::kVersion40
+                                 << WriteConcernOptions::kWriteConcernField
+                                 << opCtx->getWriteConcern().toBSON());
+                break;
+            default:
+                setFCVCmd = BSON(FeatureCompatibilityVersionCommandParser::kCommandName
+                                 << FeatureCompatibilityVersionParser::kVersion36
+                                 << WriteConcernOptions::kWriteConcernField
+                                 << opCtx->getWriteConcern().toBSON());
+                break;
+        }
+        auto versionResponse =
+            _runCommandForAddShard(opCtx, targeter.get(), NamespaceString::kAdminDb, setFCVCmd);
+        if (!versionResponse.isOK()) {
+            return versionResponse.getStatus();
+        }
 
-    // Since addShard runs under the fcvLock, it is guaranteed the fcv state won't change, but it's
-    // possible an earlier setFCV failed partway, so we handle all possible fcv states. Note, if
-    // the state is upgrading (downgrading), a user cannot switch to downgrading (upgrading) without
-    // first finishing the upgrade (downgrade).
-    BSONObj setFCVCmd;
-    switch (serverGlobalParams.featureCompatibility.getVersion()) {
-        case ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo40:
-        case ServerGlobalParams::FeatureCompatibility::Version::kUpgradingTo40:
-            setFCVCmd = BSON(FeatureCompatibilityVersionCommandParser::kCommandName
-                             << FeatureCompatibilityVersionParser::kVersion40
-                             << WriteConcernOptions::kWriteConcernField
-                             << opCtx->getWriteConcern().toBSON());
-            break;
-        default:
-            setFCVCmd = BSON(FeatureCompatibilityVersionCommandParser::kCommandName
-                             << FeatureCompatibilityVersionParser::kVersion36
-                             << WriteConcernOptions::kWriteConcernField
-                             << opCtx->getWriteConcern().toBSON());
-            break;
-    }
-    auto versionResponse =
-        _runCommandForAddShard(opCtx, targeter.get(), NamespaceString::kAdminDb, setFCVCmd);
-    if (!versionResponse.isOK()) {
-        return versionResponse.getStatus();
-    }
-
-    if (!versionResponse.getValue().commandStatus.isOK()) {
-        return versionResponse.getValue().commandStatus;
-    }
+        if (!versionResponse.getValue().commandStatus.isOK()) {
+            return versionResponse.getValue().commandStatus;
+        }
 
-    log() << "going to insert new entry for shard into config.shards: " << shardType.toString();
+        log() << "going to insert new entry for shard into config.shards: " << shardType.toString();
 
-    Status result = Grid::get(opCtx)->catalogClient()->insertConfigDocument(
-        opCtx,
-        ShardType::ConfigNS,
-        shardType.toBSON(),
-        ShardingCatalogClient::kMajorityWriteConcern);
-    if (!result.isOK()) {
-        log() << "error adding shard: " << shardType.toBSON() << " err: " << result.reason();
-        return result;
+        Status result = Grid::get(opCtx)->catalogClient()->insertConfigDocument(
+            opCtx,
+            ShardType::ConfigNS,
+            shardType.toBSON(),
+            ShardingCatalogClient::kLocalWriteConcern);
+        if (!result.isOK()) {
+            log() << "error adding shard: " << shardType.toBSON() << " err: " << result.reason();
+            return result;
+        }
     }
 
     // Add all databases which were discovered on the new shard
     for (const auto& dbName : dbNamesStatus.getValue()) {
         DatabaseType dbt(dbName, shardType.getName(), false);
 
-        // If we're in FCV 4.0, we should add a version to each database.
-        if (serverGlobalParams.featureCompatibility.getVersion() ==
-                ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo40 ||
-            serverGlobalParams.featureCompatibility.getVersion() ==
-                ServerGlobalParams::FeatureCompatibility::Version::kUpgradingTo40) {
-            dbt.setVersion(databaseVersion::makeNew());
-        }
+        {
+            // Hold the Global IX lock across checking the FCV and writing the database entry.
+            // Because of the Global S lock barrier in setFCV, this ensures either the setFCV schema
+            // upgrade/downgrade will block until we have written the entry, or we will write the
+            // entry in the target FCV's schema.
+            Lock::GlobalLock lk(opCtx, MODE_IX);
+
+            // If we're in FCV 4.0, we should add a version to each database.
+            if (serverGlobalParams.featureCompatibility.getVersion() ==
+                    ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo40 ||
+                serverGlobalParams.featureCompatibility.getVersion() ==
+                    ServerGlobalParams::FeatureCompatibility::Version::kUpgradingTo40) {
+                dbt.setVersion(databaseVersion::makeNew());
+            }
 
-        Status status = Grid::get(opCtx)->catalogClient()->updateDatabase(opCtx, dbName, dbt);
-        if (!status.isOK()) {
-            log() << "adding shard " << shardConnectionString.toString()
-                  << " even though could not add database " << dbName;
+            const auto status = Grid::get(opCtx)->catalogClient()->updateConfigDocument(
+                opCtx,
+                DatabaseType::ConfigNS,
+                BSON(DatabaseType::name(dbName)),
+                dbt.toBSON(),
+                true,
+                ShardingCatalogClient::kLocalWriteConcern);
+            if (!status.isOK()) {
+                log() << "adding shard " << shardConnectionString.toString()
+                      << " even though could not add database " << dbName;
+            }
         }
     }
 
diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
index 04288ed16bd..7a677c60d8a 100644
--- a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
+++ b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
@@ -204,39 +204,13 @@ void forceDatabaseRefresh(OperationContext* opCtx, const StringData dbName) {
         uassertStatusOK(Grid::get(opCtx)->catalogCache()->getDatabaseWithRefresh(opCtx, dbName))
             .databaseVersion();
 
-    // Only set the in-memory version in FCV 4.0, and hold the lock across checking the FCV and
-    // setting the version.
-    Lock::SharedLock lk(opCtx->lockState(), FeatureCompatibilityVersion::fcvLock);
-    if (serverGlobalParams.featureCompatibility.getVersion() ==
-        ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo40) {
-        // First, check under a shared lock if another thread already updated the cached version.
-        // This is a best-effort optimization to make as few threads as possible to convoy on the
-        // exclusive lock below.
-        {
-            // Take the DBLock directly rather than using AutoGetDb, to prevent a recursive call
-            // into checkDbVersion().
-            Lock::DBLock dbLock(opCtx, dbName, MODE_IS);
-            const auto db = DatabaseHolder::getDatabaseHolder().get(opCtx, dbName);
-            if (!db) {
-                log() << "Database " << dbName
-                      << " has been dropped; not caching the refreshed databaseVersion";
-                return;
-            }
-
-            const auto cachedDbVersion = DatabaseShardingState::get(db).getDbVersion(opCtx);
-            if (cachedDbVersion && refreshedDbVersion &&
-                cachedDbVersion->getUuid() == refreshedDbVersion->getUuid() &&
-                cachedDbVersion->getLastMod() >= refreshedDbVersion->getLastMod()) {
-                LOG(2) << "Skipping setting cached databaseVersion for " << dbName
-                       << " to refreshed version " << refreshedDbVersion->toBSON()
-                       << " because current cached databaseVersion is already "
-                       << cachedDbVersion->toBSON();
-                return;
-            }
-        }
-
-        // The cached version is older than the refreshed version; update the cached version.
-        Lock::DBLock dbLock(opCtx, dbName, MODE_X);
+    // First, check under a shared lock if another thread already updated the cached version.
+    // This is a best-effort optimization to make as few threads as possible to convoy on the
+    // exclusive lock below.
+    {
+        // Take the DBLock directly rather than using AutoGetDb, to prevent a recursive call
+        // into checkDbVersion().
+        Lock::DBLock dbLock(opCtx, dbName, MODE_IS);
         const auto db = DatabaseHolder::getDatabaseHolder().get(opCtx, dbName);
         if (!db) {
             log() << "Database " << dbName
@@ -244,8 +218,28 @@ void forceDatabaseRefresh(OperationContext* opCtx, const StringData dbName) {
             return;
         }
 
-        DatabaseShardingState::get(db).setDbVersion(opCtx, std::move(refreshedDbVersion));
+        const auto cachedDbVersion = DatabaseShardingState::get(db).getDbVersion(opCtx);
+        if (cachedDbVersion && refreshedDbVersion &&
+            cachedDbVersion->getUuid() == refreshedDbVersion->getUuid() &&
+            cachedDbVersion->getLastMod() >= refreshedDbVersion->getLastMod()) {
+            LOG(2) << "Skipping setting cached databaseVersion for " << dbName
+                   << " to refreshed version " << refreshedDbVersion->toBSON()
+                   << " because current cached databaseVersion is already "
+                   << cachedDbVersion->toBSON();
+            return;
+        }
     }
+
+    // The cached version is older than the refreshed version; update the cached version.
+    Lock::DBLock dbLock(opCtx, dbName, MODE_X);
+    const auto db = DatabaseHolder::getDatabaseHolder().get(opCtx, dbName);
+    if (!db) {
+        log() << "Database " << dbName
+              << " has been dropped; not caching the refreshed databaseVersion";
+        return;
+    }
+
+    DatabaseShardingState::get(db).setDbVersion(opCtx, std::move(refreshedDbVersion));
 }
 
 }  // namespace mongo
diff --git a/src/mongo/s/catalog/sharding_catalog_client.h b/src/mongo/s/catalog/sharding_catalog_client.h
index 427a4c53775..ef0337dbf1a 100644
--- a/src/mongo/s/catalog/sharding_catalog_client.h
+++ b/src/mongo/s/catalog/sharding_catalog_client.h
@@ -109,13 +109,6 @@ public:
     virtual void shutDown(OperationContext* opCtx) = 0;
 
     /**
-     * Updates or creates the metadata for a given database.
-     */
-    virtual Status updateDatabase(OperationContext* opCtx,
-                                  const std::string& dbName,
-                                  const DatabaseType& db) = 0;
-
-    /**
      * Retrieves the metadata for a given database, if it exists.
      *
      * @param dbName name of the database (case sensitive)
diff --git a/src/mongo/s/catalog/sharding_catalog_client_impl.cpp b/src/mongo/s/catalog/sharding_catalog_client_impl.cpp
index 0afb6f59f2c..7ebc7561849 100644
--- a/src/mongo/s/catalog/sharding_catalog_client_impl.cpp
+++ b/src/mongo/s/catalog/sharding_catalog_client_impl.cpp
@@ -150,20 +150,6 @@ Status ShardingCatalogClientImpl::updateShardingCatalogEntryForCollection(
     return status.getStatus().withContext(str::stream() << "Collection metadata write failed");
 }
 
-Status ShardingCatalogClientImpl::updateDatabase(OperationContext* opCtx,
-                                                 const std::string& dbName,
-                                                 const DatabaseType& db) {
-    fassert(28616, db.validate());
-
-    auto status = updateConfigDocument(opCtx,
-                                       DatabaseType::ConfigNS,
-                                       BSON(DatabaseType::name(dbName)),
-                                       db.toBSON(),
-                                       true,
-                                       ShardingCatalogClient::kMajorityWriteConcern);
-    return status.getStatus().withContext(str::stream() << "Database metadata write failed");
-}
-
 Status ShardingCatalogClientImpl::logAction(OperationContext* opCtx,
                                             const std::string& what,
                                             const std::string& ns,
diff --git a/src/mongo/s/catalog/sharding_catalog_client_impl.h b/src/mongo/s/catalog/sharding_catalog_client_impl.h
index f20ea1e4ac1..6ef87d060ff 100644
--- a/src/mongo/s/catalog/sharding_catalog_client_impl.h
+++ b/src/mongo/s/catalog/sharding_catalog_client_impl.h
@@ -71,10 +71,6 @@ public:
     void shutDown(OperationContext* opCtx) override;
 
-    Status updateDatabase(OperationContext* opCtx,
-                          const std::string& dbName,
-                          const DatabaseType& db) override;
-
     Status logAction(OperationContext* opCtx,
                      const std::string& what,
                      const std::string& ns,
diff --git a/src/mongo/s/catalog/sharding_catalog_client_mock.cpp b/src/mongo/s/catalog/sharding_catalog_client_mock.cpp
index 04fdd6b649e..b1a1633f4b0 100644
--- a/src/mongo/s/catalog/sharding_catalog_client_mock.cpp
+++ b/src/mongo/s/catalog/sharding_catalog_client_mock.cpp
@@ -63,17 +63,6 @@ void ShardingCatalogClientMock::shutDown(OperationContext* opCtx) {
     }
 }
 
-Status ShardingCatalogClientMock::enableSharding(OperationContext* opCtx,
-                                                 const std::string& dbName) {
-    return {ErrorCodes::InternalError, "Method not implemented"};
-}
-
-Status ShardingCatalogClientMock::updateDatabase(OperationContext* opCtx,
-                                                 const string& dbName,
-                                                 const DatabaseType& db) {
-    return {ErrorCodes::InternalError, "Method not implemented"};
-}
-
 StatusWith<repl::OpTimeWith<DatabaseType>> ShardingCatalogClientMock::getDatabase(
     OperationContext* opCtx, const string& dbName, repl::ReadConcernLevel readConcernLevel) {
     return {ErrorCodes::InternalError, "Method not implemented"};
diff --git a/src/mongo/s/catalog/sharding_catalog_client_mock.h b/src/mongo/s/catalog/sharding_catalog_client_mock.h
index 643ef0e3d78..d98c644792d 100644
--- a/src/mongo/s/catalog/sharding_catalog_client_mock.h
+++ b/src/mongo/s/catalog/sharding_catalog_client_mock.h
@@ -44,12 +44,6 @@ public:
     void shutDown(OperationContext* opCtx) override;
 
-    Status enableSharding(OperationContext* opCtx, const std::string& dbName);
-
-    Status updateDatabase(OperationContext* opCtx,
-                          const std::string& dbName,
-                          const DatabaseType& db) override;
-
     StatusWith<repl::OpTimeWith<DatabaseType>> getDatabase(
         OperationContext* opCtx,
         const std::string& dbName,
diff --git a/src/mongo/s/catalog/sharding_catalog_test.cpp b/src/mongo/s/catalog/sharding_catalog_test.cpp
index c8a15fb5c36..fa954472e32 100644
--- a/src/mongo/s/catalog/sharding_catalog_test.cpp
+++ b/src/mongo/s/catalog/sharding_catalog_test.cpp
@@ -1080,7 +1080,13 @@
 TEST_F(ShardingCatalogClientTest, UpdateDatabase) {
     DatabaseType dbt("test", ShardId("shard0000"), true);
 
     auto future = launchAsync([this, dbt] {
-        auto status = catalogClient()->updateDatabase(operationContext(), dbt.getName(), dbt);
+        auto status =
+            catalogClient()->updateConfigDocument(operationContext(),
+                                                  DatabaseType::ConfigNS,
+                                                  BSON(DatabaseType::name(dbt.getName())),
+                                                  dbt.toBSON(),
+                                                  true,
+                                                  ShardingCatalogClient::kMajorityWriteConcern);
         ASSERT_OK(status);
     });
 
@@ -1114,14 +1120,20 @@ TEST_F(ShardingCatalogClientTest, UpdateDatabase) {
     future.timed_get(kFutureTimeout);
 }
 
-TEST_F(ShardingCatalogClientTest, UpdateDatabaseExceededTimeLimit) {
+TEST_F(ShardingCatalogClientTest, UpdateConfigDocumentExceededTimeLimit) {
     HostAndPort host1("TestHost1");
     configTargeter()->setFindHostReturnValue(host1);
 
     DatabaseType dbt("test", ShardId("shard0001"), false);
 
     auto future = launchAsync([this, dbt] {
-        auto status = catalogClient()->updateDatabase(operationContext(), dbt.getName(), dbt);
+        auto status =
+            catalogClient()->updateConfigDocument(operationContext(),
+                                                  DatabaseType::ConfigNS,
+                                                  BSON(DatabaseType::name(dbt.getName())),
+                                                  dbt.toBSON(),
+                                                  true,
+                                                  ShardingCatalogClient::kMajorityWriteConcern);
         ASSERT_EQ(ErrorCodes::ExceededTimeLimit, status);
     });