summaryrefslogtreecommitdiff
path: root/src/mongo/db
diff options
context:
space:
mode:
authorKaloian Manassiev <kaloian.manassiev@mongodb.com>2016-06-30 14:32:38 -0400
committerKaloian Manassiev <kaloian.manassiev@mongodb.com>2016-07-08 13:26:39 -0400
commit7c67f3a6fd52ea4933c23ce89f1cc5da62628ec4 (patch)
treed843d57ca15094997eff24e10035170244be1c1b /src/mongo/db
parent920afddd4910e1efa6736331f2b38e7e276bd3d8 (diff)
downloadmongo-7c67f3a6fd52ea4933c23ce89f1cc5da62628ec4.tar.gz
SERVER-24858 Tighten assertions around waiting for write concern
Diffstat (limited to 'src/mongo/db')
-rw-r--r--src/mongo/db/commands/get_last_error.cpp9
-rw-r--r--src/mongo/db/dbcommands.cpp4
-rw-r--r--src/mongo/db/repl/read_concern_args.cpp4
-rw-r--r--src/mongo/db/repl/read_concern_args.h6
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp17
-rw-r--r--src/mongo/db/write_concern.cpp142
-rw-r--r--src/mongo/db/write_concern.h4
-rw-r--r--src/mongo/db/write_concern_options.cpp2
-rw-r--r--src/mongo/db/write_concern_options.h4
9 files changed, 84 insertions, 108 deletions
diff --git a/src/mongo/db/commands/get_last_error.cpp b/src/mongo/db/commands/get_last_error.cpp
index 7af1a83aafa..498d32d9d0c 100644
--- a/src/mongo/db/commands/get_last_error.cpp
+++ b/src/mongo/db/commands/get_last_error.cpp
@@ -44,6 +44,7 @@
#include "mongo/util/log.h"
namespace mongo {
+namespace {
using std::string;
using std::stringstream;
@@ -219,11 +220,9 @@ public:
//
// Validate write concern no matter what, this matches 2.4 behavior
//
-
-
if (status.isOK()) {
// Ensure options are valid for this host
- status = validateWriteConcern(txn, writeConcern, dbname);
+ status = validateWriteConcern(txn, writeConcern);
}
if (!status.isOK()) {
@@ -316,4 +315,6 @@ public:
return true;
}
} cmdGetPrevError;
-}
+
+} // namespace
+} // namespace mongo
diff --git a/src/mongo/db/dbcommands.cpp b/src/mongo/db/dbcommands.cpp
index d6facd3fd3f..91f9ad970d3 100644
--- a/src/mongo/db/dbcommands.cpp
+++ b/src/mongo/db/dbcommands.cpp
@@ -99,7 +99,7 @@
#include "mongo/s/chunk_version.h"
#include "mongo/s/client/shard_registry.h"
#include "mongo/s/grid.h"
-#include "mongo/s/stale_exception.h" // for SendStaleConfigException
+#include "mongo/s/stale_exception.h"
#include "mongo/scripting/engine.h"
#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
@@ -1473,7 +1473,7 @@ bool Command::run(OperationContext* txn,
}
} else {
// Skip waiting for the OpTime when testing snapshot behavior.
- if (!testingSnapshotBehaviorInIsolation) {
+ if (!testingSnapshotBehaviorInIsolation && !readConcernArgs.isEmpty()) {
// Wait for readConcern to be satisfied.
auto readConcernStatus = replCoord->waitUntilOpTimeForRead(txn, readConcernArgs);
if (!readConcernStatus.isOK()) {
diff --git a/src/mongo/db/repl/read_concern_args.cpp b/src/mongo/db/repl/read_concern_args.cpp
index c0af213fe56..409560de922 100644
--- a/src/mongo/db/repl/read_concern_args.cpp
+++ b/src/mongo/db/repl/read_concern_args.cpp
@@ -70,6 +70,10 @@ BSONObj ReadConcernArgs::toBSON() const {
return bob.obj();
}
+bool ReadConcernArgs::isEmpty() const {
+ return getOpTime().isNull() && getLevel() == repl::ReadConcernLevel::kLocalReadConcern;
+}
+
ReadConcernLevel ReadConcernArgs::getLevel() const {
return _level.value_or(ReadConcernLevel::kLocalReadConcern);
}
diff --git a/src/mongo/db/repl/read_concern_args.h b/src/mongo/db/repl/read_concern_args.h
index 5fa9a5fba8c..b258ce04461 100644
--- a/src/mongo/db/repl/read_concern_args.h
+++ b/src/mongo/db/repl/read_concern_args.h
@@ -80,6 +80,12 @@ public:
*/
void appendInfo(BSONObjBuilder* builder) const;
+ /**
+ * Returns whether these arguments are 'empty' in the sense that no read concern has been
+ * requested.
+ */
+ bool isEmpty() const;
+
ReadConcernLevel getLevel() const;
OpTime getOpTime() const;
BSONObj toBSON() const;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 875468ca3f0..9337ffb9cfd 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -96,8 +96,7 @@ using NextAction = Fetcher::NextAction;
namespace {
-Status shutDownInProgressStatus(ErrorCodes::ShutdownInProgress,
- "replication system is shutting down");
+const char kLocalDB[] = "local";
void lockAndCall(stdx::unique_lock<stdx::mutex>* lk, const stdx::function<void()>& fn) {
if (!lk->owns_lock()) {
@@ -1060,6 +1059,10 @@ OpTime ReplicationCoordinatorImpl::getMyLastDurableOpTime() const {
Status ReplicationCoordinatorImpl::waitUntilOpTimeForRead(OperationContext* txn,
const ReadConcernArgs& settings) {
+ // We should never wait for replication if we are holding any locks, because this can
+ // potentially block for a long time while doing network activity.
+ invariant(!txn->lockState()->isLocked());
+
const bool isMajorityReadConcern =
settings.getLevel() == ReadConcernLevel::kMajorityReadConcern;
@@ -1501,6 +1504,10 @@ ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::_awaitRepl
const OpTime& opTime,
SnapshotName minSnapshot,
const WriteConcernOptions& writeConcern) {
+ // We should never wait for writes to replicate if we are holding any locks, because this
+ // can potentially block for a long time while doing network activity.
+ invariant(!txn->lockState()->isLocked());
+
const Mode replMode = getReplicationMode();
if (replMode == modeNone) {
// no replication check needed (validated above)
@@ -1808,7 +1815,7 @@ bool ReplicationCoordinatorImpl::canAcceptWritesForDatabase(StringData dbName) {
if (_canAcceptNonLocalWrites) {
return true;
}
- if (dbName == "local") {
+ if (dbName == kLocalDB) {
return true;
}
return !replAllDead && _settings.isMaster();
@@ -1860,7 +1867,7 @@ bool ReplicationCoordinatorImpl::shouldIgnoreUniqueIndex(const IndexDescriptor*
if (idx->isIdIndex()) {
return false;
}
- if (nsToDatabaseSubstring(idx->parentNS()) == "local") {
+ if (nsToDatabaseSubstring(idx->parentNS()) == kLocalDB) {
// always enforce on local
return false;
}
@@ -2971,7 +2978,7 @@ SyncSourceResolverResponse ReplicationCoordinatorImpl::selectSyncSource(
};
Fetcher candidateProber(&_replExecutor,
candidate,
- "local",
+ kLocalDB,
BSON("find"
<< "oplog.rs"
<< "limit"
diff --git a/src/mongo/db/write_concern.cpp b/src/mongo/db/write_concern.cpp
index 1fbb5b9d827..ac428b29183 100644
--- a/src/mongo/db/write_concern.cpp
+++ b/src/mongo/db/write_concern.cpp
@@ -58,90 +58,70 @@ static ServerStatusMetricField<TimerStats> displayGleLatency("getLastError.wtime
static Counter64 gleWtimeouts;
static ServerStatusMetricField<Counter64> gleWtimeoutsDisplay("getLastError.wtimeouts",
&gleWtimeouts);
-namespace {
-const std::string kLocalDB = "local";
-} // namespace
StatusWith<WriteConcernOptions> extractWriteConcern(OperationContext* txn,
const BSONObj& cmdObj,
const std::string& dbName,
const bool supportsWriteConcern) {
- // The default write concern if empty is w : 1
- // Specifying w : 0 is/was allowed, but is interpreted identically to w : 1
- WriteConcernOptions writeConcern =
- repl::getGlobalReplicationCoordinator()->getGetLastErrorDefault();
-
- auto wcResult = WriteConcernOptions::extractWCFromCommand(cmdObj, dbName, writeConcern);
+ // The default write concern if empty is {w:1}. Specifying {w:0} is/was allowed, but is
+ // interpreted identically to {w:1}.
+ auto wcResult = WriteConcernOptions::extractWCFromCommand(
+ cmdObj, dbName, repl::ReplicationCoordinator::get(txn)->getGetLastErrorDefault());
if (!wcResult.isOK()) {
return wcResult.getStatus();
}
- writeConcern = wcResult.getValue();
- // We didn't use the default, so the user supplied their own writeConcern.
- if (!wcResult.getValue().usedDefault) {
+ WriteConcernOptions writeConcern = wcResult.getValue();
+
+ if (writeConcern.usedDefault) {
+ if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer &&
+ !txn->getClient()->isInDirectClient()) {
+ // This is here only for backwards compatibility with 3.2 clusters which have commands
+ // that do not specify write concern when writing to the config server.
+ writeConcern = {
+ WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, Seconds(30)};
+ }
+ } else if (supportsWriteConcern) {
// If it supports writeConcern and does not use the default, validate the writeConcern.
- if (supportsWriteConcern) {
- Status wcStatus = validateWriteConcern(txn, writeConcern, dbName);
- if (!wcStatus.isOK()) {
- return wcStatus;
- }
- } else {
- // This command doesn't do writes so it should not be passed a writeConcern.
- // If we did not use the default writeConcern, one was provided when it shouldn't have
- // been by the user.
- return Status(ErrorCodes::InvalidOptions, "Command does not support writeConcern");
+ Status wcStatus = validateWriteConcern(txn, writeConcern);
+ if (!wcStatus.isOK()) {
+ return wcStatus;
}
+ } else {
+ // This command doesn't do writes so it should not be passed a writeConcern. If we did not
+ // use the default writeConcern, one was provided when it shouldn't have been by the user.
+ return {ErrorCodes::InvalidOptions, "Command does not support writeConcern"};
}
return writeConcern;
}
-Status validateWriteConcern(OperationContext* txn,
- const WriteConcernOptions& writeConcern,
- const std::string& dbName) {
- const bool isJournalEnabled = getGlobalServiceContext()->getGlobalStorageEngine()->isDurable();
-
- if (writeConcern.syncMode == WriteConcernOptions::SyncMode::JOURNAL && !isJournalEnabled) {
+Status validateWriteConcern(OperationContext* txn, const WriteConcernOptions& writeConcern) {
+ if (writeConcern.syncMode == WriteConcernOptions::SyncMode::JOURNAL &&
+ !txn->getServiceContext()->getGlobalStorageEngine()->isDurable()) {
return Status(ErrorCodes::BadValue,
"cannot use 'j' option when a host does not have journaling enabled");
}
- const bool isConfigServer = serverGlobalParams.clusterRole == ClusterRole::ConfigServer;
- const bool isLocalDb(dbName == kLocalDB);
- const repl::ReplicationCoordinator::Mode replMode =
- repl::getGlobalReplicationCoordinator()->getReplicationMode();
-
- if (isConfigServer) {
- if (!writeConcern.validForConfigServers()) {
- return Status(
- ErrorCodes::BadValue,
- str::stream()
- << "w:1 and w:'majority' are the only valid write concerns when writing to "
- "config servers, got: "
- << writeConcern.toBSON().toString());
- }
-
- if (replMode == repl::ReplicationCoordinator::modeReplSet &&
- // Allow writes performed within the server to have a write concern of { w: 1 }.
- // This is so commands have the option to skip waiting for replication if they are
- // holding locks (ex. addShardToZone). This also allows commands that perform
- // multiple writes to batch the wait at the end.
- !txn->getClient()->isInDirectClient() &&
- !isLocalDb && writeConcern.wMode.empty()) {
- invariant(writeConcern.wNumNodes == 1);
- return Status(
- ErrorCodes::BadValue,
- str::stream()
- << "w: 'majority' is the only valid write concern when writing to config "
- "server replica sets, got: "
- << writeConcern.toBSON().toString());
+ // Remote callers of the config server (as in callers making network calls, not the internal
+ // logic) should never be making non-majority writes against the config server, because sharding
+ // is not resilient against rollbacks of metadata writes.
+ if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer &&
+ !writeConcern.validForConfigServers()) {
+ // The only cases where we allow non-majority writes are from within the config servers
+ // themselves, because these wait for write concern explicitly.
+ if (!txn->getClient()->isInDirectClient()) {
+ return {ErrorCodes::BadValue,
+ str::stream() << "w:'majority' is the only valid write concern when writing "
+ "to config servers, got: "
+ << writeConcern.toBSON()};
}
}
- if (replMode == repl::ReplicationCoordinator::modeNone) {
- if (writeConcern.wNumNodes > 1) {
- return Status(ErrorCodes::BadValue, "cannot use 'w' > 1 when a host is not replicated");
- }
+ const auto replMode = repl::ReplicationCoordinator::get(txn)->getReplicationMode();
+
+ if (replMode == repl::ReplicationCoordinator::modeNone && writeConcern.wNumNodes > 1) {
+ return Status(ErrorCodes::BadValue, "cannot use 'w' > 1 when a host is not replicated");
}
if (replMode != repl::ReplicationCoordinator::modeReplSet && !writeConcern.wMode.empty() &&
@@ -186,45 +166,26 @@ void WriteConcernResult::appendTo(const WriteConcernOptions& writeConcern,
else
result->append("err", err);
- // *** 2.4 SyncClusterConnection compatibility ***
- // 2.4 expects either fsync'd files, or a "waited" field exist after running an fsync : true
- // GLE, but with journaling we don't actually need to run the fsync (fsync command is
- // preferred in 2.6). So we add a "waited" field if one doesn't exist.
-
- if (writeConcern.syncMode == WriteConcernOptions::SyncMode::FSYNC) {
- if (fsyncFiles < 0 && (wTime < 0 || !wTimedOut)) {
- dassert(result->asTempObj()["waited"].eoo());
- result->appendNumber("waited", syncMillis);
- }
-
- // For ephemeral storage engines, 0 files may be fsynced.
- dassert(result->asTempObj()["fsyncFiles"].numberLong() >= 0 ||
- !result->asTempObj()["waited"].eoo());
- }
+ // For ephemeral storage engines, 0 files may be fsynced
+ invariant(writeConcern.syncMode != WriteConcernOptions::SyncMode::FSYNC ||
+ (result->asTempObj()["fsyncFiles"].numberLong() >= 0 ||
+ !result->asTempObj()["waited"].eoo()));
}
Status waitForWriteConcern(OperationContext* txn,
const OpTime& replOpTime,
const WriteConcernOptions& writeConcern,
WriteConcernResult* result) {
- // We assume all options have been validated earlier, if not, programming error.
- // Passing localDB name is a hack to avoid more rigorous check that performed for non local DB.
- dassert(validateWriteConcern(txn, writeConcern, kLocalDB).isOK());
+ // We assume all options have been validated earlier, if not, programming error
+ dassertOK(validateWriteConcern(txn, writeConcern));
- // We should never be waiting for write concern while holding any sort of lock, because this may
- // lead to situations where the replication heartbeats are stalled.
- //
- // This check does not hold for writes done through dbeval because it runs with a global X lock.
- dassert(!txn->lockState()->isLocked() || txn->getClient()->isInDirectClient());
+ auto replCoord = repl::ReplicationCoordinator::get(txn);
// Next handle blocking on disk
-
Timer syncTimer;
- auto replCoord = repl::getGlobalReplicationCoordinator();
WriteConcernOptions writeConcernWithPopulatedSyncMode =
replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);
-
switch (writeConcernWithPopulatedSyncMode.syncMode) {
case WriteConcernOptions::SyncMode::UNSET:
severe() << "Attempting to wait on a WriteConcern with an unset sync option";
@@ -270,16 +231,15 @@ Status waitForWriteConcern(OperationContext* txn,
return Status::OK();
}
- // Now we wait for replication
- // Note that replica set stepdowns and gle mode changes are thrown as errors
+ // Replica set stepdowns and gle mode changes are thrown as errors
repl::ReplicationCoordinator::StatusAndDuration replStatus =
- repl::getGlobalReplicationCoordinator()->awaitReplication(
- txn, replOpTime, writeConcernWithPopulatedSyncMode);
+ replCoord->awaitReplication(txn, replOpTime, writeConcernWithPopulatedSyncMode);
if (replStatus.status == ErrorCodes::WriteConcernFailed) {
gleWtimeouts.increment();
result->err = "timeout";
result->wTimedOut = true;
}
+
// Add stats
result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
replOpTime,
diff --git a/src/mongo/db/write_concern.h b/src/mongo/db/write_concern.h
index 2b18ff5c538..eeff284ec61 100644
--- a/src/mongo/db/write_concern.h
+++ b/src/mongo/db/write_concern.h
@@ -54,9 +54,7 @@ StatusWith<WriteConcernOptions> extractWriteConcern(OperationContext* txn,
/**
* Verifies that a WriteConcern is valid for this particular host.
*/
-Status validateWriteConcern(OperationContext* txn,
- const WriteConcernOptions& writeConcern,
- const std::string& dbName);
+Status validateWriteConcern(OperationContext* txn, const WriteConcernOptions& writeConcern);
struct WriteConcernResult {
WriteConcernResult() {
diff --git a/src/mongo/db/write_concern_options.cpp b/src/mongo/db/write_concern_options.cpp
index 2b0c9abef23..95b411d9baa 100644
--- a/src/mongo/db/write_concern_options.cpp
+++ b/src/mongo/db/write_concern_options.cpp
@@ -180,7 +180,7 @@ bool WriteConcernOptions::shouldWaitForOtherNodes() const {
}
bool WriteConcernOptions::validForConfigServers() const {
- return wNumNodes == 1 || wMode == kMajority;
+ return wMode == kMajority;
}
} // namespace mongo
diff --git a/src/mongo/db/write_concern_options.h b/src/mongo/db/write_concern_options.h
index f969fc08031..06ddfbc56a0 100644
--- a/src/mongo/db/write_concern_options.h
+++ b/src/mongo/db/write_concern_options.h
@@ -79,8 +79,8 @@ public:
bool shouldWaitForOtherNodes() const;
/**
- * Returns true if this is a valid write concern to use against a config server.
- * TODO(spencer): Once we stop supporting SCCC config servers, forbid this from allowing w:1
+ * Returns true if this is a {w:majority} write concern, which is the only valid write concern
+ * to use against a config server.
*/
bool validForConfigServers() const;