SERVER-60577 Add logic to check time-series buckets for mixed-schema data at index time

author: Gregory Wlodarek <gregory.wlodarek@mongodb.com> 2021-10-30 01:05:07 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-30 01:38:21 +0000
commit: 8b405630fc1506abf620aaf4b08363d015327c9b (patch)
tree: d8f81d4f33499857d6c76fb2694cf9c5d4cccaa6 /src/mongo/db/catalog
parent: 2455e1c112c89a3bdde41d718cadf7f9cc9b5bf0 (diff)
download: mongo-8b405630fc1506abf620aaf4b08363d015327c9b.tar.gz
2 files changed, 92 insertions, 0 deletions
diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp
index 79ad8d98d47..2d4bf748a29 100644
--- a/src/mongo/db/catalog/multi_index_block.cpp
+++ b/src/mongo/db/catalog/multi_index_block.cpp
@@ -53,6 +53,8 @@
 #include "mongo/db/repl/tenant_migration_conflict_info.h"
 #include "mongo/db/storage/storage_options.h"
 #include "mongo/db/storage/write_unit_of_work.h"
+#include "mongo/db/timeseries/timeseries_constants.h"
+#include "mongo/db/timeseries/timeseries_index_schema_conversion_functions.h"
 #include "mongo/logv2/log.h"
 #include "mongo/util/assert_util.h"
 #include "mongo/util/fail_point.h"
@@ -83,6 +85,20 @@ size_t getEachIndexBuildMaxMemoryUsageBytes(size_t numIndexSpecs) {
         numIndexSpecs;
 }
 
+Status timeseriesMixedSchemaDataFailure(const Collection* collection) {
+    // Builds the CannotCreateIndex status returned when mixed-schema bucket data is detected.
+    // TODO SERVER-61070: Re-word the error message if necessary and add a URL for workarounds.
+    return Status(
+        ErrorCodes::CannotCreateIndex,
+        str::stream() << "Index build on collection '" << collection->ns() << "' ("
+                      << collection->uuid()
+                      << ") failed due to the detection of mixed-schema data in the "
+                      << "time-series buckets collection. Starting as of v5.2, time-series "
+                      << "measurement bucketing has been modified to ensure that newly created "
+                      << "time-series buckets do not contain mixed-schema data. For workarounds, "
+                      << "see: <url>");
+}
+
 }  // namespace
 
 MultiIndexBlock::~MultiIndexBlock() {
@@ -255,6 +271,15 @@ StatusWith<std::vector<BSONObj>> MultiIndexBlock::init(
             info = statusWithInfo.getValue();
             indexInfoObjs.push_back(info);
 
+            // TODO SERVER-54592: Remove FCV check once feature flag is enabled for v5.2.
+            boost::optional<TimeseriesOptions> options = collection->getTimeseriesOptions();
+            if (options &&
+                serverGlobalParams.featureCompatibility.isFCVUpgradingToOrAlreadyLatest() &&
+                timeseries::doesBucketsIndexIncludeKeyOnMeasurement(*options, info)) {
+                invariant(collection->getTimeseriesBucketsMayHaveMixedSchemaData());
+                _containsIndexBuildOnTimeseriesMeasurement = true;
+            }
+
             boost::optional<IndexStateInfo> stateInfo;
             auto& index = _indexes.emplace_back();
             index.block =
@@ -651,6 +676,37 @@ Status MultiIndexBlock::_insert(OperationContext* opCtx,
                                 const std::function<void()>& saveCursorBeforeWrite,
                                 const std::function<void()>& restoreCursorAfterWrite) {
     invariant(!_buildIsCleanedUp);
+
+    // The detection of mixed-schema data needs to be done before applying the partial filter
+    // expression below. Only check for mixed-schema data if it's possible for the time-series
+    // collection to have it.
+    if (_containsIndexBuildOnTimeseriesMeasurement &&
+        *collection->getTimeseriesBucketsMayHaveMixedSchemaData()) {
+        bool docHasMixedSchemaData =
+            collection->doesTimeseriesBucketsDocContainMixedSchemaData(doc);
+
+        if (docHasMixedSchemaData) {
+            LOGV2(6057700,
+                  "Detected mixed-schema data in time-series bucket collection",
+                  logAttrs(collection->ns()),
+                  logAttrs(collection->uuid()),
+                  "recordId"_attr = loc,
+                  "control"_attr = redact(doc.getObjectField(timeseries::kBucketControlFieldName)));
+
+            _timeseriesBucketContainsMixedSchemaData = true;
+        }
+
+        // Only enforce the mixed-schema data constraint on the primary. Index builds may not fail
+        // on the secondaries. The primary will replicate an abortIndexBuild oplog entry.
+        auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+        const bool replSetAndNotPrimary = replCoord->getSettings().usingReplSets() &&
+            !replCoord->canAcceptWritesFor(opCtx, collection->ns());
+
+        if (docHasMixedSchemaData && !replSetAndNotPrimary) {
+            return timeseriesMixedSchemaDataFailure(collection.get());
+        }
+    }
+
     for (size_t i = 0; i < _indexes.size(); i++) {
         if (_indexes[i].filterExpression && !_indexes[i].filterExpression->matchesBSON(doc)) {
             continue;
@@ -857,6 +913,23 @@ Status MultiIndexBlock::commit(OperationContext* opCtx,
         invariant(_collectionUUID.get() == collection->uuid());
     }
 
+    auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+    const bool replSetAndNotPrimary = replCoord->getSettings().usingReplSets() &&
+        !replCoord->canAcceptWritesFor(opCtx, collection->ns());
+
+    // During the collection scan phase, only the primary will enforce the mixed-schema data
+    // constraint. Secondaries only record whether mixed-schema data was detected and take no
+    // action. If the primary steps down during the index build, a secondary node will take over.
+    // This can happen after the collection scan phase, which is why we need this check here.
+    if (_timeseriesBucketContainsMixedSchemaData && !replSetAndNotPrimary) {
+        LOGV2_DEBUG(6057701,
+                    1,
+                    "Aborting index build commit due to the earlier detection of mixed-schema data",
+                    logAttrs(collection->ns()),
+                    logAttrs(collection->uuid()));
+        return timeseriesMixedSchemaDataFailure(collection);
+    }
+
     // Do not interfere with writing multikey information when committing index builds.
     ScopeGuard restartTracker(
         [this, opCtx] { MultikeyPathTracker::get(opCtx).startTrackingMultikeyPathInfo(); });
@@ -894,6 +967,18 @@ Status MultiIndexBlock::commit(OperationContext* opCtx,
 
     onCommit();
 
+    // Update the 'timeseriesBucketsMayHaveMixedSchemaData' catalog entry flag to false in order to
+    // allow subsequent index builds to skip checking bucket documents for mixed-schema data.
+    if (_containsIndexBuildOnTimeseriesMeasurement && !_timeseriesBucketContainsMixedSchemaData) {
+        boost::optional<bool> mayContainMixedSchemaData =
+            collection->getTimeseriesBucketsMayHaveMixedSchemaData();
+        invariant(mayContainMixedSchemaData);
+
+        if (*mayContainMixedSchemaData) {
+            collection->setTimeseriesBucketsMayHaveMixedSchemaData(opCtx, false);
+        }
+    }
+
     CollectionQueryInfo::get(collection).clearQueryCache(opCtx, collection);
     opCtx->recoveryUnit()->onCommit(
         [this](boost::optional<Timestamp> commitTime) { _buildIsCleanedUp = true; });
diff --git a/src/mongo/db/catalog/multi_index_block.h b/src/mongo/db/catalog/multi_index_block.h
index 2dd45df2a7b..fa747afd8a3 100644
--- a/src/mongo/db/catalog/multi_index_block.h
+++ b/src/mongo/db/catalog/multi_index_block.h
@@ -354,6 +354,13 @@ private:
 
     bool _ignoreUnique = false;
 
+    // True if one or more indexes being built are on time-series measurements.
+    bool _containsIndexBuildOnTimeseriesMeasurement = false;
+
+    // True if at least one bucket document contains mixed-schema data and
+    // '_containsIndexBuildOnTimeseriesMeasurement=true'.
+    bool _timeseriesBucketContainsMixedSchemaData = false;
+
     // Set to true when no work remains to be done, the object can safely destruct without leaving
     // incorrect state set anywhere.
     bool _buildIsCleanedUp = true;
author	Gregory Wlodarek <gregory.wlodarek@mongodb.com>	2021-10-30 01:05:07 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-10-30 01:38:21 +0000
commit	8b405630fc1506abf620aaf4b08363d015327c9b (patch)
tree	d8f81d4f33499857d6c76fb2694cf9c5d4cccaa6 /src/mongo/db/catalog
parent	2455e1c112c89a3bdde41d718cadf7f9cc9b5bf0 (diff)
download	mongo-8b405630fc1506abf620aaf4b08363d015327c9b.tar.gz