Diffstat (limited to 'jstests/noPassthrough/timeseries_sample.js')
-rw-r--r-- | jstests/noPassthrough/timeseries_sample.js | 122
1 file changed, 122 insertions, 0 deletions
diff --git a/jstests/noPassthrough/timeseries_sample.js b/jstests/noPassthrough/timeseries_sample.js
new file mode 100644
index 00000000000..d38af3559d3
--- /dev/null
+++ b/jstests/noPassthrough/timeseries_sample.js
@@ -0,0 +1,122 @@
+/**
+ * Tests inserting sample data into the time-series buckets collection. This test exercises
+ * the optimized $sample implementation for $_internalUnpackBucket.
+ * @tags: [
+ *   sbe_incompatible,
+ *   requires_wiredtiger,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/core/timeseries/libs/timeseries.js");
+load("jstests/libs/analyze_plan.js");
+
+const conn = MongoRunner.runMongod({setParameter: {timeseriesBucketMaxCount: 2}});
+
+// Although this test is tagged with 'requires_wiredtiger', this is not sufficient for ensuring
+// that the parallel suite runs this test only on WT configurations.
+if (jsTest.options().storageEngine && jsTest.options().storageEngine !== "wiredTiger") {
+    jsTest.log("Skipping test on non-WT storage engine: " + jsTest.options().storageEngine);
+    MongoRunner.stopMongod(conn);
+    return;
+}
+
+const dbName = jsTestName();
+const testDB = conn.getDB(dbName);
+assert.commandWorked(testDB.dropDatabase());
+
+if (!TimeseriesTest.timeseriesCollectionsEnabled(testDB.getMongo())) {
+    jsTestLog("Skipping test because the time-series collection feature flag is disabled");
+    MongoRunner.stopMongod(conn);
+    return;
+}
+
+// In order to trigger the optimized sample path, we need at least 100 buckets in the buckets
+// collection.
+const nBuckets = 101;
+let bucketMaxCount = 2;
+let numDocs = nBuckets * bucketMaxCount;
+
+const coll = testDB.getCollection('timeseries_sample');
+const bucketsColl = testDB.getCollection("system.buckets." + coll.getName());
+
+coll.drop();
+
+const timeFieldName = "time";
+const metaFieldName = "m";
+assert.commandWorked(testDB.createCollection(
+    coll.getName(), {timeseries: {timeField: timeFieldName, metaField: metaFieldName}}));
+
+assert.contains(bucketsColl.getName(), testDB.getCollectionNames());
+
+for (let i = 0; i < numDocs; i++) {
+    let id = ObjectId();
+    assert.commandWorked(
+        coll.insert({_id: id, [timeFieldName]: ISODate(), [metaFieldName]: i % nBuckets, x: i}),
+        "failed to insert doc: " + id);
+}
+
+let buckets = bucketsColl.find().toArray();
+assert.eq(nBuckets, buckets.length, buckets);
+
+let assertUniqueDocuments = function(docs) {
+    let seen = new Set();
+    docs.forEach(doc => {
+        assert.eq(seen.has(doc._id), false);
+        seen.add(doc._id);
+    });
+};
+
+// Check the time-series view to make sure we have the correct number of docs and that there are no
+// duplicates after sampling.
+const viewDocs = coll.find({}, {x: 1}).toArray();
+assert.eq(numDocs, viewDocs.length, viewDocs);
+
+let sampleSize = 5;
+let result = coll.aggregate([{$sample: {size: sampleSize}}]).toArray();
+assert.eq(sampleSize, result.length, result);
+assertUniqueDocuments(result);
+
+// Check that we have absorbed $sample into $_internalUnpackBucket.
+const optimizedSamplePlan = coll.explain().aggregate([{$sample: {size: sampleSize}}]);
+let bucketStage = getAggPlanStage(optimizedSamplePlan, "$_internalUnpackBucket");
+assert.eq(bucketStage["$_internalUnpackBucket"]["sample"], sampleSize);
+assert(!aggPlanHasStage(optimizedSamplePlan, "$sample"));
+
+// Run an agg pipeline with optimization disabled.
+result = coll.aggregate([{$_internalInhibitOptimization: {}}, {$sample: {size: 1}}]).toArray();
+assert.eq(1, result.length, result);
+
+// Check that $sample is not absorbed when the sample size is large relative to the bucket count.
+sampleSize = 100;
+const unoptimizedSamplePlan = coll.explain().aggregate([{$sample: {size: sampleSize}}]);
+bucketStage = getAggPlanStage(unoptimizedSamplePlan, "$_internalUnpackBucket");
+assert.eq(bucketStage["$_internalUnpackBucket"]["sample"], undefined);
+assert(aggPlanHasStage(unoptimizedSamplePlan, "$sample"));
+
+const unoptimizedResult = coll.aggregate([{$sample: {size: sampleSize}}]).toArray();
+assertUniqueDocuments(unoptimizedResult);
+
+// Check that a sampleSize greater than the number of measurements doesn't cause an infinite loop.
+result = coll.aggregate([{$sample: {size: numDocs + 1}}]).toArray();
+assert.eq(numDocs, result.length, result);
+
+// Check that $lookup against a time-series collection doesn't cache inner pipeline results if it
+// contains a $sample stage.
+result =
+    coll.aggregate({$lookup: {from: coll.getName(), as: "docs", pipeline: [{$sample: {size: 1}}]}})
+        .toArray();
+
+// Each subquery should run an independent sample. Verify this by checking that we didn't sample
+// the same document repeatedly; it's sufficient for now to make sure that the seen set contains
+// at least two distinct samples.
+let seen = new Set();
+result.forEach(r => {
+    assert.eq(r.docs.length, 1);
+    seen.add(r.docs[0]._id);
+});
+assert.gte(seen.size, 2);
+
+MongoRunner.stopMongod(conn);
+})();
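For readers who want to observe the pushdown interactively, the core assertion of this test can be reproduced in a mongo shell session against a server built with this patch. The snippet below is a minimal sketch, not part of the commit: the database, collection, and field names are illustrative, it assumes the time-series feature flag is enabled, and it reuses getAggPlanStage()/aggPlanHasStage() from jstests/libs/analyze_plan.js just as the test does.

// Minimal sketch (assumed names): build a time-series collection with well over 100
// buckets, then confirm that a small $sample is absorbed into $_internalUnpackBucket.
load("jstests/libs/analyze_plan.js");

const demoDB = db.getSiblingDB("timeseries_sample_demo");
assert.commandWorked(demoDB.dropDatabase());
assert.commandWorked(
    demoDB.createCollection("ts", {timeseries: {timeField: "time", metaField: "m"}}));

// Measurements with distinct meta values land in distinct buckets, so this
// produces roughly 200 buckets, comfortably past the 100-bucket threshold.
for (let i = 0; i < 400; i++) {
    assert.commandWorked(demoDB.ts.insert({time: ISODate(), m: i % 200, x: i}));
}

const plan = demoDB.ts.explain().aggregate([{$sample: {size: 5}}]);
const stage = getAggPlanStage(plan, "$_internalUnpackBucket");
assert.eq(stage["$_internalUnpackBucket"]["sample"], 5);  // $sample was absorbed
assert(!aggPlanHasStage(plan, "$sample"));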