author     Mohammad Dashti <mdashti@gmail.com>               2021-05-10 22:25:46 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-05-10 23:05:44 +0000
commit     ef3c46f76af3b8e2ca92cc9f071885d5b49998fb
tree       f21f423437993b8fa8f1045fb4dcb30c1dbfecd6
parent     f92afd9e21ed21e0297329a7a9958142573be6ab
SERVER-53760 Improved document size approximation for spilling to disk
Co-authored-by: Mihai Andrei <mihai.andrei@10gen.com>
Diffstat (limited to 'jstests/noPassthrough/sort_spill_estimate_data_size.js')
 -rw-r--r--  jstests/noPassthrough/sort_spill_estimate_data_size.js | 73 +
 1 file changed, 73 insertions(+), 0 deletions(-)
diff --git a/jstests/noPassthrough/sort_spill_estimate_data_size.js b/jstests/noPassthrough/sort_spill_estimate_data_size.js
new file mode 100644
index 00000000000..0f9d3b4c625
--- /dev/null
+++ b/jstests/noPassthrough/sort_spill_estimate_data_size.js
@@ -0,0 +1,73 @@
+/**
+ * Test that the estimate of the total data size sorted when spilling to disk is reasonable.
+ *
+ * This test was originally designed to reproduce SERVER-53760.
+ */
+(function() {
+"use strict";
+load('jstests/libs/analyze_plan.js');  // For 'getAggPlanStages()'.
+
+const conn = MongoRunner.runMongod();
+assert.neq(null, conn, "mongod was unable to start up");
+const db = conn.getDB("test");
+
+const collName = jsTestName();
+const coll = db[collName];
+coll.drop();
+const numDocs = 5;
+const bigStrLen = numDocs * 40;
+const arrLen = numDocs * 40;
+
+// To reproduce SERVER-53760, we create a collection of N sizable documents, each containing a
+// large array field called `data`. When the collection is passed through a pipeline consisting of
+// `$unwind` (on `data`) followed by `$sort`, all of the documents produced by `$unwind` for a given
+// input document share that input document's backing BSON. If the sorter does not compute an
+// appropriate per-document size (by discarding the parts of the backing BSON that a given `$unwind`
+// output document does not use), its size estimate can be far larger than the amount of data
+// actually being sorted, which can cause it to (unnecessarily) open too many spill files and even
+// exhaust the number of open files allowed by some operating systems. In this example, the
+// overestimate would be roughly a factor of 100x.
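+//
+// Rough arithmetic for this test's shape (illustrative only): each inserted document holds
+// arrLen = 200 array elements, each carrying a bigStrLen = 200 character string, so its backing
+// BSON is on the order of 40KB, while each `$unwind` output document only uses one of those
+// elements (a few hundred bytes). Estimating an output document's size from its full backing BSON
+// therefore overcounts by roughly a factor of arrLen.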
+const docs = [];
+let totalSize = 0;
+const str = "a".repeat(bigStrLen);
+for (let i = 0; i < numDocs; ++i) {
+ let doc = {_id: i, foo: i * 2};
+ let arr = [];
+ for (let j = 0; j < arrLen; ++j) {
+ arr.push({bigString: str, uniqueValue: j});
+ }
+
+ doc["data"] = arr;
+ docs.push(doc);
+ totalSize += Object.bsonsize(doc);
+}
+
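+// Lower the memory limit for blocking sorts so that the $sort below cannot fit in memory and is
+// forced to spill to disk (the pipeline runs with allowDiskUse enabled).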
+assert.commandWorked(
+ db.adminCommand({setParameter: 1, internalQueryMaxBlockingSortMemoryUsageBytes: 5000}));
+assert.commandWorked(coll.insert(docs));
+
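+// The pipeline that reproduces the problem: $unwind produces 1000 small documents that share their
+// parent documents' backing BSON, $sort must spill under the lowered memory limit, and $limit plus
+// $group collapse the output into a single value with a known expected sum.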
+function createPipeline(collection) {
+ return collection.aggregate(
+ [
+ {$unwind: "$data"},
+ {$sort: {'_id': -1, 'data.val': -1}},
+ {$limit: 900},
+ {$group: {_id: 0, sumTop900UniqueValues: {$sum: '$data.uniqueValue'}}}
+ ],
+ {allowDiskUse: true});
+}
+
+const explain = createPipeline(coll.explain("executionStats"));
+const sortStages = getAggPlanStages(explain, "$sort");
+
+assert.eq(sortStages.length, 1, explain);
+const sort = sortStages[0];
+const dataBytesSorted = sort["totalDataSizeSortedBytesEstimate"];
+
+// The estimated total data size sorted should be less than 3x the total size of the inserted
+// documents (without the improved estimate, it would be on the order of 100x).
+assert.lt(dataBytesSorted, 3 * totalSize, explain);
+
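+// Sanity check the pipeline's output. The descending sort on `_id` followed by the limit of 900
+// keeps all 800 documents unwound from `_id` 4 through 1 plus 100 documents from `_id` 0; in
+// practice those 100 are the ones with `uniqueValue` 0..99 (ties on the missing `data.val` key
+// leave the `$unwind` order in place), so the expected sum is
+// 4 * (0 + 1 + ... + 199) + (0 + 1 + ... + 99) = 4 * 19900 + 4950 = 84550.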
+assert.eq(createPipeline(coll).toArray(), [{_id: 0, sumTop900UniqueValues: 84550}], explain);
+
+MongoRunner.stopMongod(conn);
+})();