author     Mohammad Dashti <mdashti@gmail.com>  2021-05-10 22:25:46 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-05-10 23:05:44 +0000
commit     ef3c46f76af3b8e2ca92cc9f071885d5b49998fb (patch)
tree       f21f423437993b8fa8f1045fb4dcb30c1dbfecd6 /jstests/noPassthrough/sort_spill_estimate_data_size.js
parent     f92afd9e21ed21e0297329a7a9958142573be6ab (diff)
SERVER-53760 Improved document size approximation for spilling to disk
Co-authored-by: Mihai Andrei <mihai.andrei@10gen.com>
Diffstat (limited to 'jstests/noPassthrough/sort_spill_estimate_data_size.js')
-rw-r--r--  jstests/noPassthrough/sort_spill_estimate_data_size.js | 73
1 file changed, 73 insertions(+), 0 deletions(-)
diff --git a/jstests/noPassthrough/sort_spill_estimate_data_size.js b/jstests/noPassthrough/sort_spill_estimate_data_size.js
new file mode 100644
index 00000000000..0f9d3b4c625
--- /dev/null
+++ b/jstests/noPassthrough/sort_spill_estimate_data_size.js
@@ -0,0 +1,73 @@
+/**
+ * Test that the estimate of the total data size sorted when spilling to disk is reasonable.
+ *
+ * This test was originally designed to reproduce SERVER-53760.
+ */
+(function() {
+"use strict";
+load('jstests/libs/analyze_plan.js');  // For 'getAggPlanStages()'.
+
+const conn = MongoRunner.runMongod();
+assert.neq(null, conn, "mongod was unable to start up");
+const db = conn.getDB("test");
+
+const collName = jsTestName();
+const coll = db[collName];
+coll.drop();
+const numDocs = 5;
+const bigStrLen = numDocs * 40;
+const arrLen = numDocs * 40;
+
+// To reproduce SERVER-53760, we create a collection of N sizable documents, each containing an
+// array field named `data`, and run it through a pipeline consisting of a `$unwind` (on `data`)
+// followed by a `$sort`. Every document that `$unwind` emits shares the backing BSON of its
+// parent document in the original collection. If the sorter does not compute an appropriate
+// size for each such document (by discarding the parts of the backing BSON that the unwound
+// document does not reference), its size estimate can be far larger than the actual amount of
+// data sorted. That overestimate can make the sorter spill to unnecessarily many files, and can
+// even exhaust the number of open files some operating systems allow. In this example, the
+// overestimate would be a factor of roughly 100x.
+const docs = [];
+let totalSize = 0;
+const str = "a".repeat(bigStrLen);
+for (let i = 0; i < numDocs; ++i) {
+    let doc = {_id: i, foo: i * 2};
+    let arr = [];
+    for (let j = 0; j < arrLen; ++j) {
+        arr.push({bigString: str, uniqueValue: j});
+    }
+
+    doc["data"] = arr;
+    docs.push(doc);
+    totalSize += Object.bsonsize(doc);
+}
+
+assert.commandWorked(
+    db.adminCommand({setParameter: 1, internalQueryMaxBlockingSortMemoryUsageBytes: 5000}));
+assert.commandWorked(coll.insert(docs));
+
+function createPipeline(collection) {
+    return collection.aggregate(
+        [
+            {$unwind: "$data"},
+            {$sort: {'_id': -1, 'data.val': -1}},
+            {$limit: 900},
+            {$group: {_id: 0, sumTop900UniqueValues: {$sum: '$data.uniqueValue'}}}
+        ],
+        {allowDiskUse: true});
+}
+
+const explain = createPipeline(coll.explain("executionStats"));
+const sortStages = getAggPlanStages(explain, "$sort");
+
+assert.eq(sortStages.length, 1, explain);
+const sort = sortStages[0];
+const dataBytesSorted = sort["totalDataSizeSortedBytesEstimate"];
+
+// The total data size sorted should be no greater than 3x the total size of all documents sorted.
+assert.lt(dataBytesSorted, 3 * totalSize, explain);
+
+assert.eq(createPipeline(coll).toArray(), [{_id: 0, sumTop900UniqueValues: 84550}], explain);
+
+MongoRunner.stopMongod(conn);
+})();
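
To make the size-estimation gap described in the test's header comment concrete, here is a
minimal shell sketch, not part of the patch: it compares the BSON size of a single `$unwind`
output document against the size of its parent document. The collection name `sizeDemo` and the
200-element array are illustrative assumptions chosen to mirror the shape the test builds.

// Hypothetical demo; assumes a running mongod and a connected shell.
// 'sizeDemo' is an illustrative collection name, not one used by the patch.
const demo = db.sizeDemo;
demo.drop();
// One parent document with a 200-element array, mirroring the test's shape.
assert.commandWorked(
    demo.insert({_id: 0, data: Array.from({length: 200}, (_, j) => ({v: j}))}));

// Size of the full parent document vs. size of one $unwind output document.
const parentBytes = Object.bsonsize(demo.findOne());
const unwoundBytes = Object.bsonsize(demo.aggregate([{$unwind: "$data"}]).toArray()[0]);

// If the sorter charged each unwound document at the size of its backing BSON
// (the whole parent), the estimate would be inflated by roughly this factor.
print("parent: " + parentBytes + " bytes, unwound: " + unwoundBytes + " bytes, ratio: ~" +
      Math.round(parentBytes / unwoundBytes) + "x");

Summed across every unwound document, that per-document inflation is what produces the roughly
100x overestimate the comment describes. The test's assertion that `dataBytesSorted` stays below
`3 * totalSize` verifies the improved estimate discards the unreferenced parts of the backing BSON.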