summaryrefslogtreecommitdiff
path: root/jstests/aggregation/bugs/server21632.js
blob: c23d8836bea42ddd544ccd330adc01e02a219b7c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// Cannot implicitly shard accessed collections because the coll.stats() output from a mongod when
// run against a sharded collection is wrapped in a "shards" object with keys for each shard.
// @tags: [assumes_unsharded_collection]

// This test is designed to stress $sample, and any optimizations a storage engine might provide.
//
// A $sample stage as the first stage in a pipeline should ideally have a uniform distribution, so
// should at least have the following properties:
//   1. In a collection of N documents, we have a high probability of seeing at least N/4 distinct
//      documents after sampling N times.
//   2. We should not see any duplicate documents in any one $sample (this is only guaranteed if
//      there are no ongoing write operations).
(function() {
"use strict";

var coll = db.server21632;
coll.drop();

// If there is no collection, or no documents in the collection, we should not get any results
// from a sample.
assert.eq([], coll.aggregate([{$sample: {size: 1}}]).toArray());
assert.eq([], coll.aggregate([{$sample: {size: 10}}]).toArray());

db.createCollection(coll.getName());

// Test if we are running WT + LSM and if so, skip the test.
// WiredTiger LSM random cursor implementation doesn't currently give random enough
// distribution to pass this test case, so disable the test when checking an LSM
// configuration for now. We will need revisit this before releasing WiredTiger LSM
// as a supported file type. (See: WT-2403 for details on forthcoming changes)

var storageEngine = jsTest.options().storageEngine || "wiredTiger";

if (storageEngine === "wiredTiger" && coll.stats().wiredTiger.type === 'lsm') {
    return;
}

// An existing but empty collection should likewise yield no sample results.
assert.eq([], coll.aggregate([{$sample: {size: 1}}]).toArray());
assert.eq([], coll.aggregate([{$sample: {size: 10}}]).toArray());

// If there is only one document, we should get that document.
var paddingStr = "abcdefghijklmnopqrstuvwxyz";
var firstDoc = {_id: 0, paddingStr: paddingStr};
assert.writeOK(coll.insert(firstDoc));
assert.eq([firstDoc], coll.aggregate([{$sample: {size: 1}}]).toArray());
assert.eq([firstDoc], coll.aggregate([{$sample: {size: 10}}]).toArray());

// Insert a bunch of documents, so that _ids cover the range [0, nDocs).
var bulk = coll.initializeUnorderedBulkOp();
var nDocs = 1000;
for (var id = 1; id < nDocs; id++) {
    bulk.insert({_id: id, paddingStr: paddingStr});
}
// Verify the bulk insert succeeded; otherwise the distribution assertions below would fail
// confusingly against a partially-populated collection.
assert.writeOK(bulk.execute());

// Will contain a document's _id as a key if we've ever seen that document.
var cumulativeSeenIds = {};
var sampleSize = 10;

jsTestLog("About to do repeated samples, explain output: " +
          tojson(coll.explain().aggregate([{$sample: {size: sampleSize}}])));

// Repeatedly ask for small samples of documents to get a cumulative sample of size 'nDocs'.
for (var i = 0; i < nDocs / sampleSize; i++) {
    var results = coll.aggregate([{$sample: {size: sampleSize}}]).toArray();

    assert.eq(results.length, sampleSize, "$sample did not return the expected number of results");

    // Check that there are no duplicate documents in the result of any single sample.
    var idsThisSample = {};
    results.forEach(function recordId(result) {
        // Valid _ids are exactly the integers in [0, nDocs); anything else is a document we
        // never inserted. The original check used 'lte', which would wrongly accept
        // _id == nDocs and had no lower bound.
        assert.gte(result._id, 0, "$sample returned an unknown document");
        assert.lt(result._id, nDocs, "$sample returned an unknown document");
        assert(!idsThisSample[result._id],
               "A single $sample returned the same document twice: " + result._id);

        cumulativeSeenIds[result._id] = true;
        idsThisSample[result._id] = true;
    });
}

// An implementation would have to be very broken for this assertion to fail.
assert.gte(Object.keys(cumulativeSeenIds).length, nDocs / 4);

// Make sure we can return all documents in the collection.
assert.eq(coll.aggregate([{$sample: {size: nDocs}}]).toArray().length, nDocs);
})();