summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCharlie Swanson <cswanson310@gmail.com>2015-10-23 16:04:42 -0400
committerCharlie Swanson <cswanson310@gmail.com>2015-10-23 17:46:09 -0400
commit9284edb11f0626c77017117ef02790881586aa26 (patch)
tree69bf0c8d5120085228b70f7e4b94bdd427e5d007
parentd4d8955022819016cca2d5555ddec69c12f8c9dc (diff)
downloadmongo-9284edb11f0626c77017117ef02790881586aa26.tar.gz
SERVER-20906 Add a random cursor implementation for mmap_v1 storage engine
-rw-r--r--src/mongo/db/exec/SConscript1
-rw-r--r--src/mongo/db/exec/index_iterator.cpp93
-rw-r--r--src/mongo/db/exec/index_iterator.h93
-rw-r--r--src/mongo/db/index/index_access_method.cpp5
-rw-r--r--src/mongo/db/index/index_access_method.h4
-rw-r--r--src/mongo/db/pipeline/pipeline_d.cpp48
-rw-r--r--src/mongo/db/query/stage_types.h4
-rw-r--r--src/mongo/db/storage/SConscript1
-rw-r--r--src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp53
-rw-r--r--src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp98
-rw-r--r--src/mongo/db/storage/mmap_v1/btree/btree_logic.h14
-rw-r--r--src/mongo/db/storage/sorted_data_interface.h18
-rw-r--r--src/mongo/db/storage/sorted_data_interface_test_harness.h26
-rw-r--r--src/mongo/db/storage/sorted_data_interface_test_rand_cursor.cpp246
14 files changed, 692 insertions, 12 deletions
diff --git a/src/mongo/db/exec/SConscript b/src/mongo/db/exec/SConscript
index bc4db81ddaa..dc8c83ae743 100644
--- a/src/mongo/db/exec/SConscript
+++ b/src/mongo/db/exec/SConscript
@@ -50,6 +50,7 @@ env.Library(
"geo_near.cpp",
"group.cpp",
"idhack.cpp",
+ "index_iterator.cpp",
"index_scan.cpp",
"keep_mutations.cpp",
"limit.cpp",
diff --git a/src/mongo/db/exec/index_iterator.cpp b/src/mongo/db/exec/index_iterator.cpp
new file mode 100644
index 00000000000..f12947b996d
--- /dev/null
+++ b/src/mongo/db/exec/index_iterator.cpp
@@ -0,0 +1,93 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/exec/index_iterator.h"
+
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/exec/scoped_timer.h"
+#include "mongo/db/index/index_access_method.h"
+#include "mongo/stdx/memory.h"
+
+namespace mongo {
+
+using std::unique_ptr;
+using stdx::make_unique;
+
+const char* IndexIteratorStage::kStageType = "INDEX_ITERATOR";
+
+IndexIteratorStage::IndexIteratorStage(OperationContext* txn,
+ WorkingSet* ws,
+ Collection* collection,
+ IndexAccessMethod* iam,
+ BSONObj keyPattern,
+ unique_ptr<SortedDataInterface::Cursor> cursor)
+ : PlanStage(kStageType, txn),
+ _collection(collection),
+ _ws(ws),
+ _iam(iam),
+ _cursor(std::move(cursor)),
+ _keyPattern(keyPattern.getOwned()) {
+ invariant(_collection); // It is illegal to use this stage without a collection.
+}
+
+PlanStage::StageState IndexIteratorStage::work(WorkingSetID* out) {
+ ++_commonStats.works;
+
+ // Adds the amount of time taken by work() to executionTimeMillis.
+ ScopedTimer timer(&_commonStats.executionTimeMillis);
+
+ if (auto entry = _cursor->next()) {
+ if (!entry->key.isOwned())
+ entry->key = entry->key.getOwned();
+
+ WorkingSetID id = _ws->allocate();
+ WorkingSetMember* member = _ws->get(id);
+ member->loc = entry->loc;
+ member->keyData.push_back(IndexKeyDatum(_keyPattern, entry->key, _iam));
+ _ws->transitionToLocAndIdx(id);
+
+ *out = id;
+ ++_commonStats.advanced;
+ return PlanStage::ADVANCED;
+ }
+
+ _commonStats.isEOF = true;
+ return PlanStage::IS_EOF;
+}
+
+bool IndexIteratorStage::isEOF() {
+ return _commonStats.isEOF;
+}
+
+unique_ptr<PlanStageStats> IndexIteratorStage::getStats() {
+ return make_unique<PlanStageStats>(_commonStats, STAGE_INDEX_ITERATOR);
+}
+
+} // namespace mongo
diff --git a/src/mongo/db/exec/index_iterator.h b/src/mongo/db/exec/index_iterator.h
new file mode 100644
index 00000000000..3adb1a33104
--- /dev/null
+++ b/src/mongo/db/exec/index_iterator.h
@@ -0,0 +1,93 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "mongo/db/exec/plan_stage.h"
+#include "mongo/db/exec/plan_stats.h"
+#include "mongo/db/storage/sorted_data_interface.h"
+
+namespace mongo {
+class Collection;
+
+/**
+ * A simple wrapper around a SortedDataInterface::Cursor, which will advance the cursor until it is
+ * exhausted.
+ */
+class IndexIteratorStage final : public PlanStage {
+public:
+ IndexIteratorStage(OperationContext* txn,
+ WorkingSet* ws,
+ Collection* collection,
+ IndexAccessMethod* iam,
+ BSONObj keyPattern,
+ std::unique_ptr<SortedDataInterface::Cursor> cursor);
+
+ PlanStage::StageState work(WorkingSetID* out) final;
+
+ bool isEOF() final;
+
+ void doSaveState() final {
+ _cursor->save();
+ }
+ void doRestoreState() final {
+ _cursor->restore();
+ }
+ void doDetachFromOperationContext() final {
+ _cursor->detachFromOperationContext();
+ }
+ void doReattachToOperationContext() final {
+ _cursor->reattachToOperationContext(getOpCtx());
+ }
+
+ std::unique_ptr<PlanStageStats> getStats() final;
+
+ // Not used.
+ SpecificStats* getSpecificStats() const final {
+ return nullptr;
+ }
+
+ StageType stageType() const final {
+ return STAGE_INDEX_ITERATOR;
+ }
+
+ static const char* kStageType;
+
+private:
+ Collection* _collection;
+ WorkingSet* _ws;
+ IndexAccessMethod* _iam; // owned by Collection -> IndexCatalog.
+
+ std::unique_ptr<SortedDataInterface::Cursor> _cursor;
+ const BSONObj _keyPattern;
+};
+
+} // namespace mongo
diff --git a/src/mongo/db/index/index_access_method.cpp b/src/mongo/db/index/index_access_method.cpp
index b40c2319a8a..d731f9cbfed 100644
--- a/src/mongo/db/index/index_access_method.cpp
+++ b/src/mongo/db/index/index_access_method.cpp
@@ -168,6 +168,11 @@ std::unique_ptr<SortedDataInterface::Cursor> IndexAccessMethod::newCursor(Operat
return _newInterface->newCursor(txn, isForward);
}
+std::unique_ptr<SortedDataInterface::Cursor> IndexAccessMethod::newRandomCursor(
+ OperationContext* txn) const {
+ return _newInterface->newRandomCursor(txn);
+}
+
// Remove the provided doc from the index.
Status IndexAccessMethod::remove(OperationContext* txn,
const BSONObj& obj,
diff --git a/src/mongo/db/index/index_access_method.h b/src/mongo/db/index/index_access_method.h
index 5dc88d30d80..df62c4ae937 100644
--- a/src/mongo/db/index/index_access_method.h
+++ b/src/mongo/db/index/index_access_method.h
@@ -123,6 +123,10 @@ public:
*/
std::unique_ptr<SortedDataInterface::Cursor> newCursor(OperationContext* txn,
bool isForward = true) const;
+ /**
+ * Returns a pseudo-random cursor over 'this' index.
+ */
+ std::unique_ptr<SortedDataInterface::Cursor> newRandomCursor(OperationContext* txn) const;
// ------ index level operations ------
diff --git a/src/mongo/db/pipeline/pipeline_d.cpp b/src/mongo/db/pipeline/pipeline_d.cpp
index 5005e2ccdb5..27eda2730f1 100644
--- a/src/mongo/db/pipeline/pipeline_d.cpp
+++ b/src/mongo/db/pipeline/pipeline_d.cpp
@@ -35,17 +35,21 @@
#include "mongo/db/catalog/database.h"
#include "mongo/db/catalog/document_validation.h"
#include "mongo/db/concurrency/write_conflict_exception.h"
+#include "mongo/db/exec/fetch.h"
+#include "mongo/db/exec/index_iterator.h"
#include "mongo/db/exec/multi_iterator.h"
-#include "mongo/db/exec/working_set.h"
#include "mongo/db/exec/shard_filter.h"
+#include "mongo/db/exec/working_set.h"
#include "mongo/db/db_raii.h"
#include "mongo/db/dbdirectclient.h"
+#include "mongo/db/index/index_access_method.h"
#include "mongo/db/pipeline/document_source.h"
#include "mongo/db/pipeline/pipeline.h"
#include "mongo/db/query/get_executor.h"
#include "mongo/db/query/query_planner.h"
#include "mongo/db/service_context.h"
#include "mongo/db/storage/record_store.h"
+#include "mongo/db/storage/sorted_data_interface.h"
#include "mongo/db/s/sharded_connection_info.h"
#include "mongo/db/s/sharding_state.h"
#include "mongo/s/chunk_version.h"
@@ -130,15 +134,43 @@ shared_ptr<PlanExecutor> createRandomCursorExecutor(Collection* collection,
if (sampleSize > numRecords * kMaxSampleRatioForRandCursor || numRecords <= 100)
return {};
- // Attempt to get a random cursor from the storage engine.
- auto randCursor = collection->getRecordStore()->getRandomCursor(txn);
-
- if (!randCursor)
- return {};
+ // Attempt to get a random cursor from the RecordStore. If the RecordStore does not support
+ // random cursors, attempt to get one from the _id index.
+ std::unique_ptr<RecordCursor> rsRandCursor = collection->getRecordStore()->getRandomCursor(txn);
auto ws = stdx::make_unique<WorkingSet>();
- auto stage = stdx::make_unique<MultiIteratorStage>(txn, ws.get(), collection);
- stage->addIterator(std::move(randCursor));
+ std::unique_ptr<PlanStage> stage;
+
+ if (rsRandCursor) {
+ stage = stdx::make_unique<MultiIteratorStage>(txn, ws.get(), collection);
+ static_cast<MultiIteratorStage*>(stage.get())->addIterator(std::move(rsRandCursor));
+
+ } else {
+ auto indexCatalog = collection->getIndexCatalog();
+ auto indexDescriptor = indexCatalog->findIdIndex(txn);
+
+ if (!indexDescriptor) {
+ // There was no _id index.
+ return {};
+ }
+
+ IndexAccessMethod* idIam = indexCatalog->getIndex(indexDescriptor);
+ auto idxRandCursor = idIam->newRandomCursor(txn);
+
+ if (!idxRandCursor) {
+ // Storage engine does not support any type of random cursor.
+ return {};
+ }
+
+ auto idxIterator = stdx::make_unique<IndexIteratorStage>(txn,
+ ws.get(),
+ collection,
+ idIam,
+ indexDescriptor->keyPattern(),
+ idxRandCursor.release());
+ stage = stdx::make_unique<FetchStage>(
+ txn, ws.get(), idxIterator.release(), nullptr, collection);
+ }
ShardingState* const shardingState = ShardingState::get(txn);
diff --git a/src/mongo/db/query/stage_types.h b/src/mongo/db/query/stage_types.h
index 35964248e48..29d16653f72 100644
--- a/src/mongo/db/query/stage_types.h
+++ b/src/mongo/db/query/stage_types.h
@@ -72,6 +72,10 @@ enum StageType {
STAGE_GROUP,
STAGE_IDHACK,
+
+ // Simple wrapper to iterate a SortedDataInterface::Cursor.
+ STAGE_INDEX_ITERATOR,
+
STAGE_IXSCAN,
STAGE_LIMIT,
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index d1470c8c04e..1215d5afbbd 100644
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -88,6 +88,7 @@ env.Library(
'sorted_data_interface_test_harness.cpp',
'sorted_data_interface_test_insert.cpp',
'sorted_data_interface_test_isempty.cpp',
+ 'sorted_data_interface_test_rand_cursor.cpp',
'sorted_data_interface_test_rollback.cpp',
'sorted_data_interface_test_spaceused.cpp',
'sorted_data_interface_test_touch.cpp',
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
index 9232d3f5910..2ae4b0e38bb 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
@@ -346,6 +346,59 @@ public:
return stdx::make_unique<Cursor>(txn, _btree.get(), isForward);
}
+ class RandomCursor final : public SortedDataInterface::Cursor {
+ public:
+ RandomCursor(OperationContext* txn, const BtreeLogic<OnDiskFormat>* btree)
+ : _txn(txn), _btree(btree) {}
+
+ boost::optional<IndexKeyEntry> next(RequestedInfo parts) override {
+ if (_btree->isEmpty(_txn)) {
+ return {};
+ }
+ return _btree->getRandomEntry(_txn);
+ }
+
+ void detachFromOperationContext() final {
+ _txn = nullptr;
+ }
+
+ void reattachToOperationContext(OperationContext* txn) final {
+ _txn = txn;
+ }
+
+ //
+ // Should never be called.
+ //
+ void setEndPosition(const BSONObj& key, bool inclusive) override {
+ MONGO_UNREACHABLE;
+ }
+ boost::optional<IndexKeyEntry> seek(const BSONObj& key,
+ bool inclusive,
+ RequestedInfo parts) override {
+ MONGO_UNREACHABLE;
+ }
+ boost::optional<IndexKeyEntry> seek(const IndexSeekPoint& seekPoint,
+ RequestedInfo parts) override {
+ MONGO_UNREACHABLE;
+ }
+
+ //
+ // May be called, but are no-ops.
+ //
+ void save() override {}
+ void saveUnpositioned() override {}
+ void restore() override {}
+
+ private:
+ OperationContext* _txn;
+ const BtreeLogic<OnDiskFormat>* const _btree;
+ };
+
+ virtual std::unique_ptr<SortedDataInterface::Cursor> newRandomCursor(
+ OperationContext* txn) const {
+ return stdx::make_unique<RandomCursor>(txn, _btree.get());
+ }
+
virtual Status initAsEmpty(OperationContext* txn) {
return _btree->initAsEmpty(txn);
}
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
index 11e31b3fce7..6e2d19d6bce 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
@@ -30,6 +30,9 @@
#include "mongo/platform/basic.h"
+#include <numeric>
+
+#include "mongo/db/client.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/storage/mmap_v1/btree/btree_logic.h"
@@ -1911,6 +1914,101 @@ BSONObj BtreeLogic<BtreeLayout>::getKey(OperationContext* txn,
}
template <class BtreeLayout>
+IndexKeyEntry BtreeLogic<BtreeLayout>::getRandomEntry(OperationContext* txn) const {
+ // To ensure a uniform distribution, all keys must have an equal probability of being selected.
+ // Specifically, a key from the root should have the same probability of being selected as a key
+ // from a leaf.
+ //
+ // Here we do a random walk until we get to a leaf, storing a random key from each bucket along
+ // the way down. Because the root is always present in the random walk, but any given leaf would
+ // seldom be seen, we assign weights to each key such that the key from the leaf is much more
+ // likely to be selected than the key from the root. These weights attempt to ensure each entry
+ // is equally likely to be selected and avoid bias towards the entries closer to the root.
+ //
+ // As a simplification, we treat all buckets in a given level as having the same number of
+ // children. While this is inaccurate if the tree isn't perfectly balanced or if key-size
+ // greatly varies, it is assumed to be good enough for this purpose.
+ invariant(!isEmpty(txn));
+ BucketType* root = getRoot(txn);
+
+ vector<int64_t> nKeysInLevel;
+ vector<FullKey> selectedKeys;
+
+ auto& prng = txn->getClient()->getPrng();
+
+ int nRetries = 0;
+ const int kMaxRetries = 5;
+ do {
+ // See documentation below for description of parameters.
+ recordRandomWalk(txn, &prng, root, 1, &nKeysInLevel, &selectedKeys);
+ } while (selectedKeys.empty() && nRetries++ < kMaxRetries);
+ massert(28826,
+ str::stream() << "index " << _indexName << " may be corrupt, please repair",
+ !selectedKeys.empty());
+
+ invariant(nKeysInLevel.size() == selectedKeys.size());
+ // Select a key from the random walk such that each key from the B-tree has an equal probability
+ // of being selected.
+ //
+ // Let N be the sum of 'nKeysInLevel'. That is, the total number of keys in the B-tree.
+ //
+ // On our walk down the tree, we selected exactly one key from each level of the B-tree, where
+ // 'selectedKeys[i]' came from the ith level of the tree. On any given level, each key has an
+ // equal probability of being selected. Specifically, a key on level i has a probability of
+ // 1/'nKeysInLevel[i]' of being selected as 'selectedKeys[i]'. Then if, given our selected keys,
+ // we choose to return 'selectedKeys[i]' with a probability of 'nKeysInLevel[i]'/N, that key
+ // will be returned with a probability of 1/'nKeysInLevel[i]' * 'nKeysInLevel[i]'/N = 1/N.
+ //
+ // So 'selectedKeys[i]' should have a probability of 'nKeysInLevel[i]'/N of being returned. We
+ // will do so by picking a random number X in the range [0, N). Then, if X is in the first
+ // 'nKeysInLevel[0]' numbers, we will return 'selectedKeys[0]'. If X is in the next
+ // 'nKeysInLevel[1]' numbers, we will return 'selectedKeys[1]', and so on.
+ int64_t choice = prng.nextInt64(std::accumulate(nKeysInLevel.begin(), nKeysInLevel.end(), 0));
+ for (size_t i = 0; i < nKeysInLevel.size(); i++) {
+ if (choice < nKeysInLevel[i]) {
+ return {selectedKeys[i].data.toBson(), selectedKeys[i].header.recordLoc.toRecordId()};
+ }
+ choice -= nKeysInLevel[i];
+ }
+ MONGO_UNREACHABLE;
+}
+
+/**
+ * Does a random walk through the tree, recording information about the walk along the way.
+ *
+ * 'nKeysInLevel' will be filled in such that 'nKeysInLevel[i]' is an approximation of the number of
+ * keys in the ith level of the B-tree.
+ *
+ * 'selectedKeys' will be filled in such that 'selectedKeys[i]' will be a pseudo-random key selected
+ * from the bucket we went through on the ith level of the B-tree.
+ */
+template <class BtreeLayout>
+void BtreeLogic<BtreeLayout>::recordRandomWalk(OperationContext* txn,
+ PseudoRandom* prng,
+ BucketType* curBucket,
+ int64_t nBucketsInCurrentLevel,
+ vector<int64_t>* nKeysInLevel,
+ vector<FullKey>* selectedKeys) const {
+ // Select a random key from this bucket, and record it.
+ int nKeys = curBucket->n;
+ int keyToReturn = prng->nextInt32(nKeys);
+ auto fullKey = getFullKey(curBucket, keyToReturn);
+ // If the key is not used, just skip this level.
+ if (fullKey.header.isUsed()) {
+ selectedKeys->push_back(std::move(fullKey));
+ nKeysInLevel->push_back(nBucketsInCurrentLevel * nKeys);
+ }
+
+ // Select a random child and descend (if there are any).
+ int nChildren = nKeys + 1;
+ int nextChild = prng->nextInt32(nChildren);
+ if (auto child = childForPos(txn, curBucket, nextChild)) {
+ recordRandomWalk(
+ txn, prng, child, nBucketsInCurrentLevel * nChildren, nKeysInLevel, selectedKeys);
+ }
+}
+
+template <class BtreeLayout>
Status BtreeLogic<BtreeLayout>::touch(OperationContext* txn) const {
return _recordStore->touch(txn, NULL);
}
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
index 3c742170bcd..4859b476723 100644
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
@@ -41,6 +41,7 @@
namespace mongo {
+class PseudoRandom;
class RecordStore;
class SavedCursorRegistry;
@@ -177,6 +178,12 @@ public:
BSONObj getKey(OperationContext* txn, const DiskLoc& bucketLoc, const int keyOffset) const;
+ /**
+ * Returns a pseudo-random element from the tree. It is an error to call this method if the tree
+ * is empty.
+ */
+ IndexKeyEntry getRandomEntry(OperationContext* txn) const;
+
DiskLoc getHead(OperationContext* txn) const {
return DiskLoc::fromRecordId(_headManager->getHead(txn));
}
@@ -541,6 +548,13 @@ private:
DiskLoc getRootLoc(OperationContext* txn) const;
+ void recordRandomWalk(OperationContext* txn,
+ PseudoRandom* prng,
+ BucketType* curBucket,
+ int64_t nBucketsInCurrentLevel,
+ std::vector<int64_t>* nKeysInLevel,
+ std::vector<FullKey>* selectedKeys) const;
+
//
// Data
//
diff --git a/src/mongo/db/storage/sorted_data_interface.h b/src/mongo/db/storage/sorted_data_interface.h
index d6b176f449e..9507339995a 100644
--- a/src/mongo/db/storage/sorted_data_interface.h
+++ b/src/mongo/db/storage/sorted_data_interface.h
@@ -355,6 +355,24 @@ public:
virtual std::unique_ptr<Cursor> newCursor(OperationContext* txn,
bool isForward = true) const = 0;
+ /**
+ * Constructs a cursor over an index that returns entries in a randomized order, and allows
+ * storage engines to provide a more efficient way to randomly sample a collection than
+ * MongoDB's default sampling methods, which are used when this method returns {}. Note if it is
+ * possible to implement RecordStore::getRandomCursor(), that method is preferred, as it will
+ * return the entire document, whereas this method will only return the index key and the
+ * RecordId, requiring an extra lookup.
+ *
+ * This method may be implemented using a pseudo-random walk over B-trees or a similar approach.
+ * Different cursors should return entries in a different order. Random cursors may return the
+ * same entry more than once and, as a result, may return more entries than exist in the index.
+ * Implementations should avoid obvious biases toward older, newer, larger smaller or other
+ * specific classes of entries.
+ */
+ virtual std::unique_ptr<Cursor> newRandomCursor(OperationContext* txn) const {
+ return {};
+ }
+
//
// Index creation
//
diff --git a/src/mongo/db/storage/sorted_data_interface_test_harness.h b/src/mongo/db/storage/sorted_data_interface_test_harness.h
index e90f6606e86..ddc34c6419b 100644
--- a/src/mongo/db/storage/sorted_data_interface_test_harness.h
+++ b/src/mongo/db/storage/sorted_data_interface_test_harness.h
@@ -36,6 +36,8 @@
#include "mongo/db/jsobj.h"
#include "mongo/db/operation_context_noop.h"
#include "mongo/db/record_id.h"
+#include "mongo/db/service_context.h"
+#include "mongo/db/service_context_noop.h"
#include "mongo/db/storage/sorted_data_interface.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/unowned_ptr.h"
@@ -84,14 +86,18 @@ class RecoveryUnit;
class HarnessHelper {
public:
- HarnessHelper() {}
- virtual ~HarnessHelper() = default;
+ HarnessHelper() : _serviceContext(), _client(_serviceContext.makeClient("hh")) {}
+ virtual ~HarnessHelper() {}
virtual std::unique_ptr<SortedDataInterface> newSortedDataInterface(bool unique) = 0;
virtual std::unique_ptr<RecoveryUnit> newRecoveryUnit() = 0;
- virtual std::unique_ptr<OperationContext> newOperationContext() {
- return stdx::make_unique<OperationContextNoop>(newRecoveryUnit().release());
+ std::unique_ptr<OperationContext> newOperationContext(Client* client) {
+ return stdx::make_unique<OperationContextNoop>(client, 1, newRecoveryUnit().release());
+ }
+
+ std::unique_ptr<OperationContext> newOperationContext() {
+ return newOperationContext(nullptr);
}
/**
@@ -101,6 +107,18 @@ public:
*/
std::unique_ptr<SortedDataInterface> newSortedDataInterface(
bool unique, std::initializer_list<IndexKeyEntry> toInsert);
+
+ Client* client() {
+ return _client.get();
+ }
+
+ ServiceContext* serviceContext() {
+ return &_serviceContext;
+ }
+
+private:
+ ServiceContextNoop _serviceContext;
+ ServiceContext::UniqueClient _client;
};
/**
diff --git a/src/mongo/db/storage/sorted_data_interface_test_rand_cursor.cpp b/src/mongo/db/storage/sorted_data_interface_test_rand_cursor.cpp
new file mode 100644
index 00000000000..123776e1d8c
--- /dev/null
+++ b/src/mongo/db/storage/sorted_data_interface_test_rand_cursor.cpp
@@ -0,0 +1,246 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/storage/sorted_data_interface_test_harness.h"
+
+#include <memory>
+
+#include "mongo/db/storage/sorted_data_interface.h"
+#include "mongo/unittest/unittest.h"
+
+using std::unique_ptr;
+using std::set;
+using std::string;
+using std::stringstream;
+
+namespace mongo {
+
+// A random iterator should never return any entries from an empty index.
+TEST(SortedDataInterface, GetRandomIteratorEmpty) {
+ unique_ptr<HarnessHelper> harnessHelper(newHarnessHelper());
+ unique_ptr<SortedDataInterface> sorted(harnessHelper->newSortedDataInterface(true));
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT(sorted->isEmpty(opCtx.get()));
+ }
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ auto cursor = sorted->newRandomCursor(opCtx.get());
+ if (!cursor) {
+ // newRandomCursor is not supported.
+ return;
+ }
+ ASSERT(!cursor->next());
+ }
+}
+
+// General test for 'randomness' of the cursor. With N entries in the index, we should see at least
+// N/4 distinct entries after iterating N - 1 times.
+TEST(SortedDataInterface, GetRandomIteratorNonEmpty) {
+ unique_ptr<HarnessHelper> harnessHelper(newHarnessHelper());
+ unique_ptr<SortedDataInterface> sorted(harnessHelper->newSortedDataInterface(false));
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT_EQUALS(0, sorted->numEntries(opCtx.get()));
+ }
+
+ // Insert enough entries to generate multiple levels of a tree.
+ const unsigned nToInsert = 5000;
+ RecordId rids[nToInsert];
+ for (unsigned i = 0; i < nToInsert; i++) {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ {
+ BSONObj data = BSON("" << i);
+
+ WriteUnitOfWork uow(opCtx.get());
+ rids[i] = RecordId(42, i * 2);
+ Status res = sorted->insert(opCtx.get(), data, rids[i], true);
+ ASSERT_OK(res);
+ uow.commit();
+ }
+ }
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT_EQUALS(nToInsert, sorted->numEntries(opCtx.get()));
+ }
+
+ set<RecordId> remain(rids, rids + nToInsert);
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ auto cursor = sorted->newRandomCursor(opCtx.get());
+ if (!cursor) {
+ // newRandomCursor is not supported.
+ return;
+ }
+
+ // Iterate documents and mark those visited, but let at least one remain.
+ for (unsigned i = 0; i < nToInsert - 1; i++) {
+ // Get a new cursor once in a while, shouldn't affect things.
+ if (i % (nToInsert / 8) == 0) {
+ cursor = sorted->newRandomCursor(opCtx.get());
+ }
+ remain.erase(cursor->next()->loc); // Can happen more than once per doc.
+ }
+ ASSERT(!remain.empty());
+ ASSERT(cursor->next());
+
+ // We should have at least visited a quarter of the items if we're any random at all
+ // The expected fraction of visited entrys is 62.3%.
+ ASSERT_LT(remain.size(), nToInsert * 3 / 4);
+ }
+}
+
+// With only a single entry in the index, we should always receive that entry via a random cursor.
+TEST(SortedDataInterface, GetRandomIteratorSingleton) {
+ unique_ptr<HarnessHelper> harnessHelper(newHarnessHelper());
+ unique_ptr<SortedDataInterface> sorted(harnessHelper->newSortedDataInterface(true));
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT(sorted->isEmpty(opCtx.get()));
+ }
+
+ // Insert one entry.
+ RecordId idToRetrieve(42, 0);
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ WriteUnitOfWork uow(opCtx.get());
+ Status res = sorted->insert(opCtx.get(), BSON("" << 0), idToRetrieve, false);
+ ASSERT_OK(res);
+ uow.commit();
+ }
+
+ // Double-check that the index has one entry in it now.
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT_EQ(1, sorted->numEntries(opCtx.get()));
+ }
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ auto cursor = sorted->newRandomCursor(opCtx.get());
+ if (!cursor) {
+ // newRandomCursor is not supported.
+ return;
+ }
+
+ // The random cursor should keep returning the single existing entry.
+ for (int i = 0; i < 10; i++) {
+ auto entry = cursor->next();
+ ASSERT_EQ(entry->loc, idToRetrieve);
+ }
+
+ // Check deattaching / reattaching.
+ cursor->save();
+ cursor->detachFromOperationContext();
+ auto newClient = harnessHelper->serviceContext()->makeClient(
+ "New client for SortedDataInterface::GetRandomIteratorSingleton");
+ auto newOpCtx(harnessHelper->newOperationContext(newClient.get()));
+ cursor->reattachToOperationContext(newOpCtx.get());
+ cursor->restore();
+
+ auto entry = cursor->next();
+ ASSERT_EQ(entry->loc, idToRetrieve);
+
+ // The random cursor should keep returning the single existing entry.
+ for (int i = 0; i < 10; i++) {
+ entry = cursor->next();
+ ASSERT_EQ(entry->loc, idToRetrieve);
+ }
+ }
+}
+
+// With enough samples, we should eventually have returned every document in the tree at least once.
+TEST(SortedDataInterface, GetRandomIteratorLargeDocs) {
+ unique_ptr<HarnessHelper> harnessHelper(newHarnessHelper());
+ unique_ptr<SortedDataInterface> sorted(harnessHelper->newSortedDataInterface(true));
+
+ // Seed the random number generator.
+ harnessHelper->client()->getPrng() = PseudoRandom(1234);
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT_EQUALS(0, sorted->numEntries(opCtx.get()));
+ }
+
+ // Insert enough entries to generate multiple levels of a tree.
+ const unsigned nToInsert = 200;
+ RecordId rids[nToInsert];
+ // Add a large string to the entries, to encourage more branching in the tree.
+ int strBytes = 500;
+ std::string keyStr('c', strBytes / sizeof('c'));
+ for (unsigned i = 0; i < nToInsert; i++) {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ {
+ BSONObj data = BSON("" << i << "" << keyStr);
+
+ WriteUnitOfWork uow(opCtx.get());
+ rids[i] = RecordId(42, i * 2);
+ Status res = sorted->insert(opCtx.get(), data, rids[i], false);
+ ASSERT_OK(res);
+ uow.commit();
+ }
+ }
+
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ ASSERT_EQUALS(nToInsert, sorted->numEntries(opCtx.get()));
+ }
+
+ set<RecordId> remain(rids, rids + nToInsert);
+ {
+ auto opCtx(harnessHelper->newOperationContext(harnessHelper->client()));
+ auto cursor = sorted->newRandomCursor(opCtx.get());
+ if (!cursor) {
+ // newRandomCursor is not supported.
+ return;
+ }
+
+ // Iterate the cursor enough times so that there should be a large probability of seeing all
+ // the documents.
+ for (unsigned i = 0; i < nToInsert * 15; i++) {
+ // Get a new cursor once in a while, shouldn't affect things.
+ if (i % (nToInsert / 8) == 0) {
+ cursor = sorted->newRandomCursor(opCtx.get());
+ }
+ remain.erase(cursor->next()->loc); // Can happen more than once per doc.
+ }
+ // By this point, we should have visited all entries.
+ ASSERT_EQ(remain.size(), 0UL);
+ }
+}
+
+} // namespace mongo