Diffstat (limited to 'src/mongo/db/storage')
-rw-r--r--  src/mongo/db/storage/capped_callback.h | 54
-rw-r--r--  src/mongo/db/storage/heap1/SConscript | 13
-rw-r--r--  src/mongo/db/storage/heap1/heap1_btree_impl.cpp | 18
-rw-r--r--  src/mongo/db/storage/heap1/heap1_btree_impl.h | 5
-rw-r--r--  src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp | 62
-rw-r--r--  src/mongo/db/storage/heap1/record_store_heap.cpp | 494
-rw-r--r--  src/mongo/db/storage/heap1/record_store_heap.h | 241
-rw-r--r--  src/mongo/db/storage/mmap_v1/SConscript | 119
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp | 133
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp | 266
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_interface.h | 50
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp | 2519
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_logic.h | 593
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp | 2207
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h | 380
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp | 247
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/btree_test_help.h | 154
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h | 54
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/key.cpp | 691
-rw-r--r--  src/mongo/db/storage/mmap_v1/btree/key.h | 130
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/hashtab.h | 180
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/index_details.cpp | 40
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/index_details.h | 69
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h | 74
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace.cpp | 49
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace.h | 92
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp | 244
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details.h | 229
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp | 333
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h | 109
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp | 225
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h | 111
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp | 205
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_index.h | 94
-rw-r--r--  src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp | 67
-rw-r--r--  src/mongo/db/storage/mmap_v1/dur_recover.cpp | 1
-rw-r--r--  src/mongo/db/storage/mmap_v1/extent.h | 2
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp | 27
-rw-r--r--  src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h | 2
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp | 974
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_base.h | 303
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp | 717
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped.h | 139
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp | 237
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h | 100
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp | 558
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp | 192
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h | 96
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp | 505
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple.h | 95
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp | 130
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h | 73
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp | 775
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp | 608
-rw-r--r--  src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h | 198
-rw-r--r--  src/mongo/db/storage/record_store.h | 291
-rw-r--r--  src/mongo/db/storage/rocks/rocks_btree_impl.cpp | 6
-rw-r--r--  src/mongo/db/storage/rocks/rocks_btree_impl.h | 4
-rw-r--r--  src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp | 6
-rw-r--r--  src/mongo/db/storage/sorted_data_interface.h | 200
60 files changed, 16723 insertions(+), 67 deletions(-)
diff --git a/src/mongo/db/storage/capped_callback.h b/src/mongo/db/storage/capped_callback.h
new file mode 100644
index 00000000000..59c23f9dab9
--- /dev/null
+++ b/src/mongo/db/storage/capped_callback.h
@@ -0,0 +1,54 @@
+// capped_callback.h
+
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+
+namespace mongo {
+
+ class OperationContext;
+
+    /**
+     * When a capped collection has to delete a document, it needs a way to tell the caller
+     * which document it is deleting, so the caller can unindex it or do any other cleanup.
+     * This interface provides that hook.
+     */
+ class CappedDocumentDeleteCallback {
+ public:
+ virtual ~CappedDocumentDeleteCallback(){}
+
+        /**
+         * Called just before the document at 'loc' is deleted because the capped
+         * collection is wrapping around and reclaiming space.
+         */
+ virtual Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) = 0;
+ };
+
+}
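
A typical implementer of this callback is the layer that owns the record store (the collection), which removes the doomed document from its indexes before the space is reclaimed. A minimal sketch, assuming a hypothetical ExampleCappedOwner class and unindexRecord() helper that are not part of this patch:

    #include "mongo/db/storage/capped_callback.h"

    namespace mongo {

        // Hypothetical owner that cleans up index entries just before the
        // capped store reclaims the document at 'loc'.
        class ExampleCappedOwner : public CappedDocumentDeleteCallback {
        public:
            virtual Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) {
                unindexRecord( txn, loc ); // stand-in for the caller's real cleanup
                return Status::OK();
            }
        private:
            void unindexRecord( OperationContext* txn, const DiskLoc& loc ) { /* ... */ }
        };

    } // namespace mongo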
diff --git a/src/mongo/db/storage/heap1/SConscript b/src/mongo/db/storage/heap1/SConscript
index 0b1a6db0383..feb9fcbb2d1 100644
--- a/src/mongo/db/storage/heap1/SConscript
+++ b/src/mongo/db/storage/heap1/SConscript
@@ -8,9 +8,20 @@ env.Library(
'heap1_engine.cpp',
],
LIBDEPS= [
+ 'heap_record_store',
'$BUILD_DIR/mongo/bson',
'$BUILD_DIR/mongo/db/catalog/collection_options',
- '$BUILD_DIR/mongo/db/structure/record_store',
+ '$BUILD_DIR/mongo/foundation',
+ ]
+ )
+
+env.Library(
+ target= 'heap_record_store',
+ source= [
+ 'record_store_heap.cpp'
+ ],
+ LIBDEPS= [
+ '$BUILD_DIR/mongo/bson',
'$BUILD_DIR/mongo/foundation',
]
)
diff --git a/src/mongo/db/storage/heap1/heap1_btree_impl.cpp b/src/mongo/db/storage/heap1/heap1_btree_impl.cpp
index 2d5ae2fc63b..9a2ec04417a 100644
--- a/src/mongo/db/storage/heap1/heap1_btree_impl.cpp
+++ b/src/mongo/db/storage/heap1/heap1_btree_impl.cpp
@@ -200,7 +200,7 @@ namespace {
return it->loc != loc;
}
- class Heap1BtreeBuilderImpl : public BtreeBuilderInterface {
+ class Heap1BtreeBuilderImpl : public SortedDataBuilderInterface {
public:
Heap1BtreeBuilderImpl(IndexSet* data, bool dupsAllowed)
: _data(data),
@@ -241,14 +241,14 @@ namespace {
bool _committed;
};
- class Heap1BtreeImpl : public BtreeInterface {
+ class Heap1BtreeImpl : public SortedDataInterface {
public:
Heap1BtreeImpl(const IndexCatalogEntry& info, IndexSet* data)
: _info(info),
_data(data)
{}
- virtual BtreeBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) {
+ virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) {
return new Heap1BtreeBuilderImpl(_data, dupsAllowed);
}
@@ -300,7 +300,7 @@ namespace {
return Status::OK();
}
- class ForwardCursor : public BtreeInterface::Cursor {
+ class ForwardCursor : public SortedDataInterface::Cursor {
public:
ForwardCursor(const IndexSet& data, OperationContext* txn)
: _txn(txn),
@@ -314,7 +314,7 @@ namespace {
return _it == _data.end();
}
- virtual bool pointsToSamePlaceAs(const BtreeInterface::Cursor& otherBase) const {
+ virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const {
const ForwardCursor& other = static_cast<const ForwardCursor&>(otherBase);
invariant(&_data == &other._data); // iterators over same index
return _it == other._it;
@@ -399,7 +399,7 @@ namespace {
};
// TODO see if this can share any code with ForwardIterator
- class ReverseCursor : public BtreeInterface::Cursor {
+ class ReverseCursor : public SortedDataInterface::Cursor {
public:
ReverseCursor(const IndexSet& data, OperationContext* txn)
: _txn(txn),
@@ -413,7 +413,7 @@ namespace {
return _it == _data.rend();
}
- virtual bool pointsToSamePlaceAs(const BtreeInterface::Cursor& otherBase) const {
+ virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const {
const ReverseCursor& other = static_cast<const ReverseCursor&>(otherBase);
invariant(&_data == &other._data); // iterators over same index
return _it == other._it;
@@ -512,7 +512,7 @@ namespace {
DiskLoc _savedLoc;
};
- virtual BtreeInterface::Cursor* newCursor(OperationContext* txn, int direction) const {
+ virtual SortedDataInterface::Cursor* newCursor(OperationContext* txn, int direction) const {
if (direction == 1)
return new ForwardCursor(*_data, txn);
@@ -533,7 +533,7 @@ namespace {
// IndexCatalogEntry argument taken by non-const pointer for consistency with other Btree
// factories. We don't actually modify it.
- BtreeInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut) {
+ SortedDataInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut) {
invariant(info);
invariant(dataInOut);
if (!*dataInOut) {
diff --git a/src/mongo/db/storage/heap1/heap1_btree_impl.h b/src/mongo/db/storage/heap1/heap1_btree_impl.h
index 72b38ce3696..7187dc589dc 100644
--- a/src/mongo/db/storage/heap1/heap1_btree_impl.h
+++ b/src/mongo/db/storage/heap1/heap1_btree_impl.h
@@ -28,17 +28,18 @@
#include <boost/shared_ptr.hpp>
-#include "mongo/db/structure/btree/btree_interface.h"
+#include "mongo/db/storage/sorted_data_interface.h"
#pragma once
namespace mongo {
+
class IndexCatalogEntry;
/**
* Caller takes ownership.
* All permanent data will be stored and fetch from dataInOut.
*/
- BtreeInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut);
+ SortedDataInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut);
} // namespace mongo
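
The comment above is the whole ownership contract: the caller owns the returned SortedDataInterface, while the index contents live in *dataInOut and outlive any particular interface object. A hedged sketch, where 'entry' stands for an IndexCatalogEntry* obtained elsewhere:

    boost::shared_ptr<void> data; // filled in on the first call, reused afterwards
    std::auto_ptr<SortedDataInterface> btree( getHeap1BtreeImpl( entry, &data ) );
    // Destroying 'btree' and calling getHeap1BtreeImpl( entry, &data ) again
    // yields a fresh interface over the same in-memory index.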
diff --git a/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp b/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp
index 53dea7f10c7..58e069d9863 100644
--- a/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp
+++ b/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp
@@ -1,32 +1,30 @@
-// heap1_database_catalog_entry.cpp
-
/**
-* Copyright (C) 2014 MongoDB Inc.
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Affero General Public License, version 3,
-* as published by the Free Software Foundation.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Affero General Public License for more details.
-*
-* You should have received a copy of the GNU Affero General Public License
-* along with this program. If not, see <http://www.gnu.org/licenses/>.
-*
-* As a special exception, the copyright holders give permission to link the
-* code of portions of this program with the OpenSSL library under certain
-* conditions as described in each individual source file and distribute
-* linked combinations including the program with the OpenSSL library. You
-* must comply with the GNU Affero General Public License in all respects for
-* all of the code used other than as permitted herein. If you modify file(s)
-* with this exception, you may extend this exception to your version of the
-* file(s), but you are not obligated to do so. If you do not wish to do so,
-* delete this exception statement from your version. If you delete this
-* exception statement from all source files in the program, then also delete
-* it in the license file.
-*/
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
#include "mongo/db/storage/heap1/heap1_database_catalog_entry.h"
@@ -43,7 +41,7 @@
#include "mongo/db/operation_context.h"
#include "mongo/db/storage/heap1/heap1_btree_impl.h"
#include "mongo/db/storage/heap1/heap1_recovery_unit.h"
-#include "mongo/db/structure/record_store_heap.h"
+#include "mongo/db/storage/heap1/record_store_heap.h"
namespace mongo {
@@ -159,14 +157,14 @@ namespace mongo {
index->headManager()->setHead(txn, DiskLoc(0xDEAD, 0xBEAF));
// When is a btree not a Btree? When it is a Heap1BtreeImpl!
- std::auto_ptr<BtreeInterface> btree(getHeap1BtreeImpl(index, &i->second->data));
+ std::auto_ptr<SortedDataInterface> btree(getHeap1BtreeImpl(index, &i->second->data));
#else
if (!i->second->rs)
i->second->rs.reset(new HeapRecordStore( index->descriptor()->indexName() ));
- std::auto_ptr<BtreeInterface> btree(
- BtreeInterface::getInterface(index->headManager(),
+ std::auto_ptr<SortedDataInterface> btree(
+ SortedDataInterface::getInterface(index->headManager(),
i->second->rs,
index->ordering(),
index->descriptor()->indexNamespace(),
diff --git a/src/mongo/db/storage/heap1/record_store_heap.cpp b/src/mongo/db/storage/heap1/record_store_heap.cpp
new file mode 100644
index 00000000000..e0578dc5c71
--- /dev/null
+++ b/src/mongo/db/storage/heap1/record_store_heap.cpp
@@ -0,0 +1,494 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/heap1/record_store_heap.h"
+
+namespace mongo {
+
+ //
+ // RecordStore
+ //
+
+ HeapRecordStore::HeapRecordStore(const StringData& ns,
+ bool isCapped,
+ int64_t cappedMaxSize,
+ int64_t cappedMaxDocs,
+ CappedDocumentDeleteCallback* cappedDeleteCallback)
+ : RecordStore(ns),
+ _isCapped(isCapped),
+ _cappedMaxSize(cappedMaxSize),
+ _cappedMaxDocs(cappedMaxDocs),
+ _cappedDeleteCallback(cappedDeleteCallback),
+ _dataSize(0),
+ _nextId(1) { // DiskLoc(0,0) isn't valid for records.
+
+ if (_isCapped) {
+ invariant(_cappedMaxSize > 0);
+ invariant(_cappedMaxDocs == -1 || _cappedMaxDocs > 0);
+ }
+ else {
+ invariant(_cappedMaxSize == -1);
+ invariant(_cappedMaxDocs == -1);
+ }
+ }
+
+ const char* HeapRecordStore::name() const { return "heap"; }
+
+ RecordData HeapRecordStore::dataFor( const DiskLoc& loc ) const {
+ return recordFor(loc)->toRecordData();
+ }
+
+ HeapRecordStore::HeapRecord* HeapRecordStore::recordFor(const DiskLoc& loc) const {
+ Records::const_iterator it = _records.find(loc);
+ invariant(it != _records.end());
+ return reinterpret_cast<HeapRecord*>(it->second.get());
+ }
+
+ void HeapRecordStore::deleteRecord(OperationContext* txn, const DiskLoc& loc) {
+ HeapRecord* rec = recordFor(loc);
+ _dataSize -= rec->netLength();
+ invariant(_records.erase(loc) == 1);
+ }
+
+ bool HeapRecordStore::cappedAndNeedDelete() const {
+ if (!_isCapped)
+ return false;
+
+ if (_dataSize > _cappedMaxSize)
+ return true;
+
+ if ((_cappedMaxDocs != -1) && (numRecords() > _cappedMaxDocs))
+ return true;
+
+ return false;
+ }
+
+ void HeapRecordStore::cappedDeleteAsNeeded(OperationContext* txn) {
+ while (cappedAndNeedDelete()) {
+ invariant(!_records.empty());
+
+ DiskLoc oldest = _records.begin()->first;
+
+ if (_cappedDeleteCallback)
+ uassertStatusOK(_cappedDeleteCallback->aboutToDeleteCapped(txn, oldest));
+
+ deleteRecord(txn, oldest);
+ }
+ }
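
Together with insertRecord() below, this is the whole capped-rollover story: allocateLoc() hands out monotonically increasing DiskLocs, so _records.begin() is always the oldest record, and it is evicted until both the size and document limits hold again. A hedged usage sketch (OperationContextNoop, from operation_context_noop.h, is used only to satisfy the interface):

    // A store capped at 3 documents rolls the oldest record out on the
    // fourth insert.
    HeapRecordStore rs( "test.capped", /*isCapped=*/true,
                        /*cappedMaxSize=*/4096, /*cappedMaxDocs=*/3 );
    OperationContextNoop txn;
    for ( int i = 0; i < 4; i++ ) {
        invariant( rs.insertRecord( &txn, "abc", 4, false ).isOK() );
    }
    invariant( rs.numRecords() == 3 ); // the first record was rolled over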
+
+ StatusWith<DiskLoc> HeapRecordStore::insertRecord(OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota) {
+ if (_isCapped && len > _cappedMaxSize) {
+ // We use dataSize for capped rollover and we don't want to delete everything if we know
+ // this won't fit.
+ return StatusWith<DiskLoc>(ErrorCodes::BadValue,
+ "object to insert exceeds cappedMaxSize");
+ }
+
+ // TODO padding?
+ const int lengthWithHeaders = len + HeapRecord::HeaderSize;
+ boost::shared_array<char> buf(new char[lengthWithHeaders]);
+ HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get());
+ rec->lengthWithHeaders() = lengthWithHeaders;
+ memcpy(rec->data(), data, len);
+
+ const DiskLoc loc = allocateLoc();
+ _records[loc] = buf;
+ _dataSize += len;
+
+ cappedDeleteAsNeeded(txn);
+
+ return StatusWith<DiskLoc>(loc);
+ }
+
+ StatusWith<DiskLoc> HeapRecordStore::insertRecord(OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota) {
+ const int len = doc->documentSize();
+ if (_isCapped && len > _cappedMaxSize) {
+ // We use dataSize for capped rollover and we don't want to delete everything if we know
+ // this won't fit.
+ return StatusWith<DiskLoc>(ErrorCodes::BadValue,
+ "object to insert exceeds cappedMaxSize");
+ }
+
+ // TODO padding?
+ const int lengthWithHeaders = len + HeapRecord::HeaderSize;
+ boost::shared_array<char> buf(new char[lengthWithHeaders]);
+ HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get());
+ rec->lengthWithHeaders() = lengthWithHeaders;
+ doc->writeDocument(rec->data());
+
+ const DiskLoc loc = allocateLoc();
+ _records[loc] = buf;
+ _dataSize += len;
+
+ cappedDeleteAsNeeded(txn);
+
+ return StatusWith<DiskLoc>(loc);
+ }
+
+ StatusWith<DiskLoc> HeapRecordStore::updateRecord(OperationContext* txn,
+ const DiskLoc& oldLocation,
+ const char* data,
+ int len,
+ bool enforceQuota,
+ UpdateMoveNotifier* notifier ) {
+ HeapRecord* oldRecord = recordFor( oldLocation );
+ int oldLen = oldRecord->netLength();
+
+        // If the length of the new data is <= the length of the old data, just
+        // memcpy into the old space.
+ if ( len <= oldLen) {
+ memcpy(oldRecord->data(), data, len);
+ _dataSize += len - oldLen;
+ return StatusWith<DiskLoc>(oldLocation);
+ }
+
+ if ( _isCapped ) {
+ return StatusWith<DiskLoc>( ErrorCodes::InternalError,
+ "failing update: objects in a capped ns cannot grow",
+ 10003 );
+ }
+
+ // If the length of the new data exceeds the size of the old Record, we need to allocate
+ // a new Record, and delete the old one
+
+ const int lengthWithHeaders = len + HeapRecord::HeaderSize;
+ boost::shared_array<char> buf(new char[lengthWithHeaders]);
+ HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get());
+ rec->lengthWithHeaders() = lengthWithHeaders;
+ memcpy(rec->data(), data, len);
+
+ _records[oldLocation] = buf;
+ _dataSize += len - oldLen;
+
+ cappedDeleteAsNeeded(txn);
+
+ return StatusWith<DiskLoc>(oldLocation);
+ }
+
+ Status HeapRecordStore::updateWithDamages( OperationContext* txn,
+ const DiskLoc& loc,
+                                               const char* damageSource,
+ const mutablebson::DamageVector& damages ) {
+ HeapRecord* rec = recordFor( loc );
+ char* root = rec->data();
+
+        // Apply each damage event directly to the record's data in place.
+ mutablebson::DamageVector::const_iterator where = damages.begin();
+ const mutablebson::DamageVector::const_iterator end = damages.end();
+ for( ; where != end; ++where ) {
+            const char* sourcePtr = damageSource + where->sourceOffset;
+ char* targetPtr = root + where->targetOffset;
+ std::memcpy(targetPtr, sourcePtr, where->size);
+ }
+
+ return Status::OK();
+ }
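
From the caller's side, a DamageVector is just a list of (sourceOffset, targetOffset, size) patches applied against one shared source buffer. A hedged sketch that overwrites two bytes of an existing record; 'rs', 'txn', and 'loc' are assumed from the surrounding context:

    // Overwrite bytes 4..5 of the record at 'loc' with "xy".
    mutablebson::DamageEvent event;
    event.sourceOffset = 0;  // where the bytes start in the source buffer
    event.targetOffset = 4;  // where they land inside the record
    event.size = 2;
    mutablebson::DamageVector damages;
    damages.push_back( event );
    invariant( rs.updateWithDamages( &txn, loc, "xy", damages ).isOK() );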
+
+ RecordIterator* HeapRecordStore::getIterator(OperationContext* txn,
+ const DiskLoc& start,
+ bool tailable,
+ const CollectionScanParams::Direction& dir) const {
+ if (tailable)
+ invariant(_isCapped && dir == CollectionScanParams::FORWARD);
+
+ if (dir == CollectionScanParams::FORWARD) {
+ return new HeapRecordIterator(txn, _records, *this, start, tailable);
+ }
+        else {
+            return new HeapRecordReverseIterator(txn, _records, *this, start);
+        }
+ }
+
+ RecordIterator* HeapRecordStore::getIteratorForRepair(OperationContext* txn) const {
+ // TODO maybe make different from HeapRecordIterator
+ return new HeapRecordIterator(txn, _records, *this);
+ }
+
+ std::vector<RecordIterator*> HeapRecordStore::getManyIterators(OperationContext* txn) const {
+ std::vector<RecordIterator*> out;
+ // TODO maybe find a way to return multiple iterators.
+ out.push_back(new HeapRecordIterator(txn, _records, *this));
+ return out;
+ }
+
+ Status HeapRecordStore::truncate(OperationContext* txn) {
+ _records.clear();
+ _dataSize = 0;
+ return Status::OK();
+ }
+
+ void HeapRecordStore::temp_cappedTruncateAfter(OperationContext* txn,
+ DiskLoc end,
+ bool inclusive) {
+ Records::iterator it = inclusive ? _records.lower_bound(end)
+ : _records.upper_bound(end);
+ while(it != _records.end()) {
+ _dataSize -= reinterpret_cast<HeapRecord*>(it->second.get())->netLength();
+ _records.erase(it++);
+ }
+ }
+
+ bool HeapRecordStore::compactSupported() const {
+ return false;
+ }
+ Status HeapRecordStore::compact(OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats) {
+ // TODO might be possible to do something here
+ invariant(!"compact not yet implemented");
+ }
+
+ Status HeapRecordStore::validate(OperationContext* txn,
+ bool full,
+ bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results,
+ BSONObjBuilder* output) const {
+ results->valid = true;
+ if (scanData && full) {
+ for (Records::const_iterator it = _records.begin(); it != _records.end(); ++it) {
+ HeapRecord* rec = reinterpret_cast<HeapRecord*>(it->second.get());
+ size_t dataSize;
+ const Status status = adaptor->validate(rec->toRecordData(), &dataSize);
+ if (!status.isOK()) {
+ results->valid = false;
+ results->errors.push_back("invalid object detected (see logs)");
+ log() << "Invalid object detected in " << _ns << ": " << status.reason();
+ }
+ }
+ }
+
+ output->appendNumber( "nrecords", _records.size() );
+
+ return Status::OK();
+
+ }
+
+ void HeapRecordStore::appendCustomStats( OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale ) const {
+        result->append( "note", "HeapRecordStore has no custom stats yet" );
+ }
+
+ Status HeapRecordStore::touch(OperationContext* txn, BSONObjBuilder* output) const {
+ if (output) {
+ output->append("numRanges", 1);
+ output->append("millis", 0);
+ }
+ return Status::OK();
+ }
+
+ Status HeapRecordStore::setCustomOption(
+ OperationContext* txn, const BSONElement& option, BSONObjBuilder* info) {
+ invariant(!"setCustomOption not yet implemented");
+ }
+
+ void HeapRecordStore::increaseStorageSize(OperationContext* txn, int size, bool enforceQuota) {
+ // unclear what this would mean for this class. For now, just error if called.
+ invariant(!"increaseStorageSize not yet implemented");
+ }
+
+ int64_t HeapRecordStore::storageSize(OperationContext* txn,
+ BSONObjBuilder* extraInfo,
+ int infoLevel) const {
+ // Note: not making use of extraInfo or infoLevel since we don't have extents
+ const int64_t recordOverhead = numRecords() * HeapRecord::HeaderSize;
+ return _dataSize + recordOverhead;
+ }
+
+ DiskLoc HeapRecordStore::allocateLoc() {
+ const int64_t id = _nextId++;
+ // This is a hack, but both the high and low order bits of DiskLoc offset must be 0, and the
+ // file must fit in 23 bits. This gives us a total of 30 + 23 == 53 bits.
+ invariant(id < (1LL << 53));
+ return DiskLoc(int(id >> 30), int((id << 1) & ~(1<<31)));
+ }
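
A few worked values make the split concrete (file = id >> 30, offset = (id << 1) & ~(1 << 31)); these follow directly from the expression above:

    // id = 1             -> DiskLoc( 0, 2 )          // first record ever allocated
    // id = (1 << 30) - 1 -> DiskLoc( 0, 0x7FFFFFFE ) // last offset fitting in file 0
    // id = (1 << 30)     -> DiskLoc( 1, 0 )          // rolls over into file 1
    // id = (1 << 53) - 1 -> DiskLoc( 0x7FFFFF, 0x7FFFFFFE ) // largest id: 23-bit file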
+
+ //
+ // Forward Iterator
+ //
+
+ HeapRecordIterator::HeapRecordIterator(OperationContext* txn,
+ const HeapRecordStore::Records& records,
+ const HeapRecordStore& rs,
+ DiskLoc start,
+ bool tailable)
+ : _txn(txn),
+ _tailable(tailable),
+ _lastLoc(minDiskLoc),
+ _killedByInvalidate(false),
+ _records(records),
+ _rs(rs) {
+ if (start.isNull()) {
+ _it = _records.begin();
+ }
+ else {
+ _it = _records.find(start);
+ invariant(_it != _records.end());
+ }
+ }
+
+ bool HeapRecordIterator::isEOF() {
+ return _it == _records.end();
+ }
+
+ DiskLoc HeapRecordIterator::curr() {
+ if (isEOF())
+ return DiskLoc();
+ return _it->first;
+ }
+
+ DiskLoc HeapRecordIterator::getNext() {
+ if (isEOF()) {
+ if (!_tailable)
+ return DiskLoc();
+
+ if (_records.empty())
+ return DiskLoc();
+
+ invariant(!_killedByInvalidate);
+
+ // recover to last returned record
+ invariant(!_lastLoc.isNull());
+ _it = _records.find(_lastLoc);
+ invariant(_it != _records.end());
+
+ if (++_it == _records.end())
+ return DiskLoc();
+ }
+
+ const DiskLoc out = _it->first;
+ ++_it;
+ if (_tailable && _it == _records.end())
+ _lastLoc = out;
+ return out;
+ }
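
The _lastLoc bookkeeping above is what lets a tailable cursor survive end-of-data: the next getNext() re-finds the last record returned and continues past it. A hedged sketch, reusing the capped 'rs' and 'txn' from the earlier sketch with at least one record already inserted:

    boost::scoped_ptr<RecordIterator> it(
        rs.getIterator( &txn, DiskLoc(), /*tailable=*/true,
                        CollectionScanParams::FORWARD ) );
    while ( !it->isEOF() ) {
        it->getNext();  // the final call records _lastLoc
    }
    rs.insertRecord( &txn, "more", 5, false );
    invariant( !it->getNext().isNull() ); // resumes just past _lastLoc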
+
+ void HeapRecordIterator::invalidate(const DiskLoc& loc) {
+ if (_rs.isCapped()) {
+ // Capped iterators die on invalidation rather than advancing.
+ if (isEOF()) {
+ if (_lastLoc == loc) {
+ _killedByInvalidate = true;
+ }
+ }
+ else if (_it->first == loc) {
+ _killedByInvalidate = true;
+ }
+
+ return;
+ }
+
+ if (_it != _records.end() && _it->first == loc)
+ ++_it;
+ }
+
+ void HeapRecordIterator::prepareToYield() {
+ }
+
+ bool HeapRecordIterator::recoverFromYield() {
+ return !_killedByInvalidate;
+ }
+
+ RecordData HeapRecordIterator::dataFor(const DiskLoc& loc) const {
+ return _rs.dataFor(loc);
+ }
+
+ //
+ // Reverse Iterator
+ //
+
+ HeapRecordReverseIterator::HeapRecordReverseIterator(OperationContext* txn,
+ const HeapRecordStore::Records& records,
+ const HeapRecordStore& rs,
+ DiskLoc start)
+ : _txn(txn),
+ _killedByInvalidate(false),
+ _records(records),
+ _rs(rs) {
+ if (start.isNull()) {
+ _it = _records.rbegin();
+ }
+        else {
+            HeapRecordStore::Records::const_iterator baseIt = _records.find(start);
+            invariant(baseIt != _records.end());
+            // A reverse_iterator dereferences the element *before* its base
+            // iterator, so advance the base to make 'start' the first record
+            // returned.
+            _it = HeapRecordStore::Records::const_reverse_iterator(++baseIt);
+        }
+ }
+
+ bool HeapRecordReverseIterator::isEOF() {
+ return _it == _records.rend();
+ }
+
+ DiskLoc HeapRecordReverseIterator::curr() {
+ if (isEOF())
+ return DiskLoc();
+ return _it->first;
+ }
+
+ DiskLoc HeapRecordReverseIterator::getNext() {
+ if (isEOF())
+ return DiskLoc();
+
+ const DiskLoc out = _it->first;
+ ++_it;
+ return out;
+ }
+
+ void HeapRecordReverseIterator::invalidate(const DiskLoc& loc) {
+ if (isEOF())
+ return;
+
+ if (_it->first == loc) {
+ if (_rs.isCapped()) {
+ // Capped iterators die on invalidation rather than advancing.
+ _killedByInvalidate = true;
+ return;
+ }
+ ++_it;
+ }
+ }
+
+ void HeapRecordReverseIterator::prepareToYield() {
+ }
+
+ bool HeapRecordReverseIterator::recoverFromYield() {
+ return !_killedByInvalidate;
+ }
+
+ RecordData HeapRecordReverseIterator::dataFor(const DiskLoc& loc) const {
+ return _rs.dataFor(loc);
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/heap1/record_store_heap.h b/src/mongo/db/storage/heap1/record_store_heap.h
new file mode 100644
index 00000000000..f4810b04972
--- /dev/null
+++ b/src/mongo/db/storage/heap1/record_store_heap.h
@@ -0,0 +1,241 @@
+// record_store_heap.h
+
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include <boost/shared_array.hpp>
+#include <map>
+
+#include "mongo/db/storage/capped_callback.h"
+#include "mongo/db/storage/record_store.h"
+
+namespace mongo {
+
+ class HeapRecordIterator;
+
+    /**
+     * A RecordStore that keeps all of its data in heap memory.
+     *
+     * @param cappedMaxSize required if isCapped; in this implementation the size
+     *        limit is enforced against dataSize().
+     */
+ class HeapRecordStore : public RecordStore {
+ public:
+ explicit HeapRecordStore(const StringData& ns,
+ bool isCapped = false,
+ int64_t cappedMaxSize = -1,
+ int64_t cappedMaxDocs = -1,
+ CappedDocumentDeleteCallback* cappedDeleteCallback = NULL);
+
+ virtual const char* name() const;
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+
+ virtual void deleteRecord( OperationContext* txn, const DiskLoc& dl );
+
+ virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota );
+
+ virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota );
+
+ virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn,
+ const DiskLoc& oldLocation,
+ const char* data,
+ int len,
+ bool enforceQuota,
+ UpdateMoveNotifier* notifier );
+
+ virtual Status updateWithDamages( OperationContext* txn,
+ const DiskLoc& loc,
+                                          const char* damageSource,
+ const mutablebson::DamageVector& damages );
+
+ virtual RecordIterator* getIterator( OperationContext* txn,
+ const DiskLoc& start,
+ bool tailable,
+ const CollectionScanParams::Direction& dir) const;
+
+ virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const;
+
+ virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const;
+
+ virtual Status truncate( OperationContext* txn );
+
+ virtual void temp_cappedTruncateAfter( OperationContext* txn, DiskLoc end, bool inclusive );
+
+ virtual bool compactSupported() const;
+ virtual Status compact( OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats );
+
+ virtual Status validate( OperationContext* txn,
+ bool full,
+ bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results, BSONObjBuilder* output ) const;
+
+ virtual void appendCustomStats( OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale ) const;
+
+ virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const;
+
+ virtual Status setCustomOption( OperationContext* txn,
+ const BSONElement& option,
+ BSONObjBuilder* info = NULL );
+
+ virtual void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota );
+
+ virtual int64_t storageSize( OperationContext* txn,
+ BSONObjBuilder* extraInfo = NULL,
+ int infoLevel = 0) const;
+
+ virtual long long dataSize() const { return _dataSize; }
+
+ virtual long long numRecords() const { return _records.size(); }
+
+ protected:
+ class HeapRecord {
+ public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+
+ int lengthWithHeaders() const { return _lengthWithHeaders; }
+ int& lengthWithHeaders() { return _lengthWithHeaders; }
+
+ const char* data() const { return _data; }
+ char* data() { return _data; }
+
+ int netLength() const { return _lengthWithHeaders - HeaderSize; }
+
+ RecordData toRecordData() const { return RecordData(_data, netLength()); }
+
+ private:
+ int _lengthWithHeaders;
+ char _data[4];
+ };
+
+ virtual HeapRecord* recordFor( const DiskLoc& loc ) const;
+
+ public:
+ //
+ // Not in RecordStore interface
+ //
+
+ typedef std::map<DiskLoc, boost::shared_array<char> > Records;
+
+ bool isCapped() const { return _isCapped; }
+ void setCappedDeleteCallback(CappedDocumentDeleteCallback* cb) { _cappedDeleteCallback = cb; }
+        int64_t cappedMaxDocs() const { invariant(_isCapped); return _cappedMaxDocs; }
+        int64_t cappedMaxSize() const { invariant(_isCapped); return _cappedMaxSize; }
+
+ private:
+ DiskLoc allocateLoc();
+ bool cappedAndNeedDelete() const;
+ void cappedDeleteAsNeeded(OperationContext* txn);
+
+ // TODO figure out a proper solution to metadata
+ const bool _isCapped;
+ const int64_t _cappedMaxSize;
+ const int64_t _cappedMaxDocs;
+ CappedDocumentDeleteCallback* _cappedDeleteCallback;
+ int64_t _dataSize;
+
+ Records _records;
+ int64_t _nextId;
+ };
+
+ class HeapRecordIterator : public RecordIterator {
+ public:
+ HeapRecordIterator(OperationContext* txn,
+ const HeapRecordStore::Records& records,
+ const HeapRecordStore& rs,
+ DiskLoc start = DiskLoc(),
+ bool tailable = false);
+
+ virtual bool isEOF();
+
+ virtual DiskLoc curr();
+
+ virtual DiskLoc getNext();
+
+ virtual void invalidate(const DiskLoc& dl);
+
+ virtual void prepareToYield();
+
+ virtual bool recoverFromYield();
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+
+ private:
+ OperationContext* _txn; // not owned
+ HeapRecordStore::Records::const_iterator _it;
+ bool _tailable;
+ DiskLoc _lastLoc; // only for restarting tailable
+ bool _killedByInvalidate;
+
+ const HeapRecordStore::Records& _records;
+ const HeapRecordStore& _rs;
+ };
+
+ class HeapRecordReverseIterator : public RecordIterator {
+ public:
+ HeapRecordReverseIterator(OperationContext* txn,
+ const HeapRecordStore::Records& records,
+ const HeapRecordStore& rs,
+ DiskLoc start = DiskLoc());
+
+ virtual bool isEOF();
+
+ virtual DiskLoc curr();
+
+ virtual DiskLoc getNext();
+
+ virtual void invalidate(const DiskLoc& dl);
+
+ virtual void prepareToYield();
+
+ virtual bool recoverFromYield();
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+
+ private:
+ OperationContext* _txn; // not owned
+ HeapRecordStore::Records::const_reverse_iterator _it;
+ bool _killedByInvalidate;
+
+ const HeapRecordStore::Records& _records;
+ const HeapRecordStore& _rs;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/SConscript b/src/mongo/db/storage/mmap_v1/SConscript
index 5f7ac5eabd2..11b6b06b3e7 100644
--- a/src/mongo/db/storage/mmap_v1/SConscript
+++ b/src/mongo/db/storage/mmap_v1/SConscript
@@ -1,6 +1,34 @@
Import("env")
env.Library(
+ target = 'storage_mmapv1',
+ source = [ "catalog/index_details.cpp",
+ "catalog/namespace.cpp",
+ "catalog/namespace_details.cpp",
+ "catalog/namespace_details_collection_entry.cpp",
+ "catalog/namespace_details_rsv1_metadata.cpp",
+ "catalog/namespace_index.cpp",
+ "data_file.cpp",
+ "durable_mapped_file.cpp",
+ "dur.cpp",
+ "durop.cpp",
+ "dur_writetodatafiles.cpp",
+ "dur_preplogbuffer.cpp",
+ "dur_commitjob.cpp",
+ "dur_recover.cpp",
+ "dur_journal.cpp",
+ "dur_recovery_unit.cpp",
+ "mmap_v1_database_catalog_entry.cpp",
+ "mmap_v1_engine.cpp",
+ "mmap_v1_extent_manager.cpp",
+ "repair_database.cpp",
+ ],
+ LIBDEPS = [
+ 'record_store_v1',
+ 'btree']
+ )
+
+env.Library(
target= 'extent',
source= [
'extent.cpp',
@@ -11,3 +39,94 @@ env.Library(
'$BUILD_DIR/mongo/foundation',
]
)
+
+env.Library(
+ target= 'record_store_v1',
+ source= [
+ 'record_store_v1_base.cpp',
+ 'record_store_v1_capped.cpp',
+ 'record_store_v1_capped_iterator.cpp',
+ 'record_store_v1_repair_iterator.cpp',
+ 'record_store_v1_simple.cpp',
+ 'record_store_v1_simple_iterator.cpp',
+ ],
+ LIBDEPS= [
+ 'extent',
+ '$BUILD_DIR/mongo/mongocommon', # for ProgressMeter
+ '$BUILD_DIR/mongo/db/commands/server_status_core',
+ ]
+ )
+
+env.Library(
+ target='record_store_v1_test_help',
+ source=['record_store_v1_test_help.cpp',
+ ],
+ LIBDEPS=[
+ 'record_store_v1'
+ ]
+ )
+
+env.CppUnitTest(target = 'namespace_test',
+ source = ['catalog/namespace_test.cpp'],
+ LIBDEPS = ['$BUILD_DIR/mongo/foundation'])
+
+env.CppUnitTest(
+ target='record_store_v1_simple_test',
+ source=['record_store_v1_simple_test.cpp',
+ ],
+ LIBDEPS=[
+ 'record_store_v1_test_help'
+ ]
+ )
+
+env.CppUnitTest(
+ target='record_store_v1_capped_test',
+ source=['record_store_v1_capped_test.cpp',
+ ],
+ LIBDEPS=[
+ 'record_store_v1_test_help'
+ ]
+ )
+
+env.Library(
+ target= 'btree',
+ source= [
+ 'btree/btree_logic.cpp',
+ 'btree/btree_interface.cpp',
+ 'btree/key.cpp'
+ ],
+ LIBDEPS= [
+ '$BUILD_DIR/mongo/bson'
+ ]
+ )
+
+env.Library(
+ target= 'btree_test_help',
+ source= [
+ 'btree/btree_test_help.cpp'
+ ],
+ LIBDEPS= [
+ 'btree',
+ '$BUILD_DIR/mongo/mongocommon', # for ProgressMeter
+ '$BUILD_DIR/mongo/db/storage/mmap_v1/record_store_v1_test_help',
+ '$BUILD_DIR/mongo/db/storage/heap1/heap_record_store' # XXX?
+ ]
+ )
+
+env.CppUnitTest(
+ target='btree_logic_test',
+ source=['btree/btree_logic_test.cpp'
+ ],
+ LIBDEPS=[
+ 'btree_test_help'
+ ]
+ )
+
+env.CppUnitTest(
+ target='btree_builder_test',
+ source=['btree/btree_builder_test.cpp'
+ ],
+ LIBDEPS=[
+ 'btree_test_help'
+ ]
+ )
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp
new file mode 100644
index 00000000000..89d2ffc4d98
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp
@@ -0,0 +1,133 @@
+// btree_builder_test.cpp : Btree builder unit test
+
+/**
+ * Copyright (C) 2014 MongoDB
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+// This file contains simple tests to check the Btree builder logic,
+// including handling of interruptions.
+
+#include "mongo/db/instance.h"
+#include "mongo/db/operation_context_noop.h"
+#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+
+ class MockOperationContextKillable : public OperationContextNoop {
+ public:
+ MockOperationContextKillable()
+ : _killPending(false) {
+ }
+
+ virtual void checkForInterrupt(bool heedMutex = true) const {
+ if (_killPending) {
+ throw UserException(ErrorCodes::Interrupted, "interrupted");
+ }
+ }
+
+ virtual void kill() {
+ _killPending = true;
+ }
+
+ private:
+ bool _killPending;
+ };
+
+ /**
+ * Builder::commit() is interrupted if there is a request to kill the current operation.
+ */
+ template<class OnDiskFormat>
+ class InterruptCommit {
+ public:
+ typedef typename BtreeLogic<OnDiskFormat>::Builder Builder;
+
+ InterruptCommit( bool mayInterrupt ) :
+ _mayInterrupt( mayInterrupt ),
+ _helper(BSON( "a" << 1 )) {
+ }
+
+ void run() {
+ // Create a btree builder.
+ MockOperationContextKillable txn;
+ Builder* builder = _helper.btree.newBuilder(&txn, false);
+
+ // Add some keys to the builder, in order. We need enough keys to build an internal
+ // node in order to check for an interrupt.
+ int32_t nKeys = 1000;
+ for( int32_t i = 0; i < nKeys; ++i ) {
+ BSONObj key = BSON( "a" << i );
+ builder->addKey( key, /* dummy location */ DiskLoc() );
+ }
+
+ // The root of the index has not yet been set.
+ ASSERT( _helper.headManager.getHead().isNull() );
+ // Register a request to kill the current operation.
+ txn.kill();
+ if ( _mayInterrupt ) {
+ // Call commit on the builder, which will be aborted due to the kill request.
+ ASSERT_THROWS( builder->commit( _mayInterrupt ), UserException );
+ // The root of the index is not set because commit() did not complete.
+ ASSERT( _helper.headManager.getHead().isNull() );
+ }
+ else {
+ // Call commit on the builder, which will not be aborted because mayInterrupt is
+ // false.
+ builder->commit( _mayInterrupt );
+ // The root of the index is set because commit() completed.
+ ASSERT( !_helper.headManager.getHead().isNull() );
+ }
+ }
+
+ private:
+ bool _mayInterrupt;
+ BtreeLogicTestHelper<OnDiskFormat> _helper;
+ };
+
+
+ //
+ // TEST SUITE DEFINITION
+ //
+
+ template<class OnDiskFormat>
+ class BtreeBuilderTestSuite : public unittest::Suite {
+ public:
+ BtreeBuilderTestSuite(const std::string& name) : Suite(name) {
+
+ }
+
+ void setupTests() {
+
+ add< InterruptCommit<OnDiskFormat> >( false );
+ add< InterruptCommit<OnDiskFormat> >( true );
+ }
+ };
+
+ // Test suite for both V0 and V1
+ static BtreeBuilderTestSuite<BtreeLayoutV0> SUITE_V0("BtreeBuilderTests V0");
+ static BtreeBuilderTestSuite<BtreeLayoutV1> SUITE_V1("BtreeBuilderTests V1");
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
new file mode 100644
index 00000000000..6d2fae7bffa
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp
@@ -0,0 +1,266 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/sorted_data_interface.h"
+
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/mmap_v1/btree/btree_logic.h"
+
+
+namespace mongo {
+
+ template <class OnDiskFormat>
+ class BtreeBuilderInterfaceImpl : public SortedDataBuilderInterface {
+ public:
+ BtreeBuilderInterfaceImpl(OperationContext* trans,
+ typename BtreeLogic<OnDiskFormat>::Builder* builder)
+ : _builder(builder), _trans(trans) { }
+
+ virtual ~BtreeBuilderInterfaceImpl() { }
+
+ Status addKey(const BSONObj& key, const DiskLoc& loc) {
+ return _builder->addKey(key, loc);
+ }
+
+ unsigned long long commit(bool mayInterrupt) {
+ return _builder->commit(mayInterrupt);
+ }
+
+ private:
+ typename BtreeLogic<OnDiskFormat>::Builder* _builder;
+
+ // Not owned here.
+ OperationContext* _trans;
+ };
+
+ template <class OnDiskFormat>
+ class BtreeInterfaceImpl : public SortedDataInterface {
+ public:
+ BtreeInterfaceImpl(HeadManager* headManager,
+ RecordStore* recordStore,
+ const Ordering& ordering,
+ const string& indexName,
+ BucketDeletionNotification* bucketDeletionNotification) {
+
+ _btree.reset(new BtreeLogic<OnDiskFormat>(headManager,
+ recordStore,
+ ordering,
+ indexName,
+ bucketDeletionNotification));
+ }
+
+ virtual ~BtreeInterfaceImpl() { }
+
+ virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn,
+ bool dupsAllowed) {
+
+ return new BtreeBuilderInterfaceImpl<OnDiskFormat>(
+ txn, _btree->newBuilder(txn, dupsAllowed));
+ }
+
+ virtual Status insert(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc,
+ bool dupsAllowed) {
+
+ return _btree->insert(txn, key, loc, dupsAllowed);
+ }
+
+ virtual bool unindex(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) {
+
+ return _btree->unindex(txn, key, loc);
+ }
+
+ virtual void fullValidate(OperationContext* txn, long long *numKeysOut) {
+ *numKeysOut = _btree->fullValidate(txn, NULL, false, false, 0);
+ }
+
+ virtual Status dupKeyCheck(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) {
+ return _btree->dupKeyCheck(txn, key, loc);
+ }
+
+ virtual bool isEmpty() {
+ return _btree->isEmpty();
+ }
+
+        virtual Status touch(OperationContext* txn) const {
+ return _btree->touch(txn);
+ }
+
+ class Cursor : public SortedDataInterface::Cursor {
+ public:
+ Cursor(OperationContext* txn,
+ const BtreeLogic<OnDiskFormat>* btree,
+ int direction)
+ : _txn(txn),
+ _btree(btree),
+ _direction(direction),
+              _bucket(btree->getHead()), // XXX this shouldn't be necessary, but is.
+ _ofs(0) {
+ }
+
+ virtual int getDirection() const { return _direction; }
+
+ virtual bool isEOF() const { return _bucket.isNull(); }
+
+ virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const {
+ const Cursor& other = static_cast<const Cursor&>(otherBase);
+ if (isEOF())
+ return other.isEOF();
+
+ return _bucket == other._bucket && _ofs == other._ofs;
+
+ }
+
+ virtual void aboutToDeleteBucket(const DiskLoc& bucket) {
+ if (_bucket == bucket)
+ _ofs = -1;
+ }
+
+ virtual bool locate(const BSONObj& key, const DiskLoc& loc) {
+ return _btree->locate(_txn, key, loc, _direction, &_ofs, &_bucket);
+ }
+
+ virtual void customLocate(const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive) {
+
+ _btree->customLocate(_txn,
+ &_bucket,
+ &_ofs,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _direction);
+ }
+
+ void advanceTo(const BSONObj &keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive) {
+
+ _btree->advanceTo(_txn,
+ &_bucket,
+ &_ofs,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _direction);
+ }
+
+ virtual BSONObj getKey() const {
+ return _btree->getKey(_bucket, _ofs);
+ }
+
+ virtual DiskLoc getDiskLoc() const {
+ return _btree->getDiskLoc(_bucket, _ofs);
+ }
+
+ virtual void advance() {
+ _btree->advance(_txn, &_bucket, &_ofs, _direction);
+ }
+
+ virtual void savePosition() {
+ if (!_bucket.isNull()) {
+ _savedKey = getKey().getOwned();
+ _savedLoc = getDiskLoc();
+ }
+ }
+
+ virtual void restorePosition() {
+ if (!_bucket.isNull()) {
+ _btree->restorePosition(_txn,
+ _savedKey,
+ _savedLoc,
+ _direction,
+ &_bucket,
+ &_ofs);
+ }
+ }
+
+ private:
+ OperationContext* _txn; // not owned
+ const BtreeLogic<OnDiskFormat>* const _btree;
+ const int _direction;
+
+ DiskLoc _bucket;
+ int _ofs;
+
+ // Only used by save/restorePosition() if _bucket is non-Null.
+ BSONObj _savedKey;
+ DiskLoc _savedLoc;
+ };
+
+ virtual Cursor* newCursor(OperationContext* txn, int direction) const {
+ return new Cursor(txn, _btree.get(), direction);
+ }
+
+ virtual Status initAsEmpty(OperationContext* txn) {
+ return _btree->initAsEmpty(txn);
+ }
+
+ private:
+ scoped_ptr<BtreeLogic<OnDiskFormat> > _btree;
+ };
+
+ SortedDataInterface* getMMAPV1Interface(HeadManager* headManager,
+ RecordStore* recordStore,
+ const Ordering& ordering,
+ const string& indexName,
+ int version,
+ BucketDeletionNotification* bucketDeletion) {
+
+ if (0 == version) {
+ return new BtreeInterfaceImpl<BtreeLayoutV0>(headManager,
+ recordStore,
+ ordering,
+ indexName,
+ bucketDeletion);
+ }
+ else {
+ invariant(1 == version);
+ return new BtreeInterfaceImpl<BtreeLayoutV1>(headManager,
+ recordStore,
+ ordering,
+ indexName,
+ bucketDeletion);
+ }
+ }
+
+} // namespace mongo
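
A hedged sketch of wiring this up for a V1-format index; 'entry' (an IndexCatalogEntry*), 'recordStore', 'bucketNotifier', and 'txn' are assumed to come from the index catalog rather than from this file:

    std::auto_ptr<SortedDataInterface> btree(
        getMMAPV1Interface( entry->headManager(),                  // owns the root pointer
                            recordStore,                           // stores the buckets
                            entry->ordering(),
                            entry->descriptor()->indexNamespace(), // indexName
                            1,                                     // on-disk version: 0 or 1
                            bucketNotifier ) );
    uassertStatusOK( btree->initAsEmpty( txn ) );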
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h
new file mode 100644
index 00000000000..ad0d07b7ece
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/bson/ordering.h"
+#include "mongo/db/catalog/head_manager.h"
+#include "mongo/db/diskloc.h"
+#include "mongo/db/jsobj.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/record_store.h"
+#include "mongo/db/storage/sorted_data_interface.h"
+
+#pragma once
+
+namespace mongo {
+
+ class BucketDeletionNotification;
+
+ SortedDataInterface* getMMAPV1Interface(HeadManager* headManager,
+ RecordStore* recordStore,
+ const Ordering& ordering,
+ const string& indexName,
+ int version,
+ BucketDeletionNotification* bucketDeletion);
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
new file mode 100644
index 00000000000..93f802bc4a5
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp
@@ -0,0 +1,2519 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/diskloc.h"
+#include "mongo/db/jsobj.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/mmap_v1/btree/btree_logic.h"
+#include "mongo/db/storage/mmap_v1/btree/key.h"
+#include "mongo/db/storage/record_store.h"
+#include "mongo/util/log.h"
+#include "mongo/util/mongoutils/str.h"
+
+namespace mongo {
+
+ MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kIndexing);
+
+ //
+ // Public Builder logic
+ //
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::Builder*
+ BtreeLogic<BtreeLayout>::newBuilder(OperationContext* txn, bool dupsAllowed) {
+ return new Builder(this, txn, dupsAllowed);
+ }
+
+ template <class BtreeLayout>
+ BtreeLogic<BtreeLayout>::Builder::Builder(BtreeLogic* logic,
+ OperationContext* txn,
+ bool dupsAllowed)
+ : _logic(logic),
+ _dupsAllowed(dupsAllowed),
+ _numAdded(0),
+ _txn(txn) {
+
+ // XXX: Due to the way bulk building works, we may already have an empty root bucket that we
+ // must now dispose of. This isn't the case in some unit tests that use the Builder directly
+ // rather than going through an IndexAccessMethod.
+ DiskLoc oldHead = _logic->_headManager->getHead();
+ if (!oldHead.isNull()) {
+ _logic->_headManager->setHead(_txn, DiskLoc());
+ _logic->_recordStore->deleteRecord(_txn, oldHead);
+ }
+
+ _first = _cur = _logic->_addBucket(txn);
+ _b = _getModifiableBucket(_cur);
+ _committed = false;
+ }
+
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::Builder::addKey(const BSONObj& keyObj, const DiskLoc& loc) {
+ auto_ptr<KeyDataOwnedType> key(new KeyDataOwnedType(keyObj));
+
+ if (key->dataSize() > BtreeLayout::KeyMax) {
+ string msg = str::stream() << "Btree::insert: key too large to index, failing "
+ << _logic->_indexName
+ << ' ' << key->dataSize() << ' ' << key->toString();
+ log() << msg << endl;
+ return Status(ErrorCodes::KeyTooLong, msg);
+ }
+
+ // If we have a previous key to compare to...
+ if (_numAdded > 0) {
+ int cmp = _keyLast->woCompare(*key, _logic->_ordering);
+
+            // This should never happen: the builder expects keys to arrive in sorted order.
+ if (cmp > 0) {
+ return Status(ErrorCodes::InternalError, "Bad key order in btree builder");
+ }
+
+            // Duplicate keys, on the other hand, can legitimately occur here.
+ if (!_dupsAllowed && (cmp == 0)) {
+ return Status(ErrorCodes::DuplicateKey, _logic->dupKeyError(*_keyLast));
+ }
+ }
+
+ if (!_logic->_pushBack(_b, loc, *key, DiskLoc())) {
+ // bucket was full
+ newBucket();
+ _logic->pushBack(_b, loc, *key, DiskLoc());
+ }
+
+ _keyLast = key;
+ _numAdded++;
+ mayCommitProgressDurably();
+ return Status::OK();
+ }
+
+ template <class BtreeLayout>
+ unsigned long long BtreeLogic<BtreeLayout>::Builder::commit(bool mayInterrupt) {
+ buildNextLevel(_first, mayInterrupt);
+ _committed = true;
+ return _numAdded;
+ }
+
+ //
+ // Private Builder logic
+ //
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::Builder::newBucket() {
+ DiskLoc newBucketLoc = _logic->_addBucket(_txn);
+ _b->parent = newBucketLoc;
+ _cur = newBucketLoc;
+ _b = _getModifiableBucket(_cur);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::Builder::buildNextLevel(DiskLoc loc, bool mayInterrupt) {
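+        // Bottom-up bulk build: 'loc' is the leftmost bucket of the level that was just
+        // written. During the build, a bucket's 'parent' field temporarily chains it to
+        // its right sibling on the same level; each pass pops the last key of every
+        // bucket on the level up into a freshly allocated parent level, until a level
+        // fits in a single bucket, which becomes the head.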
+ for (;;) {
+ if (_getBucket(loc)->parent.isNull()) {
+ // only 1 bucket at this level. we are done.
+ _logic->_headManager->setHead(_txn, loc);
+ break;
+ }
+
+ DiskLoc upLoc = _logic->_addBucket(_txn);
+ DiskLoc upStart = upLoc;
+ BucketType* up = _getModifiableBucket(upLoc);
+
+ DiskLoc xloc = loc;
+ while (!xloc.isNull()) {
+ if (_txn->recoveryUnit()->commitIfNeeded()) {
+ _b = _getModifiableBucket(_cur);
+ up = _getModifiableBucket(upLoc);
+ }
+
+ if (mayInterrupt) {
+ _txn->checkForInterrupt();
+ }
+
+ BucketType* x = _getModifiableBucket(xloc);
+ KeyDataType k;
+ DiskLoc r;
+ _logic->popBack(x, &r, &k);
+ bool keepX = (x->n != 0);
+ DiskLoc keepLoc = keepX ? xloc : x->nextChild;
+
+ if (!_logic->_pushBack(up, r, k, keepLoc)) {
+ // current bucket full
+ DiskLoc n = _logic->_addBucket(_txn);
+ up->parent = n;
+ upLoc = n;
+ up = _getModifiableBucket(upLoc);
+ _logic->pushBack(up, r, k, keepLoc);
+ }
+
+ DiskLoc nextLoc = x->parent;
+ if (keepX) {
+ x->parent = upLoc;
+ }
+ else {
+ if (!x->nextChild.isNull()) {
+ DiskLoc ll = x->nextChild;
+ _getModifiableBucket(ll)->parent = upLoc;
+ }
+ _logic->deallocBucket(_txn, x, xloc);
+ }
+ xloc = nextLoc;
+ }
+
+ loc = upStart;
+ mayCommitProgressDurably();
+ }
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::Builder::mayCommitProgressDurably() {
+ if (_txn->recoveryUnit()->commitIfNeeded()) {
+ _b = _getModifiableBucket(_cur);
+ }
+ }
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::BucketType*
+ BtreeLogic<BtreeLayout>::Builder::_getModifiableBucket(DiskLoc loc) {
+ return _logic->btreemod(_txn, _logic->getBucket(loc));
+ }
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::BucketType*
+ BtreeLogic<BtreeLayout>::Builder::_getBucket(DiskLoc loc) {
+ return _logic->getBucket(loc);
+ }
+
+ //
+ // BtreeLogic logic
+ //
+
+ // static
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::FullKey
+ BtreeLogic<BtreeLayout>::getFullKey(const BucketType* bucket, int i) {
+        massert(13000,
+                (string)"invalid keyNode: " + BSON("i" << i << "n" << bucket->n).jsonString(),
+                i < bucket->n);
+ return FullKey(bucket, i);
+ }
+
+ // static
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::KeyHeaderType&
+ BtreeLogic<BtreeLayout>::getKeyHeader(BucketType* bucket, int i) {
+ return ((KeyHeaderType*)bucket->data)[i];
+ }
+
+ // static
+ template <class BtreeLayout>
+ const typename BtreeLogic<BtreeLayout>::KeyHeaderType&
+ BtreeLogic<BtreeLayout>::getKeyHeader(const BucketType* bucket, int i) {
+ return ((const KeyHeaderType*)bucket->data)[i];
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::markUnused(BucketType* bucket, int keyPos) {
+ invariant(keyPos >= 0 && keyPos < bucket->n);
+ getKeyHeader(bucket, keyPos).setUnused();
+ }
+
+ template <class BtreeLayout>
+ char* BtreeLogic<BtreeLayout>::dataAt(BucketType* bucket, short ofs) {
+ return bucket->data + ofs;
+ }
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::BucketType*
+ BtreeLogic<BtreeLayout>::btreemod(OperationContext* txn, BucketType* bucket) {
+ txn->recoveryUnit()->writingPtr(bucket, BtreeLayout::BucketSize);
+ return bucket;
+ }
+
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::totalDataSize(BucketType* bucket) {
+ return (int) (BtreeLayout::BucketSize - (bucket->data - (char*)bucket));
+ }
+
+ // We define this value as the maximum number of bytes such that, if we have
+ // fewer than this many bytes, we must be able to either merge with or receive
+ // keys from any neighboring node. If our utilization goes below this value we
+ // know we can bring up the utilization with a simple operation. Ignoring the
+ // 90/10 split policy which is sometimes employed and our 'unused' nodes, this
+ // is a lower bound on bucket utilization for non root buckets.
+ //
+ // Note that the exact value here depends on the implementation of
+ // _rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as
+ // follows: We know we cannot merge with the neighbor, so the total data size
+ // for us, the neighbor, and the separator must be at least
+ // BucketType::bodySize() + 1. We must be able to accept one key of any
+ // allowed size, so our size plus storage for that additional key must be
+ // <= BucketType::bodySize() / 2. This way, with the extra key we'll have a
+ // new bucket data size < half the total data size and by the implementation
+ // of _rebalancedSeparatorPos() the key must be added.
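+    //
+    // Worked example with purely illustrative numbers (not the real layout constants):
+    // if BucketBodySize were 8000, KeyMax 1024, and sizeof(KeyHeaderType) 16, then
+    // lowWaterMark() = 8000/2 - 1024 - 16 + 1 = 2961 bytes, i.e. a bucket packed below
+    // roughly 37% utilization can always be repaired by a single merge or rebalance.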
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::lowWaterMark() {
+ return BtreeLayout::BucketBodySize / 2 - BtreeLayout::KeyMax - sizeof(KeyHeaderType) + 1;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::init(BucketType* bucket) {
+ BtreeLayout::initBucket(bucket);
+ bucket->parent.Null();
+ bucket->nextChild.Null();
+ bucket->flags = Packed;
+ bucket->n = 0;
+ bucket->emptySize = totalDataSize(bucket);
+ bucket->topSize = 0;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::_unalloc(BucketType* bucket, int bytes) {
+ bucket->topSize -= bytes;
+ bucket->emptySize += bytes;
+ }
+
+ /**
+ * We allocate space from the end of the buffer for data. The keynodes grow from the front.
+ */
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::_alloc(BucketType* bucket, int bytes) {
+ invariant(bucket->emptySize >= bytes);
+ bucket->topSize += bytes;
+ bucket->emptySize -= bytes;
+ int ofs = totalDataSize(bucket) - bucket->topSize;
+ invariant(ofs > 0);
+ return ofs;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::setNotPacked(BucketType* bucket) {
+ bucket->flags &= ~Packed;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::setPacked(BucketType* bucket) {
+ bucket->flags |= Packed;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::_delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty) {
+ invariant(keypos >= 0 && keypos <= bucket->n);
+ invariant(childLocForPos(bucket, keypos).isNull());
+ invariant((mayEmpty && bucket->n > 0) || bucket->n > 1 || bucket->nextChild.isNull());
+
+ bucket->emptySize += sizeof(KeyHeaderType);
+ bucket->n--;
+
+ for (int j = keypos; j < bucket->n; j++) {
+ getKeyHeader(bucket, j) = getKeyHeader(bucket, j + 1);
+ }
+
+ setNotPacked(bucket);
+ }
+
+ /**
+ * Pull rightmost key from the bucket. This version requires its right child to be null so it
+ * does not bother returning that value.
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::popBack(BucketType* bucket,
+ DiskLoc* recordLocOut,
+ KeyDataType *keyDataOut) {
+
+ massert(17435, "n==0 in btree popBack()", bucket->n > 0 );
+
+ invariant(getKeyHeader(bucket, bucket->n - 1).isUsed());
+
+ FullKey kn = getFullKey(bucket, bucket->n - 1);
+ *recordLocOut = kn.recordLoc;
+ keyDataOut->assign(kn.data);
+ int keysize = kn.data.dataSize();
+
+ massert(17436, "rchild not null in btree popBack()", bucket->nextChild.isNull());
+
+        // Weirdly, we also put the rightmost down pointer in nextChild, even when the
+        // bucket isn't full.
+ bucket->nextChild = kn.prevChildBucket;
+ bucket->n--;
+ // This is risky because the key we are returning points to this unalloc'ed memory,
+ // and we are assuming that the last key points to the last allocated
+ // bson region.
+ bucket->emptySize += sizeof(KeyHeaderType);
+ _unalloc(bucket, keysize);
+ }
+
+ /**
+ * Add a key. Must be > all existing. Be careful to set next ptr right.
+ */
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::_pushBack(BucketType* bucket,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChild) {
+
+ int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType);
+ if (bytesNeeded > bucket->emptySize) {
+ return false;
+ }
+
+ if (bucket->n) {
+ const FullKey klast = getFullKey(bucket, bucket->n - 1);
+ if (klast.data.woCompare(key, _ordering) > 0) {
+ log() << "btree bucket corrupt? "
+ "consider reindexing or running validate command" << endl;
+ log() << " klast: " << klast.data.toString() << endl;
+ log() << " key: " << key.toString() << endl;
+ invariant(false);
+ }
+ }
+
+ bucket->emptySize -= sizeof(KeyHeaderType);
+ KeyHeaderType& kn = getKeyHeader(bucket, bucket->n++);
+ kn.prevChildBucket = prevChild;
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize()));
+ short ofs = kn.keyDataOfs();
+ char *p = dataAt(bucket, ofs);
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+ }
+
+ /**
+ * Durability note:
+ *
+ * We do separate intent declarations herein. Arguably one could just declare the whole bucket
+ * given we do group commits. This is something we could investigate later as to what is
+ * faster.
+ **/
+
+ /**
+ * Insert a key in a bucket with no complexity -- no splits required
+ * Returns false if a split is required.
+ */
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::basicInsert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int& keypos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc) {
+ invariant(bucket->n < 1024);
+ invariant(keypos >= 0 && keypos <= bucket->n);
+
+ int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType);
+ if (bytesNeeded > bucket->emptySize) {
+ _pack(txn, bucket, bucketLoc, keypos);
+ if (bytesNeeded > bucket->emptySize) {
+ return false;
+ }
+ }
+
+ invariant(getBucket(bucketLoc) == bucket);
+
+        {
+            // Declare that we will write to [k(keypos), k(n)].
+            char* start = reinterpret_cast<char*>(&getKeyHeader(bucket, keypos));
+            char* end = reinterpret_cast<char*>(&getKeyHeader(bucket, bucket->n + 1));
+            txn->recoveryUnit()->writingPtr(start, end - start);
+        }
+
+ // e.g. for n==3, keypos==2
+ // 1 4 9 -> 1 4 _ 9
+ for (int j = bucket->n; j > keypos; j--) {
+ getKeyHeader(bucket, j) = getKeyHeader(bucket, j - 1);
+ }
+
+ size_t writeLen = sizeof(bucket->emptySize) + sizeof(bucket->topSize) + sizeof(bucket->n);
+ txn->recoveryUnit()->writingPtr(&bucket->emptySize, writeLen);
+ bucket->emptySize -= sizeof(KeyHeaderType);
+ bucket->n++;
+
+ // This _KeyNode was marked for writing above.
+ KeyHeaderType& kn = getKeyHeader(bucket, keypos);
+ kn.prevChildBucket.Null();
+ kn.recordLoc = recordLoc;
+ kn.setKeyDataOfs((short) _alloc(bucket, key.dataSize()));
+ char *p = dataAt(bucket, kn.keyDataOfs());
+ txn->recoveryUnit()->writingPtr(p, key.dataSize());
+ memcpy(p, key.data(), key.dataSize());
+ return true;
+ }
+
+    /**
+     * With this implementation, passing refPos == 0 disables the refPos check entirely
+     * (index must be > 0, so it can never equal refPos). Requiring index > 0 prevents
+     * the creation of an empty bucket.
+     */
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::mayDropKey(BucketType* bucket, int index, int refPos) {
+ return index > 0
+ && (index != refPos)
+ && getKeyHeader(bucket, index).isUnused()
+ && getKeyHeader(bucket, index).prevChildBucket.isNull();
+ }
+
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::_packedDataSize(BucketType* bucket, int refPos) {
+ if (bucket->flags & Packed) {
+ return BtreeLayout::BucketSize - bucket->emptySize - BucketType::HeaderSize;
+ }
+
+ int size = 0;
+ for (int j = 0; j < bucket->n; ++j) {
+ if (mayDropKey(bucket, j, refPos)) {
+ continue;
+ }
+ size += getFullKey(bucket, j).data.dataSize() + sizeof(KeyHeaderType);
+ }
+
+ return size;
+ }
+
+ /**
+ * When we delete things, we just leave empty space until the node is full and then we repack
+ * it.
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::_pack(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc thisLoc,
+ int &refPos) {
+
+ invariant(getBucket(thisLoc) == bucket);
+
+ if (bucket->flags & Packed) {
+ return;
+ }
+
+ _packReadyForMod(btreemod(txn, bucket), refPos);
+ }
+
+ /**
+ * Version when write intent already declared.
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::_packReadyForMod(BucketType* bucket, int &refPos) {
+ if (bucket->flags & Packed) {
+ return;
+ }
+
+ int tdz = totalDataSize(bucket);
+ char temp[BtreeLayout::BucketSize];
+ int ofs = tdz;
+ bucket->topSize = 0;
+
+ int i = 0;
+ for (int j = 0; j < bucket->n; j++) {
+ if (mayDropKey(bucket, j, refPos)) {
+ // key is unused and has no children - drop it
+ continue;
+ }
+
+ if (i != j) {
+ if (refPos == j) {
+ // i < j so j will never be refPos again
+ refPos = i;
+ }
+ getKeyHeader(bucket, i) = getKeyHeader(bucket, j);
+ }
+
+ short ofsold = getKeyHeader(bucket, i).keyDataOfs();
+ int sz = getFullKey(bucket, i).data.dataSize();
+ ofs -= sz;
+ bucket->topSize += sz;
+ memcpy(temp + ofs, dataAt(bucket, ofsold), sz);
+ getKeyHeader(bucket, i).setKeyDataOfsSavingUse(ofs);
+ ++i;
+ }
+
+ if (refPos == bucket->n) {
+ refPos = i;
+ }
+
+ bucket->n = i;
+ int dataUsed = tdz - ofs;
+ memcpy(bucket->data + ofs, temp + ofs, dataUsed);
+
+ bucket->emptySize = tdz - dataUsed - bucket->n * sizeof(KeyHeaderType);
+        invariant(bucket->emptySize >= 0);
+ setPacked(bucket);
+ assertValid(_indexName, bucket, _ordering);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::truncateTo(BucketType* bucket,
+ int N,
+ int &refPos) {
+ bucket->n = N;
+ setNotPacked(bucket);
+ _packReadyForMod(bucket, refPos);
+ }
+
+ /**
+ * In the standard btree algorithm, we would split based on the
+ * existing keys _and_ the new key. But that's more work to
+ * implement, so we split the existing keys and then add the new key.
+ *
+ * There are several published heuristic algorithms for doing splits, but basically what you
+ * want are (1) even balancing between the two sides and (2) a small split key so the parent can
+ * have a larger branching factor.
+ *
+ * We just have a simple algorithm right now: if a key includes the halfway point (or 10% way
+ * point) in terms of bytes, split on that key; otherwise split on the key immediately to the
+ * left of the halfway point (or 10% point).
+ *
+ * This function is expected to be called on a packed bucket.
+ */
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::splitPos(BucketType* bucket, int keypos) {
+ invariant(bucket->n > 2);
+ int split = 0;
+ int rightSize = 0;
+
+ // When splitting a btree node, if the new key is greater than all the other keys, we should
+ // not do an even split, but a 90/10 split. see SERVER-983. TODO I think we only want to
+ // do the 90% split on the rhs node of the tree.
+ int rightSizeLimit = (bucket->topSize + sizeof(KeyHeaderType) * bucket->n)
+ / (keypos == bucket->n ? 10 : 2);
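+        // Illustration with made-up numbers: if keys + headers total 4000 bytes, a
+        // rightmost insert (keypos == n) caps the new right bucket at ~400 bytes
+        // (90/10), while any other insert caps it at ~2000 bytes (50/50).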
+
+ for (int i = bucket->n - 1; i > -1; --i) {
+ rightSize += getFullKey(bucket, i).data.dataSize() + sizeof(KeyHeaderType);
+ if (rightSize > rightSizeLimit) {
+ split = i;
+ break;
+ }
+ }
+
+ // safeguards - we must not create an empty bucket
+ if (split < 1) {
+ split = 1;
+ }
+ else if (split > bucket->n - 2) {
+ split = bucket->n - 2;
+ }
+
+ return split;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::reserveKeysFront(BucketType* bucket, int nAdd) {
+ invariant(bucket->emptySize >= int(sizeof(KeyHeaderType) * nAdd));
+ bucket->emptySize -= sizeof(KeyHeaderType) * nAdd;
+ for (int i = bucket->n - 1; i > -1; --i) {
+ getKeyHeader(bucket, i + nAdd) = getKeyHeader(bucket, i);
+ }
+ bucket->n += nAdd;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::setKey(BucketType* bucket,
+ int i,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChildBucket) {
+ KeyHeaderType &kn = getKeyHeader(bucket, i);
+ kn.recordLoc = recordLoc;
+ kn.prevChildBucket = prevChildBucket;
+ short ofs = (short) _alloc(bucket, key.dataSize());
+ kn.setKeyDataOfs(ofs);
+ char *p = dataAt(bucket, ofs);
+ memcpy(p, key.data(), key.dataSize());
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::dropFront(BucketType* bucket,
+ int nDrop,
+ int &refpos) {
+ for (int i = nDrop; i < bucket->n; ++i) {
+ getKeyHeader(bucket, i - nDrop) = getKeyHeader(bucket, i);
+ }
+ bucket->n -= nDrop;
+ setNotPacked(bucket);
+ _packReadyForMod(bucket, refpos );
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction) const {
+ pair<DiskLoc, int> unused;
+
+ customLocate(txn,
+ locInOut,
+ keyOfsInOut,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ direction,
+ unused);
+
+ skipUnusedKeys(txn, locInOut, keyOfsInOut, direction);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::advance(OperationContext* txn,
+ DiskLoc* bucketLocInOut,
+ int* posInOut,
+ int direction) const {
+
+ *bucketLocInOut = advance(txn, *bucketLocInOut, posInOut, direction);
+ skipUnusedKeys(txn, bucketLocInOut, posInOut, direction);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::skipUnusedKeys(OperationContext* txn,
+ DiskLoc* loc,
+ int* pos,
+ int direction) const {
+ while (!loc->isNull() && !keyIsUsed(*loc, *pos)) {
+ *loc = advance(txn, *loc, pos, direction);
+ }
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::advanceTo(OperationContext* txn,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const BSONObj &keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction) const {
+
+ advanceToImpl(txn,
+ thisLocInOut,
+ keyOfsInOut,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ direction);
+
+ skipUnusedKeys(txn, thisLocInOut, keyOfsInOut, direction);
+ }
+
+ /**
+ * find smallest/biggest value greater-equal/less-equal than specified
+ *
+ * starting thisLoc + keyOfs will be strictly less than/strictly greater than
+ * keyBegin/keyBeginLen/keyEnd
+ *
+ * All the direction checks below allowed me to refactor the code, but possibly separate forward
+ * and reverse implementations would be more efficient
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::advanceToImpl(OperationContext* txn,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const BSONObj &keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction) const {
+
+ BucketType* bucket = getBucket(*thisLocInOut);
+
+ int l, h;
+ bool dontGoUp;
+
+ if (direction > 0) {
+ l = *keyOfsInOut;
+ h = bucket->n - 1;
+ int cmpResult = customBSONCmp(getFullKey(bucket, h).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction);
+ dontGoUp = (cmpResult >= 0);
+ }
+ else {
+ l = 0;
+ h = *keyOfsInOut;
+ int cmpResult = customBSONCmp(getFullKey(bucket, l).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction);
+ dontGoUp = (cmpResult <= 0);
+ }
+
+ pair<DiskLoc, int> bestParent;
+
+ if (dontGoUp) {
+ // this comparison result assures h > l
+ if (!customFind(l,
+ h,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction,
+ thisLocInOut,
+ keyOfsInOut,
+ bestParent)) {
+ return;
+ }
+ }
+ else {
+ // go up parents until rightmost/leftmost node is >=/<= target or at top
+ while (!bucket->parent.isNull()) {
+ *thisLocInOut = bucket->parent;
+ bucket = getBucket(*thisLocInOut);
+
+ if (direction > 0) {
+ if (customBSONCmp(getFullKey(bucket, bucket->n - 1).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction) >= 0 ) {
+ break;
+ }
+ }
+ else {
+ if (customBSONCmp(getFullKey(bucket, 0).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction) <= 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ customLocate(txn,
+ thisLocInOut,
+ keyOfsInOut,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ direction,
+ bestParent);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction,
+ pair<DiskLoc, int>& bestParent) const {
+
+ BucketType* bucket = getBucket(*locInOut);
+
+ if (0 == bucket->n) {
+ *locInOut = DiskLoc();
+ return;
+ }
+
+ // go down until find smallest/biggest >=/<= target
+ for (;;) {
+ int l = 0;
+ int h = bucket->n - 1;
+
+ // +direction: 0, -direction: h
+ int z = (direction > 0) ? 0 : h;
+
+ // leftmost/rightmost key may possibly be >=/<= search key
+ int res = customBSONCmp(getFullKey(bucket, z).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction);
+
+
+ if (direction * res >= 0) {
+ DiskLoc next;
+ *keyOfsInOut = z;
+
+ if (direction > 0) {
+ dassert(z == 0);
+ next = getKeyHeader(bucket, 0).prevChildBucket;
+ }
+ else {
+ next = bucket->nextChild;
+ }
+
+ if (!next.isNull()) {
+ bestParent = pair<DiskLoc, int>(*locInOut, *keyOfsInOut);
+ *locInOut = next;
+ bucket = getBucket(*locInOut);
+ continue;
+ }
+ else {
+ return;
+ }
+ }
+
+ res = customBSONCmp(getFullKey(bucket, h - z).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction);
+
+ if (direction * res < 0) {
+ DiskLoc next;
+ if (direction > 0) {
+ next = bucket->nextChild;
+ }
+ else {
+ next = getKeyHeader(bucket, 0).prevChildBucket;
+ }
+
+ if (next.isNull()) {
+ // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc()
+ *locInOut = bestParent.first;
+ *keyOfsInOut = bestParent.second;
+ return;
+ }
+ else {
+ *locInOut = next;
+ bucket = getBucket(*locInOut);
+ continue;
+ }
+ }
+
+ if (!customFind(l,
+ h,
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ _ordering,
+ direction,
+ locInOut,
+ keyOfsInOut,
+ bestParent)) {
+ return;
+ }
+
+ bucket = getBucket(*locInOut);
+ }
+ }
+
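+    /**
+     * Binary searches the key range (low, high) of the current bucket for the boundary
+     * defined by keyBegin/keyEnd. Returns true when the search must continue in the child
+     * bucket now stored in *thisLocInOut (recording the best parent position seen so far
+     * in 'bestParent'), and false when *keyOfsInOut already holds the final position.
+     */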
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::customFind(int low,
+ int high,
+ const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ const Ordering& order,
+ int direction,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ pair<DiskLoc, int>& bestParent) const {
+
+ const BucketType* bucket = getBucket(*thisLocInOut);
+
+ for (;;) {
+ if (low + 1 == high) {
+ *keyOfsInOut = (direction > 0) ? high : low;
+ DiskLoc next = getKeyHeader(bucket, high).prevChildBucket;
+ if (!next.isNull()) {
+ bestParent = make_pair(*thisLocInOut, *keyOfsInOut);
+ *thisLocInOut = next;
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+
+ int middle = low + (high - low) / 2;
+
+ int cmp = customBSONCmp(getFullKey(bucket, middle).data.toBson(),
+ keyBegin,
+ keyBeginLen,
+ afterKey,
+ keyEnd,
+ keyEndInclusive,
+ order,
+ direction);
+
+ if (cmp < 0) {
+ low = middle;
+ }
+ else if (cmp > 0) {
+ high = middle;
+ }
+ else {
+ if (direction < 0) {
+ low = middle;
+ }
+ else {
+ high = middle;
+ }
+ }
+ }
+ }
+
+ /**
+ * NOTE: Currently the Ordering implementation assumes a compound index will not have more keys
+ * than an unsigned variable has bits. The same assumption is used in the implementation below
+ * with respect to the 'mask' variable.
+ *
+ * 'l' is a regular bsonobj
+ *
+ * 'rBegin' is composed partly of an existing bsonobj, and the remaining keys are taken from a
+ * vector of elements that frequently changes
+ *
+ * see https://jira.mongodb.org/browse/SERVER-371
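+     *
+     * Hypothetical example: with l = {a: 5, b: 2}, rBegin = {a: 5}, rBeginLen = 1, and
+     * rEnd/rEndInclusive supplying the bound b <= 1, the 'a' fields compare equal via
+     * rBegin, then b (2) is compared against the rEnd element (1) and the function
+     * returns a positive value.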
+ */
+ // static
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::customBSONCmp(const BSONObj& l,
+ const BSONObj& rBegin,
+ int rBeginLen,
+ bool rSup,
+ const vector<const BSONElement*>& rEnd,
+ const vector<bool>& rEndInclusive,
+ const Ordering& o,
+ int direction) const {
+        BSONObjIterator ll(l);
+        BSONObjIterator rr(rBegin);
+        vector<const BSONElement*>::const_iterator rr2 = rEnd.begin();
+        vector<bool>::const_iterator inc = rEndInclusive.begin();
+        unsigned mask = 1;
+
+        // Compare the fields present in both 'l' and 'rBegin'.
+        for (int i = 0; i < rBeginLen; ++i, mask <<= 1) {
+            BSONElement lll = ll.next();
+            BSONElement rrr = rr.next();
+            ++rr2;
+            ++inc;
+
+            int x = lll.woCompare(rrr, false);
+            if (o.descending(mask)) {
+                x = -x;
+            }
+            if (x != 0) {
+                return x;
+            }
+        }
+
+        if (rSup) {
+            return -direction;
+        }
+
+        // Compare any remaining fields of 'l' against the elements in 'rEnd'.
+        for (; ll.more(); mask <<= 1) {
+            BSONElement lll = ll.next();
+            BSONElement rrr = **rr2;
+            ++rr2;
+            int x = lll.woCompare(rrr, false);
+            if (o.descending(mask)) {
+                x = -x;
+            }
+            if (x != 0) {
+                return x;
+            }
+            if (!*inc) {
+                return -direction;
+            }
+            ++inc;
+        }
+
+        return 0;
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::exists(OperationContext* txn, const KeyDataType& key) const {
+ int position = 0;
+
+ // Find the DiskLoc
+ bool found;
+
+ DiskLoc bucket = _locate(txn, getRootLoc(), key, &position, &found, minDiskLoc, 1);
+
+ while (!bucket.isNull()) {
+ FullKey fullKey = getFullKey(getBucket(bucket), position);
+ if (fullKey.header.isUsed()) {
+ return fullKey.data.woEqual(key);
+ }
+ bucket = advance(txn, bucket, &position, 1);
+ }
+
+ return false;
+ }
+
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::dupKeyCheck(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) const {
+ KeyDataOwnedType theKey(key);
+ if (!wouldCreateDup(txn, theKey, loc)) {
+ return Status::OK();
+ }
+
+ return Status(ErrorCodes::DuplicateKey, dupKeyError(theKey));
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::wouldCreateDup(OperationContext* txn,
+ const KeyDataType& key,
+ const DiskLoc self) const {
+ int position;
+ bool found;
+
+ DiskLoc posLoc = _locate(txn, getRootLoc(), key, &position, &found, minDiskLoc, 1);
+
+ while (!posLoc.isNull()) {
+ FullKey fullKey = getFullKey(getBucket(posLoc), position);
+ if (fullKey.header.isUsed()) {
+ // TODO: we may not need fullKey.data until we know fullKey.header.isUsed() here
+ // and elsewhere.
+ if (fullKey.data.woEqual(key)) {
+ return fullKey.recordLoc != self;
+ }
+ break;
+ }
+
+ posLoc = advance(txn, posLoc, &position, 1);
+ }
+ return false;
+ }
+
+ template <class BtreeLayout>
+ string BtreeLogic<BtreeLayout>::dupKeyError(const KeyDataType& key) const {
+ stringstream ss;
+ ss << "E11000 duplicate key error ";
+ ss << "index: " << _indexName << " ";
+ ss << "dup key: " << key.toString();
+ return ss.str();
+ }
+
+ /**
+ * Find a key within this btree bucket.
+ *
+ * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the
+ * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, our
+ * performance is still good.
+ *
+     * errorIfDup: if the key exists (ignoring the recordLoc), return a DuplicateKey Status
+ *
+ * pos: for existing keys k0...kn-1.
+ * returns # it goes BEFORE. so key[pos-1] < key < key[pos]
+ * returns n if it goes after the last existing key.
+ * note result might be an Unused location!
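+     *
+     * For example (hypothetical locations): duplicate entries for the key {a: 1} stored
+     * at record locations 0:1000 and 0:2000 are ordered by those DiskLocs, so the search
+     * can seek to one exact (key, recordLoc) pair instead of scanning every duplicate.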
+ */
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::_find(OperationContext* txn,
+ BucketType* bucket,
+ const KeyDataType& key,
+ const DiskLoc& recordLoc,
+ bool errorIfDup,
+ int* keyPositionOut,
+ bool* foundOut) const {
+
+ // XXX: fix the ctor for DiskLoc56bit so we can just convert w/o assignment operator
+ LocType genericRecordLoc;
+ genericRecordLoc = recordLoc;
+
+ bool dupsChecked = false;
+
+ int low = 0;
+ int high = bucket->n - 1;
+ int middle = (low + high) / 2;
+
+ while (low <= high) {
+ FullKey fullKey = getFullKey(bucket, middle);
+ int cmp = key.woCompare(fullKey.data, _ordering);
+
+ // The key data is the same.
+ if (0 == cmp) {
+ // Found the key in this bucket. If we're checking for dups...
+ if (errorIfDup) {
+ if (fullKey.header.isUnused()) {
+                        // It's ok that the key is there if it is unused. We then need to
+                        // check that there are no other, used entries for this key. Since
+                        // it is very rare that we get here, we make no effort to make this
+                        // particularly fast.
+                        if (!dupsChecked) {
+                            // This check is expensive, so the flag ensures we do it at most
+                            // once even if the search visits further equal keys.
+ dupsChecked = true;
+ if (exists(txn, key)) {
+ if (wouldCreateDup(txn, key, genericRecordLoc)) {
+ return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000);
+ }
+ else {
+ return Status(ErrorCodes::UniqueIndexViolation, "FIXME");
+ }
+ }
+ }
+ }
+ else {
+ if (fullKey.recordLoc == recordLoc) {
+ return Status(ErrorCodes::UniqueIndexViolation, "FIXME");
+ }
+ else {
+ return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000);
+ }
+ }
+ }
+
+ // If we're here dup keys are allowed, or the key is a dup but unused.
+ LocType recordLocCopy = fullKey.recordLoc;
+
+ // We clear this bit so we can test equality without the used bit messing us up.
+ // XXX: document this
+ // XXX: kill this GETOFS stuff
+ recordLocCopy.GETOFS() &= ~1;
+
+ // Set 'cmp' to the comparison w/the DiskLoc and fall through below.
+ cmp = recordLoc.compare(recordLocCopy);
+ }
+
+ if (cmp < 0) {
+ high = middle - 1;
+ }
+ else if (cmp > 0) {
+ low = middle + 1;
+ }
+ else {
+ // Found it!
+ *keyPositionOut = middle;
+ *foundOut = true;
+ return Status::OK();
+ }
+
+ middle = (low + high) / 2;
+ }
+
+ // Not found.
+ *keyPositionOut = low;
+
+ // Some debugging checks.
+ if (low != bucket->n) {
+ wassert(key.woCompare(getFullKey(bucket, low).data, _ordering) <= 0);
+
+ if (low > 0) {
+ if (getFullKey(bucket, low - 1).data.woCompare(key, _ordering) > 0) {
+ DEV {
+ log() << key.toString() << endl;
+ log() << getFullKey(bucket, low - 1).data.toString() << endl;
+ }
+ wassert(false);
+ }
+ }
+ }
+
+ *foundOut = false;
+ return Status::OK();
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::delBucket(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+ invariant(bucketLoc != getRootLoc());
+
+ _bucketDeletion->aboutToDeleteBucket(bucketLoc);
+
+ BucketType* p = getBucket(bucket->parent);
+ int parentIdx = indexInParent(txn, bucket, bucketLoc);
+ *txn->recoveryUnit()->writing(&childLocForPos(p, parentIdx)) = DiskLoc();
+ deallocBucket(txn, bucket, bucketLoc);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::deallocBucket(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+ bucket->n = BtreeLayout::INVALID_N_SENTINEL;
+ bucket->parent.Null();
+ _recordStore->deleteRecord(txn, bucketLoc);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::restorePosition(OperationContext* txn,
+ const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ int direction,
+ DiskLoc* bucketLocInOut,
+ int* keyOffsetInOut) const {
+
+        // *keyOffsetInOut is -1 if the bucket was deleted. When buckets are deleted, the
+        // Btree calls a ClientCursor function that calls down to all btree buckets. Really,
+        // this deletion handling should be kept btree-internal. It will go away with finer
+        // grained locking: we can hold on to a bucket for as long as we need it.
+ if (-1 == *keyOffsetInOut) {
+ locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
+ return;
+ }
+
+ invariant(*keyOffsetInOut >= 0);
+
+ BucketType* bucket = getBucket(*bucketLocInOut);
+ invariant(bucket);
+ invariant(BtreeLayout::INVALID_N_SENTINEL != bucket->n);
+
+ if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
+ skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
+ return;
+ }
+
+ if (*keyOffsetInOut > 0) {
+ (*keyOffsetInOut)--;
+ if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
+ skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
+ return;
+ }
+ }
+
+ locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::_keyIsAt(const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ BucketType* bucket,
+ int keyPos) const {
+ if (keyPos >= bucket->n) {
+ return false;
+ }
+
+ FullKey key = getFullKey(bucket, keyPos);
+ if (!key.data.toBson().binaryEqual(savedKey)) {
+ return false;
+ }
+ return key.header.recordLoc == savedLoc;
+ }
+
+ /**
+ * May delete the bucket 'bucket' rendering 'bucketLoc' invalid.
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::delKeyAtPos(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int p) {
+ invariant(bucket->n > 0);
+ DiskLoc left = childLocForPos(bucket, p);
+ if (bucket->n == 1) {
+ if (left.isNull() && bucket->nextChild.isNull()) {
+ _delKeyAtPos(bucket, p);
+ if (isHead(bucket)) {
+ // we don't delete the top bucket ever
+ }
+ else {
+ if (!mayBalanceWithNeighbors(txn, bucket, bucketLoc)) {
+                        // An empty bucket is only allowed as a transient state. If
+ // there are no neighbors to balance with, we delete ourself.
+ // This condition is only expected in legacy btrees.
+ delBucket(txn, bucket, bucketLoc);
+ }
+ }
+ return;
+ }
+ deleteInternalKey(txn, bucket, bucketLoc, p);
+ return;
+ }
+
+ if (left.isNull()) {
+ _delKeyAtPos(bucket, p);
+ mayBalanceWithNeighbors(txn, bucket, bucketLoc);
+ }
+ else {
+ deleteInternalKey(txn, bucket, bucketLoc, p);
+ }
+ }
+
+ /**
+ * This function replaces the specified key (k) by either the prev or next key in the btree
+ * (k'). We require that k have either a left or right child. If k has a left child, we set k'
+ * to the prev key of k, which must be a leaf present in the left child. If k does not have a
+ * left child, we set k' to the next key of k, which must be a leaf present in the right child.
+ * When we replace k with k', we copy k' over k (which may cause a split) and then remove k'
+ * from its original location. Because k' is stored in a descendent of k, replacing k by k'
+ * will not modify the storage location of the original k', and we can easily remove k' from its
+ * original location.
+ *
+ * This function is only needed in cases where k has a left or right child; in other cases a
+ * simpler key removal implementation is possible.
+ *
+ * NOTE on noncompliant BtreeBuilder btrees: It is possible (though likely rare) for btrees
+ * created by BtreeBuilder to have k' that is not a leaf, see SERVER-2732. These cases are
+ * handled in the same manner as described in the "legacy btree structures" note below.
+ *
+ * NOTE on legacy btree structures: In legacy btrees, k' can be a nonleaf. In such a case we
+ * 'delete' k by marking it as an unused node rather than replacing it with k'. Also, k' may be
+ * a leaf but marked as an unused node. In such a case we replace k by k', preserving the key's
+ * unused marking. This function is only expected to mark a key as unused when handling a
+ * legacy btree.
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::deleteInternalKey(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos) {
+ DiskLoc lchild = childLocForPos(bucket, keypos);
+ DiskLoc rchild = childLocForPos(bucket, keypos + 1);
+ invariant(!lchild.isNull() || !rchild.isNull());
+ int advanceDirection = lchild.isNull() ? 1 : -1;
+ int advanceKeyOfs = keypos;
+ DiskLoc advanceLoc = advance(txn, bucketLoc, &advanceKeyOfs, advanceDirection);
+        // advanceLoc must be a descendant of bucketLoc, because bucketLoc has a
+        // child in the proper direction and all descendants of bucketLoc must be
+        // nonempty because they are not the root.
+ BucketType* advanceBucket = getBucket(advanceLoc);
+
+ if (!childLocForPos(advanceBucket, advanceKeyOfs).isNull()
+ || !childLocForPos(advanceBucket, advanceKeyOfs + 1).isNull()) {
+
+ markUnused(bucket, keypos);
+ return;
+ }
+
+ FullKey kn = getFullKey(advanceBucket, advanceKeyOfs);
+        // Because advanceLoc is a descendant of bucketLoc, updating bucketLoc will
+        // not affect packing or keys of advanceLoc, and kn will be stable
+        // during the following setInternalKey()
+ setInternalKey(txn, bucket, bucketLoc, keypos, kn.recordLoc, kn.data,
+ childLocForPos(bucket, keypos),
+ childLocForPos(bucket, keypos + 1));
+ delKeyAtPos(txn, btreemod(txn, advanceBucket), advanceLoc, advanceKeyOfs);
+ }
+
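+    /**
+     * Promotes the sole remaining child (nextChild) of an empty bucket into the
+     * bucket's place: the parent's child pointer (or the head, if this is the root)
+     * is redirected to nextChild and the empty bucket is deallocated.
+     */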
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::replaceWithNextChild(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+
+ invariant(bucket->n == 0 && !bucket->nextChild.isNull() );
+ if (bucket->parent.isNull()) {
+ invariant(getRootLoc() == bucketLoc);
+ _headManager->setHead(txn, bucket->nextChild);
+ }
+ else {
+ BucketType* parentBucket = getBucket(bucket->parent);
+ int bucketIndexInParent = indexInParent(txn, bucket, bucketLoc);
+ *txn->recoveryUnit()->writing(&childLocForPos(parentBucket, bucketIndexInParent)) =
+ bucket->nextChild;
+ }
+
+ *txn->recoveryUnit()->writing(&getBucket(bucket->nextChild)->parent) = bucket->parent;
+ _bucketDeletion->aboutToDeleteBucket(bucketLoc);
+ deallocBucket(txn, bucket, bucketLoc);
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::canMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const int leftIndex) {
+ invariant(leftIndex >= 0 && leftIndex < bucket->n);
+
+ DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex);
+ DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1);
+
+ if (leftNodeLoc.isNull() || rightNodeLoc.isNull()) {
+ return false;
+ }
+
+ int pos = 0;
+
+ BucketType* leftBucket = getBucket(leftNodeLoc);
+ BucketType* rightBucket = getBucket(rightNodeLoc);
+
+ int sum = BucketType::HeaderSize
+ + _packedDataSize(leftBucket, pos)
+ + _packedDataSize(rightBucket, pos)
+ + getFullKey(bucket, leftIndex).data.dataSize()
+ + sizeof(KeyHeaderType);
+
+ return sum <= BtreeLayout::BucketSize;
+ }
+
+ /**
+ * This implementation must respect the meaning and value of lowWaterMark. Also see comments in
+ * splitPos().
+ */
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::_rebalancedSeparatorPos(OperationContext* txn,
+ BucketType* bucket,
+ int leftIndex) {
+ int split = -1;
+ int rightSize = 0;
+
+ const BucketType* l = childForPos(bucket, leftIndex);
+ const BucketType* r = childForPos(bucket, leftIndex + 1);
+
+ int KNS = sizeof(KeyHeaderType);
+ int rightSizeLimit = ( l->topSize
+ + l->n * KNS
+ + getFullKey(bucket, leftIndex).data.dataSize()
+ + KNS
+ + r->topSize
+ + r->n * KNS ) / 2;
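+        // rightSizeLimit is half of the combined size of both children plus the
+        // separator key, so the split chosen below aims the right child at roughly
+        // 50% of the merged contents.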
+
+ // This constraint should be ensured by only calling this function
+ // if we go below the low water mark.
+ invariant(rightSizeLimit < BtreeLayout::BucketBodySize);
+
+ for (int i = r->n - 1; i > -1; --i) {
+ rightSize += getFullKey(r, i).data.dataSize() + KNS;
+ if (rightSize > rightSizeLimit) {
+ split = l->n + 1 + i;
+ break;
+ }
+ }
+
+ if (split == -1) {
+ rightSize += getFullKey(bucket, leftIndex).data.dataSize() + KNS;
+ if (rightSize > rightSizeLimit) {
+ split = l->n;
+ }
+ }
+
+ if (split == -1) {
+ for (int i = l->n - 1; i > -1; --i) {
+ rightSize += getFullKey(l, i).data.dataSize() + KNS;
+ if (rightSize > rightSizeLimit) {
+ split = i;
+ break;
+ }
+ }
+ }
+
+ // safeguards - we must not create an empty bucket
+ if (split < 1) {
+ split = 1;
+ }
+ else if (split > l->n + 1 + r->n - 2) {
+ split = l->n + 1 + r->n - 2;
+ }
+
+ return split;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::doMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex) {
+
+ DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex);
+ DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1);
+
+ BucketType* l = btreemod(txn, getBucket(leftNodeLoc));
+ BucketType* r = btreemod(txn, getBucket(rightNodeLoc));
+
+ int pos = 0;
+ _packReadyForMod(l, pos);
+ _packReadyForMod(r, pos);
+
+ // We know the additional keys below will fit in l because canMergeChildren() must be true.
+ int oldLNum = l->n;
+ // left child's right child becomes old parent key's left child
+ FullKey knLeft = getFullKey(bucket, leftIndex);
+ pushBack(l, knLeft.recordLoc, knLeft.data, l->nextChild);
+
+ for (int i = 0; i < r->n; ++i) {
+ FullKey kn = getFullKey(r, i);
+ pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket);
+ }
+
+ l->nextChild = r->nextChild;
+ fixParentPtrs(txn, l, leftNodeLoc, oldLNum);
+ delBucket(txn, r, rightNodeLoc);
+
+ childLocForPos(bucket, leftIndex + 1) = leftNodeLoc;
+ childLocForPos(bucket, leftIndex) = DiskLoc();
+ _delKeyAtPos(bucket, leftIndex, true);
+
+ if (bucket->n == 0) {
+ // Will trash bucket and bucketLoc.
+ //
+ // TODO To ensure all leaves are of equal height, we should ensure this is only called
+ // on the root.
+ replaceWithNextChild(txn, bucket, bucketLoc);
+ }
+ else {
+ mayBalanceWithNeighbors(txn, bucket, bucketLoc);
+ }
+ }
+
+ template <class BtreeLayout>
+ int BtreeLogic<BtreeLayout>::indexInParent(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) const {
+ invariant(!bucket->parent.isNull());
+ const BucketType* p = getBucket(bucket->parent);
+ if (p->nextChild == bucketLoc) {
+ return p->n;
+ }
+
+ for (int i = 0; i < p->n; ++i) {
+ if (getKeyHeader(p, i).prevChildBucket == bucketLoc) {
+ return i;
+ }
+ }
+
+ log() << "ERROR: can't find ref to child bucket.\n";
+ log() << "child: " << bucketLoc << "\n";
+ //dump();
+ log() << "Parent: " << bucket->parent << "\n";
+ //p->dump();
+ invariant(false);
+ return -1; // just to compile
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::tryBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex) {
+
+ // If we can merge, then we must merge rather than balance to preserve bucket utilization
+ // constraints.
+ if (canMergeChildren(txn, bucket, bucketLoc, leftIndex)) {
+ return false;
+ }
+
+ doBalanceChildren(txn, btreemod(txn, bucket), bucketLoc, leftIndex);
+ return true;
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::doBalanceLeftToRight(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild) {
+
+ // TODO maybe do some audits the same way pushBack() does? As a precondition, rchild + the
+ // old separator are <= half a body size, and lchild is at most completely full. Based on
+ // the value of split, rchild will get <= half of the total bytes which is at most 75% of a
+ // full body. So rchild will have room for the following keys:
+ int rAdd = l->n - split;
+ reserveKeysFront(r, rAdd);
+
+ for (int i = split + 1, j = 0; i < l->n; ++i, ++j) {
+ FullKey kn = getFullKey(l, i);
+ setKey(r, j, kn.recordLoc, kn.data, kn.prevChildBucket);
+ }
+
+ FullKey leftIndexKN = getFullKey(bucket, leftIndex);
+ setKey(r, rAdd - 1, leftIndexKN.recordLoc, leftIndexKN.data, l->nextChild);
+
+ fixParentPtrs(txn, r, rchild, 0, rAdd - 1);
+
+ FullKey kn = getFullKey(l, split);
+ l->nextChild = kn.prevChildBucket;
+
+        // Because lchild is a descendant of bucketLoc, updating bucketLoc will not affect
+        // packing or keys of lchild, and kn will be stable during the following setInternalKey()
+ setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
+
+ // lchild and rchild cannot be merged, so there must be >0 (actually more) keys to the left
+ // of split.
+ int zeropos = 0;
+ truncateTo(l, split, zeropos);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::doBalanceRightToLeft(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild) {
+ // As a precondition, lchild + the old separator are <= half a body size,
+ // and rchild is at most completely full. Based on the value of split,
+ // lchild will get less than half of the total bytes which is at most 75%
+ // of a full body. So lchild will have room for the following keys:
+ int lN = l->n;
+
+ {
+ // left child's right child becomes old parent key's left child
+ FullKey kn = getFullKey(bucket, leftIndex);
+ pushBack(l, kn.recordLoc, kn.data, l->nextChild);
+ }
+
+ for (int i = 0; i < split - lN - 1; ++i) {
+ FullKey kn = getFullKey(r, i);
+ pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket);
+ }
+
+ {
+ FullKey kn = getFullKey(r, split - lN - 1);
+ l->nextChild = kn.prevChildBucket;
+            // Child lN was lchild's old nextChild, so we don't need to fix that one.
+            fixParentPtrs(txn, l, lchild, lN + 1, l->n);
+            // Because rchild is a descendant of bucketLoc, updating bucketLoc will
+            // not affect packing or keys of rchild, and kn will be stable
+            // during the following setInternalKey()
+            setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
+ }
+
+ // lchild and rchild cannot be merged, so there must be >0 (actually more)
+ // keys to the right of split.
+ int zeropos = 0;
+ dropFront(r, split - lN, zeropos);
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::doBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex) {
+
+ DiskLoc lchild = childLocForPos(bucket, leftIndex);
+ DiskLoc rchild = childLocForPos(bucket, leftIndex + 1);
+
+ int zeropos = 0;
+ BucketType* l = btreemod(txn, getBucket(lchild));
+ _packReadyForMod(l, zeropos);
+
+ BucketType* r = btreemod(txn, getBucket(rchild));
+ _packReadyForMod(r, zeropos);
+
+ int split = _rebalancedSeparatorPos(txn, bucket, leftIndex);
+
+ // By definition, if we are below the low water mark and cannot merge
+ // then we must actively balance.
+ invariant(split != l->n);
+ if (split < l->n) {
+ doBalanceLeftToRight(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
+ }
+ else {
+ doBalanceRightToLeft(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
+ }
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::mayBalanceWithNeighbors(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) {
+ if (bucket->parent.isNull()) {
+ return false;
+ }
+
+ if (_packedDataSize(bucket, 0) >= lowWaterMark()) {
+ return false;
+ }
+
+ BucketType* p = getBucket(bucket->parent);
+ int parentIdx = indexInParent(txn, bucket, bucketLoc);
+
+ // TODO will missing neighbor case be possible long term? Should we try to merge/balance
+ // somehow in that case if so?
+ bool mayBalanceRight = (parentIdx < p->n) && !childLocForPos(p, parentIdx + 1).isNull();
+ bool mayBalanceLeft = ( parentIdx > 0 ) && !childLocForPos(p, parentIdx - 1).isNull();
+
+ // Balance if possible on one side - we merge only if absolutely necessary to preserve btree
+ // bucket utilization constraints since that's a more heavy duty operation (especially if we
+ // must re-split later).
+ if (mayBalanceRight && tryBalanceChildren(txn, p, bucket->parent, parentIdx)) {
+ return true;
+ }
+
+ if (mayBalanceLeft && tryBalanceChildren(txn, p, bucket->parent, parentIdx - 1)) {
+ return true;
+ }
+
+ BucketType* pm = btreemod(txn, getBucket(bucket->parent));
+ if (mayBalanceRight) {
+ doMergeChildren(txn, pm, bucket->parent, parentIdx);
+ return true;
+ }
+ else if (mayBalanceLeft) {
+ doMergeChildren(txn, pm, bucket->parent, parentIdx - 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::unindex(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc) {
+ int pos;
+ bool found = false;
+ KeyDataOwnedType ownedKey(key);
+
+ DiskLoc loc = _locate(txn, getRootLoc(), ownedKey, &pos, &found, recordLoc, 1);
+ if (found) {
+ BucketType* bucket = btreemod(txn, getBucket(loc));
+ delKeyAtPos(txn, bucket, loc, pos);
+ assertValid(_indexName, getRoot(), _ordering);
+ }
+ return found;
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::isEmpty() const {
+ return getRoot()->n == 0;
+ }
+
+ /**
+ * This can cause a lot of additional page writes when we assign buckets to different parents.
+ * Maybe get rid of parent ptrs?
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::fixParentPtrs(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int firstIndex,
+ int lastIndex) {
+
+ invariant(getBucket(bucketLoc) == bucket);
+
+ if (lastIndex == -1) {
+ lastIndex = bucket->n;
+ }
+
+ for (int i = firstIndex; i <= lastIndex; i++) {
+ const DiskLoc childLoc = childLocForPos(bucket, i);
+ if (!childLoc.isNull()) {
+ *txn->recoveryUnit()->writing(&getBucket(childLoc)->parent) = bucketLoc;
+ }
+ }
+ }
+
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::setInternalKey(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild) {
+ childLocForPos(bucket, keypos).Null();
+        // This may leave the bucket empty (n == 0), which is ok only as a transient state.
+        // In this case, the implementation of insertHere behaves correctly when n == 0 and
+        // as a side effect increments n.
+ _delKeyAtPos(bucket, keypos, true);
+
+ // Ensure we do not orphan neighbor's old child.
+ invariant(childLocForPos(bucket, keypos ) == rchild);
+
+ // Just set temporarily - required to pass validation in insertHere()
+ childLocForPos(bucket, keypos) = lchild;
+
+ insertHere(txn, bucketLoc, keypos, key, recordLoc, lchild, rchild);
+ }
+
+ /**
+ * insert a key in this bucket, splitting if necessary.
+ *
+ * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. NOTE
+ * this function may free some data, and as a result the value passed for keypos may be invalid
+ * after calling insertHere()
+ *
+ * Some of the write intent signaling below relies on the implementation of the optimized write
+ * intent code in basicInsert().
+ */
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::insertHere(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ int pos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ const DiskLoc leftChildLoc,
+ const DiskLoc rightChildLoc) {
+
+ BucketType* bucket = getBucket(bucketLoc);
+
+ if (!basicInsert(txn, bucket, bucketLoc, pos, key, recordLoc)) {
+ // If basicInsert() fails, the bucket will be packed as required by split().
+ split(txn, btreemod(txn, bucket), bucketLoc, pos, recordLoc, key, leftChildLoc, rightChildLoc);
+ return;
+ }
+
+ KeyHeaderType* kn = &getKeyHeader(bucket, pos);
+ if (pos + 1 == bucket->n) {
+ // It's the last key.
+ if (bucket->nextChild != leftChildLoc) {
+ // XXX log more
+ invariant(false);
+ }
+ kn->prevChildBucket = bucket->nextChild;
+ invariant(kn->prevChildBucket == leftChildLoc);
+ *txn->recoveryUnit()->writing(&bucket->nextChild) = rightChildLoc;
+ if (!rightChildLoc.isNull()) {
+ *txn->recoveryUnit()->writing(&getBucket(rightChildLoc)->parent) = bucketLoc;
+ }
+ }
+ else {
+ kn->prevChildBucket = leftChildLoc;
+ if (getKeyHeader(bucket, pos + 1).prevChildBucket != leftChildLoc) {
+ // XXX: log more
+ invariant(false);
+ }
+ const LocType *pc = &getKeyHeader(bucket, pos + 1).prevChildBucket;
+ // Intent declared in basicInsert()
+ *const_cast<LocType*>(pc) = rightChildLoc;
+ if (!rightChildLoc.isNull()) {
+ *txn->recoveryUnit()->writing(&getBucket(rightChildLoc)->parent) = bucketLoc;
+ }
+ }
+ }
+
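+    /**
+     * Splits 'bucket' at the position chosen by splitPos(), promoting the separator
+     * key into the parent (allocating a new root if this bucket was the root), and
+     * then inserts the pending key into whichever half it now belongs to.
+     */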
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::split(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild) {
+
+ int split = splitPos(bucket, keypos);
+ DiskLoc rLoc = _addBucket(txn);
+ BucketType* r = btreemod(txn, getBucket(rLoc));
+
+ for (int i = split + 1; i < bucket->n; i++) {
+ FullKey kn = getFullKey(bucket, i);
+ pushBack(r, kn.recordLoc, kn.data, kn.prevChildBucket);
+ }
+ r->nextChild = bucket->nextChild;
+ assertValid(_indexName, r, _ordering);
+
+ r = NULL;
+ fixParentPtrs(txn, getBucket(rLoc), rLoc);
+
+ FullKey splitkey = getFullKey(bucket, split);
+        // splitkey gets promoted; its children will be bucketLoc (left) and rLoc (right)
+ bucket->nextChild = splitkey.prevChildBucket;
+
+        // Because bucketLoc is a descendant of its parent, updating the parent will not
+        // affect packing or keys of this bucket, and splitkey will be stable during the
+        // following:
+
+ if (bucket->parent.isNull()) {
+            // Promote splitkey into a new parent node; we make a new root since we were the root.
+ DiskLoc L = _addBucket(txn);
+ BucketType* p = btreemod(txn, getBucket(L));
+ pushBack(p, splitkey.recordLoc, splitkey.data, bucketLoc);
+ p->nextChild = rLoc;
+ assertValid(_indexName, p, _ordering);
+ bucket->parent = L;
+ _headManager->setHead(txn, L);
+ *txn->recoveryUnit()->writing(&getBucket(rLoc)->parent) = bucket->parent;
+ }
+ else {
+ // set this before calling _insert - if it splits it will do fixParent() logic and
+ // change the value.
+ *txn->recoveryUnit()->writing(&getBucket(rLoc)->parent) = bucket->parent;
+ _insert(txn,
+ getBucket(bucket->parent),
+ bucket->parent,
+ splitkey.data,
+ splitkey.recordLoc,
+ true, // dupsallowed
+ bucketLoc,
+ rLoc);
+ }
+
+ int newpos = keypos;
+        // Note this may trash splitkey.data, thus we had to promote it before finishing up here.
+ truncateTo(bucket, split, newpos);
+
+        // Add our new key; there is room for it now.
+ if (keypos <= split) {
+ insertHere(txn, bucketLoc, newpos, key, recordLoc, lchild, rchild);
+ }
+ else {
+ int kp = keypos - split - 1;
+ invariant(kp >= 0);
+ insertHere(txn, rLoc, kp, key, recordLoc, lchild, rchild);
+ }
+ }
+
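+    // DummyDocWriter lets _addBucket() allocate a record of exactly BucketSize bytes
+    // from the record store without copying any payload; init() then formats the
+    // bucket in place.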
+ class DummyDocWriter : public DocWriter {
+ public:
+ DummyDocWriter(size_t sz) : _sz(sz) { }
+ virtual void writeDocument(char* buf) const { /* no-op */ }
+ virtual size_t documentSize() const { return _sz; }
+ private:
+ size_t _sz;
+ };
+
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::initAsEmpty(OperationContext* txn) {
+ if (!_headManager->getHead().isNull()) {
+ return Status(ErrorCodes::InternalError, "index already initialized");
+ }
+
+ _headManager->setHead(txn, _addBucket(txn));
+ return Status::OK();
+ }
+
+ template <class BtreeLayout>
+ DiskLoc BtreeLogic<BtreeLayout>::_addBucket(OperationContext* txn) {
+ DummyDocWriter docWriter(BtreeLayout::BucketSize);
+ StatusWith<DiskLoc> loc = _recordStore->insertRecord(txn, &docWriter, false);
+ // XXX: remove this(?) or turn into massert or sanely bubble it back up.
+ uassertStatusOK(loc.getStatus());
+
+        // This is a brand new bucket that nobody references yet, so the write intent
+        // declared by btreemod() is probably unnecessary.
+ BucketType* b = btreemod(txn, getBucket(loc.getValue()));
+ init(b);
+ return loc.getValue();
+ }
+
+ // static
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::dumpBucket(const BucketType* bucket, int indentLength) {
+ log() << "BUCKET n:" << bucket->n << ", parent:" << hex << bucket->parent.getOfs() << dec;
+
+ const string indent = string(indentLength, ' ');
+
+ for (int i = 0; i < bucket->n; i++) {
+ log() << '\n' << indent;
+ FullKey k = getFullKey(bucket, i);
+ string ks = k.data.toString();
+ log() << " " << hex << k.prevChildBucket.getOfs() << " <-- prevChildBucket for " << i << '\n';
+ log() << indent << " " << i << ' ' << ks.substr(0, 30)
+ << " Loc:" << k.recordLoc.toString() << dec;
+ if (getKeyHeader(bucket, i).isUnused()) {
+ log() << " UNUSED";
+ }
+ }
+
+ log() << "\n" << indent << " " << hex << bucket->nextChild.getOfs() << dec << endl;
+ }
+
+ template <class BtreeLayout>
+ DiskLoc BtreeLogic<BtreeLayout>::getDiskLoc(const DiskLoc& bucketLoc, const int keyOffset) const {
+ invariant(!bucketLoc.isNull());
+ BucketType* bucket = getBucket(bucketLoc);
+ return getKeyHeader(bucket, keyOffset).recordLoc;
+ }
+
+ template <class BtreeLayout>
+ BSONObj BtreeLogic<BtreeLayout>::getKey(const DiskLoc& bucketLoc, const int keyOffset) const {
+ invariant(!bucketLoc.isNull());
+ BucketType* bucket = getBucket(bucketLoc);
+ int n = bucket->n;
+ invariant(n != BtreeLayout::INVALID_N_SENTINEL);
+ invariant(n >= 0);
+ invariant(n < 10000);
+ invariant(n != 0xffff);
+
+ invariant(keyOffset >= 0);
+ invariant(keyOffset < n);
+
+ // XXX: should we really return an empty obj if keyOffset>=n?
+ if (keyOffset >= n) {
+ return BSONObj();
+ }
+ else {
+ return getFullKey(bucket, keyOffset).data.toBson();
+ }
+ }
+
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::touch(OperationContext* txn) const {
+ return _recordStore->touch(txn, NULL);
+ }
+
+ template <class BtreeLayout>
+ long long BtreeLogic<BtreeLayout>::fullValidate(OperationContext* txn,
+ long long *unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth) {
+ return _fullValidate(txn, getRootLoc(), unusedCount, strict, dumpBuckets, depth);
+ }
+
+ template <class BtreeLayout>
+ long long BtreeLogic<BtreeLayout>::_fullValidate(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ long long *unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth) {
+ BucketType* bucket = getBucket(bucketLoc);
+ assertValid(_indexName, bucket, _ordering, true);
+
+ if (dumpBuckets) {
+ log() << bucketLoc.toString() << ' ';
+ dumpBucket(bucket, depth);
+ }
+
+ long long keyCount = 0;
+
+ for (int i = 0; i < bucket->n; i++) {
+ KeyHeaderType& kn = getKeyHeader(bucket, i);
+
+ if (kn.isUsed()) {
+ keyCount++;
+ }
+ else if (NULL != unusedCount) {
+ ++(*unusedCount);
+ }
+
+ if (!kn.prevChildBucket.isNull()) {
+ DiskLoc left = kn.prevChildBucket;
+ BucketType* b = getBucket(left);
+
+ if (strict) {
+ invariant(b->parent == bucketLoc);
+ }
+ else {
+ wassert(b->parent == bucketLoc);
+ }
+
+ keyCount += _fullValidate(txn, left, unusedCount, strict, dumpBuckets, depth + 1);
+ }
+ }
+
+ if (!bucket->nextChild.isNull()) {
+ BucketType* b = getBucket(bucket->nextChild);
+ if (strict) {
+ invariant(b->parent == bucketLoc);
+ }
+ else {
+ wassert(b->parent == bucketLoc);
+ }
+
+ keyCount += _fullValidate(txn, bucket->nextChild, unusedCount, strict, dumpBuckets, depth + 1);
+ }
+
+ return keyCount;
+ }
+
+ // XXX: remove this(?) Used to limit how many corrupt buckets assertValid dumps.
+ int nDumped = 0;
+
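+ // Sanity-checks that the keys in 'bucket' are in order (and that duplicate keys
+ // are ordered by recordLoc). A no-op unless 'force' is set, and even then only
+ // every 128th call actually checks; debug builds (DEV) compare every adjacent
+ // pair of keys, while other builds compare only the first and last key.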
+ // static
+ template <class BtreeLayout>
+ void BtreeLogic<BtreeLayout>::assertValid(const std::string& ns,
+ BucketType* bucket,
+ const Ordering& ordering,
+ bool force) {
+ if (!force) {
+ return;
+ }
+
+ // this is very slow, so don't do it often
+ {
+ static int _k;
+ if (++_k % 128) {
+ return;
+ }
+ }
+
+ DEV {
+ // slow:
+ for (int i = 0; i < bucket->n - 1; i++) {
+ FullKey firstKey = getFullKey(bucket, i);
+ FullKey secondKey = getFullKey(bucket, i + 1);
+ int z = firstKey.data.woCompare(secondKey.data, ordering);
+ if (z > 0) {
+ log() << "ERROR: btree key order corrupt. Keys:" << endl;
+ if (++nDumped < 5) {
+ for (int j = 0; j < bucket->n; j++) {
+ log() << " " << getFullKey(bucket, j).data.toString() << endl;
+ }
+ dumpBucket(bucket);
+ }
+ wassert(false);
+ break;
+ }
+ else if (z == 0) {
+ if (!(firstKey.header.recordLoc < secondKey.header.recordLoc)) {
+ log() << "ERROR: btree key order corrupt (recordlocs wrong):" << endl;
+ log() << " k(" << i << ")" << firstKey.data.toString()
+ << " RL:" << firstKey.header.recordLoc.toString() << endl;
+ log() << " k(" << i + 1 << ")" << secondKey.data.toString()
+ << " RL:" << secondKey.header.recordLoc.toString() << endl;
+ wassert(firstKey.header.recordLoc < secondKey.header.recordLoc);
+ }
+ }
+ }
+ }
+ else {
+ // faster:
+ if (bucket->n > 1) {
+ FullKey k1 = getFullKey(bucket, 0);
+ FullKey k2 = getFullKey(bucket, bucket->n - 1);
+ int z = k1.data.woCompare(k2.data, ordering);
+ //wassert( z <= 0 );
+ if (z > 0) {
+ log() << "Btree keys out of order in collection " << ns;
+ ONCE {
+ dumpBucket(bucket);
+ }
+ invariant(false);
+ }
+ }
+ }
+ }
+
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::insert(OperationContext* txn,
+ const BSONObj& rawKey,
+ const DiskLoc& value,
+ bool dupsAllowed) {
+ KeyDataOwnedType key(rawKey);
+
+ if (key.dataSize() > BtreeLayout::KeyMax) {
+ string msg = str::stream() << "Btree::insert: key too large to index, failing "
+ << _indexName << ' '
+ << key.dataSize() << ' ' << key.toString();
+ return Status(ErrorCodes::KeyTooLong, msg);
+ }
+
+ Status status = _insert(txn,
+ getRoot(),
+ getRootLoc(),
+ key,
+ value,
+ dupsAllowed,
+ DiskLoc(),
+ DiskLoc());
+
+ assertValid(_indexName, getRoot(), _ordering);
+ return status;
+ }
+
+ template <class BtreeLayout>
+ Status BtreeLogic<BtreeLayout>::_insert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ bool dupsAllowed,
+ const DiskLoc leftChild,
+ const DiskLoc rightChild) {
+ invariant(key.dataSize() > 0);
+
+ int pos;
+ bool found;
+ Status findStatus = _find(txn, bucket, key, recordLoc, !dupsAllowed, &pos, &found);
+ if (!findStatus.isOK()) {
+ return findStatus;
+ }
+
+ if (found) {
+ KeyHeaderType& header = getKeyHeader(bucket, pos);
+ if (header.isUnused()) {
+ LOG(4) << "btree _insert: reusing unused key" << endl;
+ massert(17433, "_insert: reuse key but lchild is not null", leftChild.isNull());
+ massert(17434, "_insert: reuse key but rchild is not null", rightChild.isNull());
+ txn->recoveryUnit()->writing(&header)->setUsed();
+ return Status::OK();
+ }
+ return Status(ErrorCodes::UniqueIndexViolation, "FIXME");
+ }
+
+ DiskLoc childLoc = childLocForPos(bucket, pos);
+
+ // In current usage, rightChild is NULL for a new key and is not NULL when we are
+ // promoting a split key. These are the only two cases where _insert() is called
+ // currently.
+ if (childLoc.isNull() || !rightChild.isNull()) {
+ insertHere(txn, bucketLoc, pos, key, recordLoc, leftChild, rightChild);
+ return Status::OK();
+ }
+ else {
+ return _insert(txn,
+ getBucket(childLoc),
+ childLoc,
+ key,
+ recordLoc,
+ dupsAllowed,
+ DiskLoc(),
+ DiskLoc());
+ }
+ }
+
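+ // Returns the bucket holding the key that follows (direction == 1) or precedes
+ // (direction == -1) the key at *posInOut in 'bucketLoc', setting *posInOut to
+ // that key's position within the returned bucket. Descends into a subtree when
+ // one exists in the desired direction; otherwise walks up the parent chain.
+ // Returns a null DiskLoc when there is no further key.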
+ template <class BtreeLayout>
+ DiskLoc BtreeLogic<BtreeLayout>::advance(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ int* posInOut,
+ int direction) const {
+ BucketType* bucket = getBucket(bucketLoc);
+
+ if (*posInOut < 0 || *posInOut >= bucket->n ) {
+ log() << "ASSERT failure advancing btree bucket" << endl;
+ log() << " thisLoc: " << bucketLoc.toString() << endl;
+ log() << " keyOfs: " << *posInOut << " n:" << bucket->n << " direction: " << direction << endl;
+ // log() << bucketSummary() << endl;
+ invariant(false);
+ }
+
+ // 'adj' selects which child slot we inspect: advancing forward we descend into
+ // the child to the *left* of the next key; advancing backward, into the child
+ // to the *right* of the previous key (hence the +1 when direction < 0).
+ int adj = direction < 0 ? 1 : 0;
+ int ko = *posInOut + direction;
+
+ // Look down if we need to.
+ DiskLoc nextDownLoc = childLocForPos(bucket, ko + adj);
+ BucketType* nextDown = getBucket(nextDownLoc);
+ if (NULL != nextDown) {
+ for (;;) {
+ if (direction > 0) {
+ *posInOut = 0;
+ }
+ else {
+ *posInOut = nextDown->n - 1;
+ }
+ DiskLoc newNextDownLoc = childLocForPos(nextDown, *posInOut + adj);
+ BucketType* newNextDownBucket = getBucket(newNextDownLoc);
+ if (NULL == newNextDownBucket) {
+ break;
+ }
+ nextDownLoc = newNextDownLoc;
+ nextDown = newNextDownBucket;
+ }
+ return nextDownLoc;
+ }
+
+ // Looking down isn't the right choice, move forward.
+ if (ko < bucket->n && ko >= 0) {
+ *posInOut = ko;
+ return bucketLoc;
+ }
+
+ // Hit the end of the bucket, move up and over.
+ DiskLoc childLoc = bucketLoc;
+ DiskLoc ancestor = getBucket(bucketLoc)->parent;
+ for (;;) {
+ if (ancestor.isNull()) {
+ break;
+ }
+ BucketType* an = getBucket(ancestor);
+ for (int i = 0; i < an->n; i++) {
+ if (childLocForPos(an, i + adj) == childLoc) {
+ *posInOut = i;
+ return ancestor;
+ }
+ }
+ invariant(direction < 0 || an->nextChild == childLoc);
+ // parent exhausted also, keep going up
+ childLoc = ancestor;
+ ancestor = an->parent;
+ }
+
+ return DiskLoc();
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::keyIsUsed(const DiskLoc& loc, const int& pos) const {
+ return getKeyHeader(getBucket(loc), pos).isUsed();
+ }
+
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::locate(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc,
+ const int direction,
+ int* posOut,
+ DiskLoc* bucketLocOut) const {
+ // Clear out any data.
+ *posOut = 0;
+ *bucketLocOut = DiskLoc();
+
+ bool found = false;
+ KeyDataOwnedType owned(key);
+
+ *bucketLocOut = _locate(txn, getRootLoc(), owned, posOut, &found, recordLoc, direction);
+
+ if (!found) {
+ return false;
+ }
+
+ skipUnusedKeys(txn, bucketLocOut, posOut, direction);
+
+ return found;
+ }
+
+ /**
+ * Recursively walk down the btree, looking for a match of key and recordLoc.
+ * Caller should have acquired lock on bucketLoc.
+ */
+ template <class BtreeLayout>
+ DiskLoc BtreeLogic<BtreeLayout>::_locate(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ const KeyDataType& key,
+ int* posOut,
+ bool* foundOut,
+ const DiskLoc& recordLoc,
+ const int direction) const {
+ int position;
+ BucketType* bucket = getBucket(bucketLoc);
+ // XXX: owned to not owned conversion(?)
+ _find(txn, bucket, key, recordLoc, false, &position, foundOut);
+
+ // Look in our current bucket.
+ if (*foundOut) {
+ *posOut = position;
+ return bucketLoc;
+ }
+
+ // Not in our current bucket. 'position' tells us where there may be a child.
+ DiskLoc childLoc = childLocForPos(bucket, position);
+
+ if (!childLoc.isNull()) {
+ DiskLoc inChild = _locate(txn, childLoc, key, posOut, foundOut, recordLoc, direction);
+ if (!inChild.isNull()) {
+ return inChild;
+ }
+ }
+
+ *posOut = position;
+
+ if (direction < 0) {
+ // The key *would* go to our left.
+ (*posOut)--;
+ if (-1 == *posOut) {
+ // But there's no space for that in our bucket.
+ return DiskLoc();
+ }
+ else {
+ return bucketLoc;
+ }
+ }
+ else {
+ // The key would go to our right...
+ if (bucket->n == *posOut) {
+ return DiskLoc();
+ }
+ else {
+ // But only if there is space.
+ return bucketLoc;
+ }
+ }
+ }
+
+ // TODO relocate
+ template <class BtreeLayout>
+ bool BtreeLogic<BtreeLayout>::isHead(BucketType* bucket) {
+ return bucket->parent.isNull();
+ }
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::BucketType*
+ BtreeLogic<BtreeLayout>::getBucket(const DiskLoc dl) const {
+ if (dl.isNull()) {
+ return NULL;
+ }
+
+ RecordData recordData = _recordStore->dataFor(dl);
+
+ // we need to be working on the raw bytes, not a transient copy
+ invariant(!recordData.isOwned());
+
+ return reinterpret_cast<BucketType*>(const_cast<char*>(recordData.data()));
+ }
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::BucketType*
+ BtreeLogic<BtreeLayout>::getRoot() const {
+ return getBucket(_headManager->getHead());
+ }
+
+ template <class BtreeLayout>
+ DiskLoc
+ BtreeLogic<BtreeLayout>::getRootLoc() const {
+ return _headManager->getHead();
+ }
+
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::BucketType*
+ BtreeLogic<BtreeLayout>::childForPos(BucketType* bucket, int pos) const {
+ DiskLoc loc = childLocForPos(bucket, pos);
+ return getBucket(loc);
+ }
+
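+ // Returns the child pointer to the left of the key at 'pos', i.e. the root of
+ // the subtree holding keys smaller than it; pos == n designates 'nextChild',
+ // the subtree to the right of the bucket's last key.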
+ template <class BtreeLayout>
+ typename BtreeLogic<BtreeLayout>::LocType&
+ BtreeLogic<BtreeLayout>::childLocForPos(BucketType* bucket, int pos) {
+ if (bucket->n == pos) {
+ return bucket->nextChild;
+ }
+ else {
+ return getKeyHeader(bucket, pos).prevChildBucket;
+ }
+ }
+
+ //
+ // And, template stuff.
+ //
+
+ // V0 format.
+ template struct FixedWidthKey<DiskLoc>;
+ template class BtreeLogic<BtreeLayoutV0>;
+
+ // V1 format.
+ template struct FixedWidthKey<DiskLoc56Bit>;
+ template class BtreeLogic<BtreeLayoutV1>;
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
new file mode 100644
index 00000000000..ff7d7718de9
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
@@ -0,0 +1,593 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/catalog/head_manager.h"
+#include "mongo/db/catalog/index_catalog_entry.h"
+#include "mongo/db/diskloc.h"
+#include "mongo/db/jsobj.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/mmap_v1/btree/btree_ondisk.h"
+#include "mongo/db/storage/mmap_v1/btree/key.h"
+#include "mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h"
+
+
+namespace mongo {
+
+ class BucketDeletionNotification;
+ class RecordStore;
+
+ // Used for unit-testing only
+ template <class BtreeLayout> class BtreeLogicTestBase;
+ template <class BtreeLayout> class ArtificialTreeBuilder;
+
+ /**
+ * This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk
+ * format.
+ */
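+ //
+ // Typical usage, as a rough sketch only: headManager, recordStore,
+ // bucketDeletion, txn, and recordLoc are assumed to be set up elsewhere.
+ //
+ //     BtreeLogic<BtreeLayoutV1> btree(headManager, recordStore,
+ //                                     Ordering::make(BSON("a" << 1)),
+ //                                     "test.coll.$a_1", bucketDeletion);
+ //     uassertStatusOK(btree.initAsEmpty(txn));
+ //     uassertStatusOK(btree.insert(txn, BSON("" << "hello"), recordLoc, true));
+ //
+ //     int pos;
+ //     DiskLoc bucketLoc;
+ //     bool found = btree.locate(txn, BSON("" << "hello"), recordLoc,
+ //                               1 /* direction */, &pos, &bucketLoc);
+ //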
+ template <class BtreeLayout>
+ class BtreeLogic {
+ public:
+ // AKA _keyNode
+ typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType;
+
+ // AKA Key
+ typedef typename BtreeLayout::KeyType KeyDataType;
+
+ // AKA KeyOwned
+ typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
+
+ // AKA Loc
+ typedef typename BtreeLayout::LocType LocType;
+
+ // AKA BucketBasics or BtreeBucket, either one.
+ typedef typename BtreeLayout::BucketType BucketType;
+
+ /**
+ * 'head' manages the catalog information.
+ * 'store' allocates and frees buckets.
+ * 'ordering' is meta-information we store in the catalog.
+ * 'indexName' is a string identifying the index; it is used when printing errors.
+ */
+ BtreeLogic(HeadManager* head,
+ RecordStore* store,
+ const Ordering& ordering,
+ const string& indexName,
+ BucketDeletionNotification* bucketDeletion)
+ : _headManager(head),
+ _recordStore(store),
+ _ordering(ordering),
+ _indexName(indexName),
+ _bucketDeletion(bucketDeletion) {
+
+ }
+
+ //
+ // Public-facing
+ //
+
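+ /**
+ * Bottom-up bulk builder: addKey() appends keys, which are expected to arrive
+ * in ascending order (note '_keyLast'), and commit() finishes the build.
+ * Instances are obtained from newBuilder() below.
+ */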
+ class Builder {
+ public:
+ typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
+ typedef typename BtreeLayout::KeyType KeyDataType;
+
+ Status addKey(const BSONObj& key, const DiskLoc& loc);
+
+ // XXX: status, outparam for # keys?
+ unsigned long long commit(bool mayInterrupt);
+
+ private:
+ friend class BtreeLogic;
+
+ Builder(BtreeLogic* logic, OperationContext* txn, bool dupsAllowed);
+
+ // Direct ports of functionality
+ void newBucket();
+ void buildNextLevel(DiskLoc loc, bool mayInterrupt);
+ void mayCommitProgressDurably();
+ BucketType* _getModifiableBucket(DiskLoc loc);
+ BucketType* _getBucket(DiskLoc loc);
+ // Direct ports of functionality
+
+ // Not owned.
+ BtreeLogic* _logic;
+
+ // Direct port of names.
+ DiskLoc _cur;
+ DiskLoc _first;
+ BucketType* _b;
+ bool _committed;
+ bool _dupsAllowed;
+ long long _numAdded;
+ auto_ptr<KeyDataOwnedType> _keyLast;
+
+ // Not owned.
+ OperationContext* _txn;
+ };
+
+ /**
+ * Caller owns the returned pointer.
+ * 'this' must outlive the returned pointer.
+ */
+ Builder* newBuilder(OperationContext* txn, bool dupsAllowed);
+
+ Status dupKeyCheck(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) const;
+
+ Status insert(OperationContext* txn,
+ const BSONObj& rawKey,
+ const DiskLoc& value,
+ bool dupsAllowed);
+
+ /**
+ * Navigates down the tree and locates the bucket and position containing a record with
+ * the specified <key, recordLoc> combination.
+ *
+ * @return true if the exact <key, recordLoc> was found. Otherwise returns false, and
+ * bucketLocOut contains the bucket holding the key immediately before or after the
+ * searched one (depending on the direction).
+ */
+ bool locate(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc,
+ const int direction,
+ int* posOut,
+ DiskLoc* bucketLocOut) const;
+
+ void advance(OperationContext* txn,
+ DiskLoc* bucketLocInOut,
+ int* posInOut,
+ int direction) const;
+
+ bool exists(OperationContext* txn, const KeyDataType& key) const;
+
+ bool unindex(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& recordLoc);
+
+ bool isEmpty() const;
+
+ long long fullValidate(OperationContext*,
+ long long *unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth);
+
+ DiskLoc getDiskLoc(const DiskLoc& bucketLoc, const int keyOffset) const;
+
+ BSONObj getKey(const DiskLoc& bucketLoc, const int keyOffset) const;
+
+ DiskLoc getHead() const { return _headManager->getHead(); }
+
+ Status touch(OperationContext* txn) const;
+
+ //
+ // Composite key navigation methods
+ //
+
+ void customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction) const;
+
+ void advanceTo(OperationContext*,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const BSONObj &keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction) const;
+
+ void restorePosition(OperationContext* txn,
+ const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ int direction,
+ DiskLoc* bucketInOut,
+ int* keyOffsetInOut) const;
+
+ //
+ // Creation and deletion
+ //
+
+ /**
+ * Returns OK if the index was uninitialized before, error status otherwise.
+ */
+ Status initAsEmpty(OperationContext* txn);
+
+ //
+ // Size constants
+ //
+
+ static int lowWaterMark();
+
+ private:
+ friend class BtreeLogic::Builder;
+
+ // Used for unit-testing only
+ friend class BtreeLogicTestBase<BtreeLayout>;
+ friend class ArtificialTreeBuilder<BtreeLayout>;
+
+ /**
+ * This is an in-memory wrapper for the variable-length data associated with a
+ * KeyHeaderType. It points to on-disk data but is not itself on-disk data.
+ *
+ * This object and its key data will become invalid if the KeyHeaderType that owns it
+ * is moved within the btree. In general, a FullKey should not be expected to remain
+ * valid after a write.
+ */
+ struct FullKey {
+ FullKey(const BucketType* bucket, int i)
+ : header(getKeyHeader(bucket, i)),
+ prevChildBucket(header.prevChildBucket),
+ recordLoc(header.recordLoc),
+ data(bucket->data + header.keyDataOfs()) { }
+
+ // This is actually a reference to something on-disk.
+ const KeyHeaderType& header;
+
+ // These are actually in 'header'.
+ const LocType& prevChildBucket;
+ const LocType& recordLoc;
+
+ // This is *not* memory-mapped but its members point to something on-disk.
+ KeyDataType data;
+ };
+
+ //
+ // Functions that depend on the templated type info but nothing in 'this'.
+ //
+
+ static LocType& childLocForPos(BucketType* bucket, int pos);
+
+ static FullKey getFullKey(const BucketType* bucket, int i);
+
+ static KeyHeaderType& getKeyHeader(BucketType* bucket, int i);
+
+ static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i);
+
+ static char* dataAt(BucketType* bucket, short ofs);
+
+ static void markUnused(BucketType* bucket, int keypos);
+
+ static int totalDataSize(BucketType* bucket);
+
+ static void init(BucketType* bucket);
+
+ static int _alloc(BucketType* bucket, int bytes);
+
+ static void _unalloc(BucketType* bucket, int bytes);
+
+ static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false);
+
+ static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType *keyDataOut);
+
+ static bool mayDropKey(BucketType* bucket, int index, int refPos);
+
+ static int _packedDataSize(BucketType* bucket, int refPos);
+
+ static void setPacked(BucketType* bucket);
+
+ static void setNotPacked(BucketType* bucket);
+
+ static BucketType* btreemod(OperationContext* txn, BucketType* bucket);
+
+ static int splitPos(BucketType* bucket, int keypos);
+
+ static void reserveKeysFront(BucketType* bucket, int nAdd);
+
+ static void setKey(BucketType* bucket,
+ int i,
+ const DiskLoc recordLoc,
+ const KeyDataType &key,
+ const DiskLoc prevChildBucket);
+
+ static bool isHead(BucketType* bucket);
+
+ static void dumpBucket(const BucketType* bucket, int indentLength = 0);
+
+ static void assertValid(const std::string& ns,
+ BucketType* bucket,
+ const Ordering& ordering,
+ bool force = false);
+
+ //
+ // 'this'-specific helpers (require record store, catalog information, or ordering, or type
+ // information).
+ //
+
+ bool basicInsert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int& keypos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc);
+
+ void dropFront(BucketType* bucket, int nDrop, int& refpos);
+
+ void _pack(OperationContext* txn, BucketType* bucket, const DiskLoc thisLoc, int &refPos);
+
+ void customLocate(OperationContext* txn,
+ DiskLoc* locInOut,
+ int* keyOfsInOut,
+ const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction,
+ pair<DiskLoc, int>& bestParent) const;
+
+ Status _find(OperationContext* txn,
+ BucketType* bucket,
+ const KeyDataType& key,
+ const DiskLoc& recordLoc,
+ bool errorIfDup,
+ int* keyPositionOut,
+ bool* foundOut) const;
+
+ bool customFind(int low,
+ int high,
+ const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ const Ordering& order,
+ int direction,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ pair<DiskLoc, int>& bestParent) const;
+
+ void advanceToImpl(OperationContext* txn,
+ DiskLoc* thisLocInOut,
+ int* keyOfsInOut,
+ const BSONObj &keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive,
+ int direction) const;
+
+ bool wouldCreateDup(OperationContext* txn,
+ const KeyDataType& key,
+ const DiskLoc self) const;
+
+ bool keyIsUsed(const DiskLoc& loc, const int& pos) const;
+
+ void skipUnusedKeys(OperationContext* txn,
+ DiskLoc* loc,
+ int* pos,
+ int direction) const;
+
+ DiskLoc advance(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ int* posInOut,
+ int direction) const;
+
+ DiskLoc _locate(OperationContext* txn,
+ const DiskLoc& bucketLoc,
+ const KeyDataType& key,
+ int* posOut,
+ bool* foundOut,
+ const DiskLoc& recordLoc,
+ const int direction) const;
+
+ long long _fullValidate(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ long long *unusedCount,
+ bool strict,
+ bool dumpBuckets,
+ unsigned depth);
+
+ DiskLoc _addBucket(OperationContext* txn);
+
+ bool canMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const int leftIndex);
+
+ // has to look in children of 'bucket' and requires record store
+ int _rebalancedSeparatorPos(OperationContext* txn,
+ BucketType* bucket,
+ int leftIndex);
+
+ void _packReadyForMod(BucketType* bucket, int &refPos);
+
+ void truncateTo(BucketType* bucket, int N, int &refPos);
+
+ void split(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild);
+
+ Status _insert(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ bool dupsAllowed,
+ const DiskLoc leftChild,
+ const DiskLoc rightChild);
+
+ // TODO take a BucketType*?
+ void insertHere(OperationContext* txn,
+ const DiskLoc bucketLoc,
+ int pos,
+ const KeyDataType& key,
+ const DiskLoc recordLoc,
+ const DiskLoc leftChild,
+ const DiskLoc rightChild);
+
+ std::string dupKeyError(const KeyDataType& key) const;
+
+ void setInternalKey(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc lchild,
+ const DiskLoc rchild);
+
+ void fixParentPtrs(OperationContext* trans,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int firstIndex = 0,
+ int lastIndex = -1);
+
+ bool mayBalanceWithNeighbors(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
+
+ void doBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex);
+
+ void doBalanceLeftToRight(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc thisLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild);
+
+ void doBalanceRightToLeft(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc thisLoc,
+ int leftIndex,
+ int split,
+ BucketType* l,
+ const DiskLoc lchild,
+ BucketType* r,
+ const DiskLoc rchild);
+
+ bool tryBalanceChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex);
+
+ int indexInParent(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc) const;
+
+ void doMergeChildren(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int leftIndex);
+
+ void replaceWithNextChild(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc);
+
+ void deleteInternalKey(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int keypos);
+
+ void delKeyAtPos(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc,
+ int p);
+
+ void delBucket(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc);
+
+ void deallocBucket(OperationContext* txn,
+ BucketType* bucket,
+ const DiskLoc bucketLoc);
+
+ bool _keyIsAt(const BSONObj& savedKey,
+ const DiskLoc& savedLoc,
+ BucketType* bucket,
+ int keyPos) const;
+
+ // TODO 'this' for _ordering(?)
+ int customBSONCmp(const BSONObj& l,
+ const BSONObj& rBegin,
+ int rBeginLen,
+ bool rSup,
+ const std::vector<const BSONElement*>& rEnd,
+ const std::vector<bool>& rEndInclusive,
+ const Ordering& o,
+ int direction) const;
+
+ // TODO needs 'this' for _ordering for sanity check
+ bool _pushBack(BucketType* bucket,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChild);
+
+ void pushBack(BucketType* bucket,
+ const DiskLoc recordLoc,
+ const KeyDataType& key,
+ const DiskLoc prevChild) {
+ invariant(_pushBack(bucket, recordLoc, key, prevChild));
+ }
+
+ BucketType* childForPos(BucketType* bucket, int pos) const;
+
+ BucketType* getBucket(const DiskLoc dl) const;
+
+ BucketType* getRoot() const;
+
+ DiskLoc getRootLoc() const;
+
+ //
+ // Data
+ //
+
+ // Not owned here.
+ HeadManager* _headManager;
+
+ // Not owned here.
+ RecordStore* _recordStore;
+
+ Ordering _ordering;
+
+ string _indexName;
+
+ // Not owned here
+ BucketDeletionNotification* _bucketDeletion;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
new file mode 100644
index 00000000000..ca6cdce9a9e
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
@@ -0,0 +1,2207 @@
+// btree_logic_test.cpp : Btree unit tests
+//
+
+/**
+ * Copyright (C) 2014 MongoDB
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+// This file contains simple, single-threaded tests that check various aspects of the Btree logic.
+//
+
+#include "mongo/db/instance.h"
+#include "mongo/db/operation_context_noop.h"
+#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h"
+#include "mongo/unittest/unittest.h"
+
+
+namespace mongo {
+
+ /**
+ * This class is made a friend of BtreeLogic so that the tests can access whatever
+ * private methods they need.
+ */
+ template<class BtreeLayoutType>
+ class BtreeLogicTestBase {
+ public:
+ typedef typename BtreeLayoutType::BucketType BucketType;
+ typedef typename BtreeLayoutType::FixedWidthKeyType FixedWidthKeyType;
+
+ typedef typename BtreeLogic<BtreeLayoutType>::FullKey FullKey;
+ typedef typename BtreeLogic<BtreeLayoutType>::KeyDataOwnedType KeyDataOwnedType;
+
+ BtreeLogicTestBase() : _helper(BSON("TheKey" << 1)) {
+
+ }
+
+ virtual ~BtreeLogicTestBase() {
+
+ }
+
+ protected:
+ void checkValidNumKeys(int nKeys) {
+ OperationContextNoop txn;
+ ASSERT_EQUALS(nKeys, _helper.btree.fullValidate(&txn, NULL, true, true, 0));
+ }
+
+ void insert(const BSONObj &key, const DiskLoc dl) {
+ OperationContextNoop txn;
+ _helper.btree.insert(&txn, key, dl, true);
+ }
+
+ bool unindex(const BSONObj &key) {
+ OperationContextNoop txn;
+ return _helper.btree.unindex(&txn, key, _helper.dummyDiskLoc);
+ }
+
+ void locate(const BSONObj &key,
+ int expectedPos,
+ bool expectedFound,
+ const DiskLoc &expectedLocation,
+ int direction) {
+ int pos;
+ DiskLoc loc;
+ OperationContextNoop txn;
+ ASSERT_EQUALS(expectedFound,
+ _helper.btree.locate(&txn, key, _helper.dummyDiskLoc, direction, &pos, &loc));
+ ASSERT_EQUALS(expectedLocation, loc);
+ ASSERT_EQUALS(expectedPos, pos);
+ }
+
+ const BucketType* child(const BucketType* bucket, int i) const {
+ verify(i <= bucket->n);
+
+ DiskLoc diskLoc;
+ if (i == bucket->n) {
+ diskLoc = bucket->nextChild;
+ }
+ else {
+ FullKey fullKey = BtreeLogic<BtreeLayoutType>::getFullKey(bucket, i);
+ diskLoc = fullKey.prevChildBucket;
+ }
+
+ verify(!diskLoc.isNull());
+
+ return _helper.btree.getBucket(diskLoc);
+ }
+
+ BucketType* head() const {
+ return _helper.btree.getBucket(_helper.headManager.getHead());
+ }
+
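+ /**
+ * Makes the bucket at 'bucketLoc' appear completely unpacked (all free space
+ * folded into topSize) so that a subsequent pack pass has work to do.
+ */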
+ void forcePackBucket(const DiskLoc bucketLoc) {
+ BucketType* bucket = _helper.btree.getBucket(bucketLoc);
+
+ bucket->topSize += bucket->emptySize;
+ bucket->emptySize = 0;
+ BtreeLogic<BtreeLayoutType>::setNotPacked(bucket);
+ }
+
+ void truncateBucket(BucketType* bucket, int N, int &refPos) {
+ _helper.btree.truncateTo(bucket, N, refPos);
+ }
+
+ int bucketPackedDataSize(BucketType* bucket, int refPos) {
+ return _helper.btree._packedDataSize(bucket, refPos);
+ }
+
+ int bucketRebalancedSeparatorPos(const DiskLoc bucketLoc, int leftIndex) {
+ BucketType* bucket = _helper.btree.getBucket(bucketLoc);
+ OperationContextNoop txn;
+ return _helper.btree._rebalancedSeparatorPos(&txn, bucket, leftIndex);
+ }
+
+ FullKey getKey(const DiskLoc bucketLoc, int pos) const {
+ const BucketType* bucket = _helper.btree.getBucket(bucketLoc);
+ return BtreeLogic<BtreeLayoutType>::getFullKey(bucket, pos);
+ }
+
+ void markKeyUnused(const DiskLoc bucketLoc, int keyPos) {
+ BucketType* bucket = _helper.btree.getBucket(bucketLoc);
+ invariant(keyPos >= 0 && keyPos < bucket->n);
+
+ _helper.btree.getKeyHeader(bucket, keyPos).setUnused();
+ }
+
+ DiskLoc newBucket() {
+ OperationContextNoop txn;
+ return _helper.btree._addBucket(&txn);
+ }
+
+ /**
+ * Sets the nextChild pointer for the bucket at the specified location.
+ */
+ void setBucketNextChild(const DiskLoc bucketLoc, const DiskLoc nextChild) {
+ OperationContextNoop txn;
+
+ BucketType* bucket = _helper.btree.getBucket(bucketLoc);
+ bucket->nextChild = nextChild;
+
+ _helper.btree.fixParentPtrs(&txn, bucket, bucketLoc);
+ }
+
+ protected:
+ BtreeLogicTestHelper<BtreeLayoutType> _helper;
+ };
+
+ //
+ // TESTS
+ //
+
+ template<class OnDiskFormat>
+ class SimpleCreate : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ this->checkValidNumKeys(0);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class SimpleInsertDelete : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ BSONObj key = simpleKey('z');
+ this->insert(key, this->_helper.dummyDiskLoc);
+
+ this->checkValidNumKeys(1);
+ this->locate(key, 0, true, this->_helper.headManager.getHead(), 1);
+
+ this->unindex(key);
+
+ this->checkValidNumKeys(0);
+ this->locate(key, 0, false, DiskLoc(), 1);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class SplitUnevenBucketBase : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 10; ++i) {
+ BSONObj shortKey = simpleKey(shortToken(i), 1);
+ this->insert(shortKey, this->_helper.dummyDiskLoc);
+
+ BSONObj longKey = simpleKey(longToken(i), 800);
+ this->insert(longKey, this->_helper.dummyDiskLoc);
+ }
+
+ this->checkValidNumKeys(20);
+ ASSERT_EQUALS(1, this->head()->n);
+ checkSplit();
+ }
+
+ protected:
+ virtual char shortToken(int i) const = 0;
+ virtual char longToken(int i) const = 0;
+ virtual void checkSplit() = 0;
+
+ static char leftToken(int i) {
+ return 'a' + i;
+ }
+
+ static char rightToken(int i) {
+ return 'z' - i;
+ }
+ };
+
+ template<class OnDiskFormat>
+ class SplitRightHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
+ private:
+ virtual char shortToken(int i) const {
+ return this->leftToken(i);
+ }
+ virtual char longToken(int i) const {
+ return this->rightToken(i);
+ }
+ virtual void checkSplit() {
+ ASSERT_EQUALS(15, this->child(this->head(), 0)->n);
+ ASSERT_EQUALS(4, this->child(this->head(), 1)->n);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class SplitLeftHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
+ private:
+ virtual char shortToken(int i) const {
+ return this->rightToken(i);
+ }
+ virtual char longToken(int i) const {
+ return this->leftToken(i);
+ }
+ virtual void checkSplit() {
+ ASSERT_EQUALS(4, this->child(this->head(), 0)->n);
+ ASSERT_EQUALS(15, this->child(this->head(), 1)->n);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MissingLocate : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 3; ++i) {
+ BSONObj k = simpleKey('b' + 2 * i);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ locateExtended(1, 'a', 'b', this->_helper.headManager.getHead());
+ locateExtended(1, 'c', 'd', this->_helper.headManager.getHead());
+ locateExtended(1, 'e', 'f', this->_helper.headManager.getHead());
+ locateExtended(1, 'g', 'g' + 1, DiskLoc()); // of course, 'h' isn't in the index.
+
+ // old behavior
+ // locateExtended( -1, 'a', 'b', dl() );
+ // locateExtended( -1, 'c', 'd', dl() );
+ // locateExtended( -1, 'e', 'f', dl() );
+ // locateExtended( -1, 'g', 'f', dl() );
+
+ locateExtended(-1, 'a', 'a' - 1, DiskLoc()); // of course, 'a' - 1 isn't in the index
+ locateExtended(-1, 'c', 'b', this->_helper.headManager.getHead());
+ locateExtended(-1, 'e', 'd', this->_helper.headManager.getHead());
+ locateExtended(-1, 'g', 'f', this->_helper.headManager.getHead());
+ }
+
+ private:
+ void locateExtended(
+ int direction, char token, char expectedMatch, DiskLoc expectedLocation) {
+ const BSONObj k = simpleKey(token);
+ int expectedPos = (expectedMatch - 'b') / 2;
+
+ this->locate(k, expectedPos, false, expectedLocation, direction);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MissingLocateMultiBucket : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc);
+
+ // This causes a split
+ this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc);
+
+ int pos;
+ DiskLoc loc;
+
+ // 'E' is the split point and should be in the head; the rest should be split ~50/50
+ const BSONObj splitPoint = simpleKey('E', 800);
+ this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
+ ASSERT_EQUALS(this->_helper.headManager.getHead(), loc);
+ ASSERT_EQUALS(0, pos);
+
+ // Find the one before 'E'
+ int largePos;
+ DiskLoc largeLoc;
+ this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
+ this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1);
+
+ // Find the one after 'E'
+ int smallPos;
+ DiskLoc smallLoc;
+ this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
+ this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1);
+
+ ASSERT_NOT_EQUALS(smallLoc, largeLoc);
+ ASSERT_NOT_EQUALS(smallLoc, loc);
+ ASSERT_NOT_EQUALS(largeLoc, loc);
+ }
+ };
+
+ /**
+ * Validates that adding keys incrementally produces buckets that are 90%/10% full.
+ */
+ template<class OnDiskFormat>
+ class SERVER983 : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc);
+ this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc);
+
+ // This will cause a split
+ this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc);
+
+ int pos;
+ DiskLoc loc;
+
+ // 'H' is the maximum 'large' interval key; 90% should be < 'H' and 10% larger
+ const BSONObj splitPoint = simpleKey('H', 800);
+ this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
+ ASSERT_EQUALS(this->_helper.headManager.getHead(), loc);
+ ASSERT_EQUALS(0, pos);
+
+ // Find the one before 'H'
+ int largePos;
+ DiskLoc largeLoc;
+ this->_helper.btree.locate(&txn,
+ splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
+ this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1);
+
+ // Find the one after 'H'
+ int smallPos;
+ DiskLoc smallLoc;
+ this->_helper.btree.locate(&txn,
+ splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
+ this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1);
+
+ ASSERT_NOT_EQUALS(smallLoc, largeLoc);
+ ASSERT_NOT_EQUALS(smallLoc, loc);
+ ASSERT_NOT_EQUALS(largeLoc, loc);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DontReuseUnused : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 10; ++i) {
+ const BSONObj k = simpleKey('b' + 2 * i, 800);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ const BSONObj root = simpleKey('p', 800);
+ this->unindex(root);
+
+ this->insert(root, this->_helper.dummyDiskLoc);
+ this->locate(root, 0, true, this->head()->nextChild, 1);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeBucketsTestBase : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 10; ++i) {
+ const BSONObj k = simpleKey('b' + 2 * i, 800);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords() - 1);
+
+ long long expectedCount = 10 - unindexKeys();
+ ASSERT_EQUALS(1, this->_helper.recordStore.numRecords() - 1);
+
+ long long unusedCount = 0;
+ ASSERT_EQUALS(expectedCount, this->_helper.btree.fullValidate(&txn, &unusedCount, true, true, 0));
+ ASSERT_EQUALS(0, unusedCount);
+ }
+
+ protected:
+ virtual int unindexKeys() = 0;
+ };
+
+ template<class OnDiskFormat>
+ class MergeBucketsLeft : public MergeBucketsTestBase<OnDiskFormat> {
+ virtual int unindexKeys() {
+ BSONObj k = simpleKey('b', 800);
+ this->unindex(k);
+
+ k = simpleKey('b' + 2, 800);
+ this->unindex(k);
+
+ k = simpleKey('b' + 4, 800);
+ this->unindex(k);
+
+ k = simpleKey('b' + 6, 800);
+ this->unindex(k);
+
+ return 4;
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeBucketsRight : public MergeBucketsTestBase<OnDiskFormat> {
+ virtual int unindexKeys() {
+ const BSONObj k = simpleKey('b' + 2 * 9, 800);
+ this->unindex(k);
+ return 1;
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeBucketsDontReplaceHead : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ for (int i = 0; i < 18; ++i) {
+ const BSONObj k = simpleKey('a' + i, 800);
+ this->insert(k, this->_helper.dummyDiskLoc);
+ }
+
+ // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords() - 1);
+
+ const BSONObj k = simpleKey('a' + 17, 800);
+ this->unindex(k);
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords() - 1);
+
+ long long unusedCount = 0;
+ ASSERT_EQUALS(17, this->_helper.btree.fullValidate(&txn, &unusedCount, true, true, 0));
+ ASSERT_EQUALS(0, unusedCount);
+ }
+ };
+
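+ // Notation used by makeTree()/checkStructure() below: each JSON-like field is a
+ // key in the bucket, its value is the key's left (prevChildBucket) subtree or
+ // null when the key has no child, and the special field "_" denotes the
+ // bucket's nextChild (rightmost) subtree.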
+ template<class OnDiskFormat>
+ class MergeBucketsDelInternal : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}");
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "bb");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeBucketsRightNull : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}");
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "bb");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}");
+ }
+ };
+
+ // This comment was here during porting, not sure what it means:
+ //
+ // "Not yet handling this case"
+ template<class OnDiskFormat>
+ class DontMergeSingleBucket : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{d:{b:{a:null},c:null}}");
+
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "c");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{d:{b:{a:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class ParentMergeNonRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}");
+
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "bb");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // Child does not currently replace parent in this case. Also, the tree
+ // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class ParentMergeNonRightToRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}");
+
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "ff");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // Child does not currently replace parent in this case. Also, the tree
+ // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class CantMergeRightNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},"
+ "dd:null,"
+ "_:{f:{e:null},h:{g:null}}}");
+
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "bb");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{d:{b:{a:null},cc:{c:null}},"
+ "dd:null,"
+ "_:{f:{e:null},h:{g:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class CantMergeLeftNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}");
+
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "g");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{c:{b:{a:null}},d:null,_:{f:{e:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeOption : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}");
+
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "ee");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class ForceMergeLeft : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}");
+
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "ee");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class ForceMergeRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}");
+
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(8, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "ee");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class RecursiveMerge : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}");
+
+ ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "c");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ // Height is not currently reduced in this case
+ builder.checkStructure("{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class RecursiveMergeRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}");
+
+ ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "c");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class RecursiveMergeDoubleRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}");
+
+ ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "c");
+ verify(this->unindex(k));
+
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ // no recursion currently in this case
+ builder.checkStructure("{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}");
+ }
+ };
+
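+ // Builds a root with two leaf children filled to exact byte sizes (leftSize()
+ // and rightSize()), deletes the keys named by delKeys(), and then checks
+ // whether the leaves merged. Subclasses vary the sizes around lowWaterMark()
+ // to probe the merge threshold.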
+ template<class OnDiskFormat>
+ class MergeSizeTestBase : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ MergeSizeTestBase() : _count(0) {
+
+ }
+
+ void run() {
+ OperationContextNoop txn;
+ this->_helper.btree.initAsEmpty(&txn);
+
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ const BSONObj& topKey = biggestKey('m');
+
+ DiskLoc leftChild = this->newBucket();
+ builder.push(this->_helper.headManager.getHead(), topKey, leftChild);
+ _count++;
+
+ DiskLoc rightChild = this->newBucket();
+ this->setBucketNextChild(this->_helper.headManager.getHead(), rightChild);
+
+ _count += builder.fillBucketToExactSize(leftChild, leftSize(), 'a');
+ _count += builder.fillBucketToExactSize(rightChild, rightSize(), 'n');
+
+ ASSERT(leftAdditional() <= 2);
+ if (leftAdditional() >= 2) {
+ builder.push(leftChild, bigKey('k'), DiskLoc());
+ }
+ if (leftAdditional() >= 1) {
+ builder.push(leftChild, bigKey('l'), DiskLoc());
+ }
+
+ ASSERT(rightAdditional() <= 2);
+ if (rightAdditional() >= 2) {
+ builder.push(rightChild, bigKey('y'), DiskLoc());
+ }
+ if (rightAdditional() >= 1) {
+ builder.push(rightChild, bigKey('z'), DiskLoc());
+ }
+
+ _count += leftAdditional() + rightAdditional();
+
+ initCheck();
+
+ const char *keys = delKeys();
+ for (const char *i = keys; *i; ++i) {
+ long long unused = 0;
+ ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+ ASSERT_EQUALS(0, unused);
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = bigKey(*i);
+                ASSERT(this->unindex(k));
+
+ --_count;
+ }
+
+ long long unused = 0;
+ ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+ ASSERT_EQUALS(0, unused);
+
+ validate();
+
+ if (!merge()) {
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+ }
+ else {
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords());
+ }
+ }
+
+ protected:
+ virtual int leftAdditional() const { return 2; }
+ virtual int rightAdditional() const { return 2; }
+ virtual void initCheck() {}
+ virtual void validate() {}
+ virtual int leftSize() const = 0;
+ virtual int rightSize() const = 0;
+ virtual const char * delKeys() const { return "klyz"; }
+ virtual bool merge() const { return true; }
+
+ static BSONObj bigKey(char a) {
+ return simpleKey(a, 801);
+ }
+
+ static BSONObj biggestKey(char a) {
+ int size = OnDiskFormat::KeyMax - bigSize() + 801;
+ return simpleKey(a, size);
+ }
+
+ static int bigSize() {
+ return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(bigKey('a')).dataSize();
+ }
+
+ static int biggestSize() {
+ return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(biggestKey('a')).dataSize();
+ }
+
+ int _count;
+ };
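+
+    // The MergeSize* variants below tune leftSize() and rightSize() around
+    // BtreeLogic<OnDiskFormat>::lowWaterMark(). The "JustRight" sizes sit at the
+    // threshold where the two children still merge after the delKeys()
+    // deletions; the one-byte perturbations in the variants that follow either
+    // keep the merge possible (MergeSizeRight/Left) or suppress it
+    // (NoMergeBelowMark*, MergeSize*TooBig), as their merge() overrides state.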
+
+ template<class OnDiskFormat>
+ class MergeSizeJustRightRight : public MergeSizeTestBase<OnDiskFormat> {
+ protected:
+ virtual int rightSize() const {
+ return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
+ }
+
+ virtual int leftSize() const {
+ return OnDiskFormat::BucketBodySize -
+ MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
+ (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeSizeJustRightLeft : public MergeSizeTestBase<OnDiskFormat> {
+ protected:
+ virtual int leftSize() const {
+ return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
+ }
+
+ virtual int rightSize() const {
+ return OnDiskFormat::BucketBodySize -
+ MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
+ (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
+ }
+
+ virtual const char * delKeys() const { return "yzkl"; }
+ };
+
+ template<class OnDiskFormat>
+ class MergeSizeRight : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() - 1; }
+ virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; }
+ };
+
+ template<class OnDiskFormat>
+ class MergeSizeLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; }
+ virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() - 1; }
+ };
+
+ template<class OnDiskFormat>
+ class NoMergeBelowMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; }
+ virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() - 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ template<class OnDiskFormat>
+ class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() - 1; }
+ virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ template<class OnDiskFormat>
+ class MergeSizeRightTooBig : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ template<class OnDiskFormat>
+ class MergeSizeLeftTooBig : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; }
+ virtual bool merge() const { return false; }
+ };
+
+ template<class OnDiskFormat>
+ class MergeRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
+ protected:
+ virtual int rightAdditional() const { return 1; }
+ virtual int leftAdditional() const { return 1; }
+ virtual const char * delKeys() const { return "lz"; }
+ virtual int rightSize() const { return 0; }
+ virtual int leftSize() const {
+ return OnDiskFormat::BucketBodySize -
+ MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeMinRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
+ protected:
+ virtual int rightAdditional() const { return 1; }
+ virtual int leftAdditional() const { return 0; }
+ virtual const char * delKeys() const { return "z"; }
+ virtual int rightSize() const { return 0; }
+ virtual int leftSize() const {
+ return MergeSizeTestBase<OnDiskFormat>::bigSize() +
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
+ protected:
+ virtual int rightAdditional() const { return 1; }
+ virtual int leftAdditional() const { return 1; }
+ virtual const char * delKeys() const { return "zl"; }
+ virtual int leftSize() const { return 0; }
+ virtual int rightSize() const {
+ return OnDiskFormat::BucketBodySize -
+ MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class MergeMinLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
+ protected:
+ virtual int leftAdditional() const { return 1; }
+ virtual int rightAdditional() const { return 0; }
+ virtual const char * delKeys() const { return "l"; }
+ virtual int leftSize() const { return 0; }
+ virtual int rightSize() const {
+ return MergeSizeTestBase<OnDiskFormat>::bigSize() +
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceRightEmpty : public MergeRightEmpty<OnDiskFormat> {
+ protected:
+ virtual int leftSize() const {
+ return OnDiskFormat::BucketBodySize -
+ MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
+ }
+
+ virtual bool merge() const { return false; }
+
+ virtual void initCheck() {
+ _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson();
+ }
+
+ virtual void validate() {
+ ASSERT_NOT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson());
+ }
+
+ private:
+ BSONObj _oldTop;
+ };
+
+ template<class OnDiskFormat>
+ class BalanceLeftEmpty : public MergeLeftEmpty<OnDiskFormat> {
+ protected:
+ virtual int rightSize() const {
+ return OnDiskFormat::BucketBodySize -
+ MergeSizeTestBase<OnDiskFormat>::biggestSize() -
+ sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
+ }
+
+ virtual bool merge() const { return false; }
+
+ virtual void initCheck() {
+ _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson();
+ }
+
+ virtual void validate() {
+            ASSERT_NOT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson());
+ }
+
+ private:
+ BSONObj _oldTop;
+ };
+
+ template<class OnDiskFormat>
+ class BalanceOneLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "b:{$20:null,$30:null,$40:null,$50:null,a:null},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x40, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
+ "b:{$10:null,$20:null,$30:null,$50:null,a:null},"
+ "_:{c:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceOneRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null},"
+ "b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x3, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$20:{$1:null,$2:null,$4:null,$10:null},"
+ "b:{$30:null,$40:null,$50:null,$60:null,$70:null},"
+ "_:{c:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceThreeLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},"
+ "$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}},"
+ "b:{$30:null,$40:{$35:null},$50:{$45:null}},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(23, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(15, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x30, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(15, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$9:{$1:{$0:null},$3:{$2:null},"
+ "$5:{$4:null},$7:{$6:null},_:{$8:null}},"
+ "b:{$11:{$10:null},$13:{$12:null},$20:{$14:null},"
+ "$40:{$35:null},$50:{$45:null}},"
+ "_:{c:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceThreeRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}},"
+ "b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null},"
+ "$70:{$65:null},$80:{$75:null},"
+ "$90:{$85:null},$100:{$95:null}},"
+ "_:{c:null}}");
+
+ ASSERT_EQUALS(25, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(16, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x5, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(24, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(16, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null},"
+ "$30:{$25:null},$40:{$35:null},_:{$45:null}},"
+ "b:{$60:{$55:null},$70:{$65:null},$80:{$75:null},"
+ "$90:{$85:null},$100:{$95:null}},"
+ "_:{c:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceSingleParentKey : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x40, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
+ "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class PackEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null}");
+
+ const BSONObj k = BSON("" << "a");
+ ASSERT(this->unindex(k));
+
+ this->forcePackBucket(this->_helper.headManager.getHead());
+
+ typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
+
+ ASSERT_EQUALS(0, headBucket->n);
+ ASSERT_FALSE(headBucket->flags & Packed);
+
+ int unused = 0;
+ this->truncateBucket(headBucket, 0, unused);
+
+ ASSERT_EQUALS(0, headBucket->n);
+ ASSERT_EQUALS(0, headBucket->topSize);
+ ASSERT_EQUALS((int)OnDiskFormat::BucketBodySize, headBucket->emptySize);
+ ASSERT_TRUE(headBucket->flags & Packed);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class PackedDataSizeEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null}");
+
+ const BSONObj k = BSON("" << "a");
+ ASSERT(this->unindex(k));
+
+ this->forcePackBucket(this->_helper.headManager.getHead());
+
+ typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
+
+ ASSERT_EQUALS(0, headBucket->n);
+ ASSERT_FALSE(headBucket->flags & Packed);
+ ASSERT_EQUALS(0, this->bucketPackedDataSize(headBucket, 0));
+ ASSERT_FALSE(headBucket->flags & Packed);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceSingleParentKeyPackParent : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ // force parent pack
+ this->forcePackBucket(this->_helper.headManager.getHead());
+
+ const BSONObj k = BSON("" << bigNumString(0x40, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
+ "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class BalanceSplitParent : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(
+ "{$10$10:{$1:null,$2:null,$3:null,$4:null},"
+ "$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null},"
+ "$200:null,$300:null,$400:null,$500:null,$600:null,"
+ "$700:null,$800:null,$900:null,_:{c:null}}");
+
+ ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x3, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(21, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$500:{ $30:{$1:null,$2:null,$4:null,$10$10:null,$20:null},"
+ "$100:{$40:null,$50:null,$60:null,$70:null,$80:null},"
+ "$200:null,$300:null,$400:null},"
+ "_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class RebalancedSeparatorBase : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree(treeSpec());
+ modTree();
+
+ ASSERT_EQUALS(expectedSeparator(),
+ this->bucketRebalancedSeparatorPos(
+ this->_helper.headManager.getHead(), 0));
+ }
+
+ virtual string treeSpec() const = 0;
+ virtual int expectedSeparator() const = 0;
+ virtual void modTree() {}
+ };
+
+ template<class OnDiskFormat>
+ class EvenRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$7:{$1:null,$2$31f:null,$3:null,"
+ "$4$31f:null,$5:null,$6:null},"
+ "_:{$8:null,$9:null,$10$31e:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class EvenRebalanceLeftCusp : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const {
+ return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null},"
+ "_:{$7:null,$8:null,$9$31e:null,$10:null}}";
+ }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class EvenRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class EvenRebalanceRightCusp : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class EvenRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class OddRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class OddRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class OddRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:null}}"; }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class RebalanceEmptyRight : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$b:null}}"; }
+ virtual void modTree() {
+ BSONObj k = BSON("" << bigNumString(0xb, 800));
+ ASSERT(this->unindex(k));
+ }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class RebalanceEmptyLeft : public RebalancedSeparatorBase<OnDiskFormat> {
+ virtual string treeSpec() const { return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$18:null,$19:null}}"; }
+ virtual void modTree() {
+ BSONObj k = BSON("" << bigNumString(0x1, 800));
+ ASSERT(this->unindex(k));
+ }
+ virtual int expectedSeparator() const { return 4; }
+ };
+
+ template<class OnDiskFormat>
+ class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; }
+
+ virtual void initCheck() {
+ _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson();
+ }
+
+ virtual void validate() {
+ ASSERT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson());
+ }
+
+ virtual bool merge() const { return false; }
+
+ protected:
+ BSONObj _oldTop;
+ };
+
+ template<class OnDiskFormat>
+ class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight<OnDiskFormat> {
+ virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize(); }
+ virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; }
+
+ virtual void validate() {
+ // Different top means we rebalanced
+ ASSERT_NOT_EQUALS(this->_oldTop,
+ this->getKey(this->_helper.headManager.getHead(), 0).data.toBson());
+ }
+ };
+
+ template<class OnDiskFormat>
+ class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
+ virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; }
+ virtual void initCheck() {
+ this->_oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson();
+ }
+
+ virtual void validate() {
+ ASSERT_EQUALS(this->_oldTop,
+ this->getKey(this->_helper.headManager.getHead(), 0).data.toBson());
+ }
+ virtual bool merge() const { return false; }
+
+ protected:
+ BSONObj _oldTop;
+ };
+
+ template<class OnDiskFormat>
+ class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft<OnDiskFormat> {
+ virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize(); }
+ virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; }
+
+ virtual void validate() {
+ // Different top means we rebalanced
+ ASSERT_NOT_EQUALS(this->_oldTop,
+ this->getKey(this->_helper.headManager.getHead(), 0).data.toBson());
+ }
+ };
+
+ template<class OnDiskFormat>
+ class PreferBalanceLeft : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
+ "$20:{$11:null,$12:null,$13:null,$14:null},"
+ "_:{$30:null}}");
+
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x12, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$5:{$1:null,$2:null,$3:null,$4:null},"
+ "$20:{$6:null,$10:null,$11:null,$13:null,$14:null},"
+ "_:{$30:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class PreferBalanceRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$1:null},"
+ "$20:{$11:null,$12:null,$13:null,$14:null},"
+ "_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}");
+
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x12, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{$10:{$1:null},"
+ "$31:{$11:null,$13:null,$14:null,$20:null},"
+ "_:{$32:null,$33:null,$34:null,$35:null,$36:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class RecursiveMergeThenBalance : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}},"
+ "_:{$20:null,$30:null,$40:null,$50:null,"
+ "$60:null,$70:null,$80:null,$90:null}}");
+
+ ASSERT_EQUALS(15, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << bigNumString(0x7, 800));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure(
+ "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null},"
+ "_:{$50:null,$60:null,$70:null,$80:null,$90:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelEmptyNoNeighbors : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{b:{a:null}}");
+
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "a");
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{b:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelEmptyEmptyNeighbors : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,c:{b:null},d:null}");
+
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords());
+
+ const BSONObj k = BSON("" << "b");
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0));
+
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords());
+
+ builder.checkStructure("{a:null,c:null,d:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternal : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,c:{b:null},d:null}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << "c");
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure("{a:null,b:null,d:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalReplaceWithUnused : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,c:{b:null},d:null}");
+
+ const DiskLoc prevChildBucket =
+ this->getKey(this->_helper.headManager.getHead(), 1).prevChildBucket;
+ this->markKeyUnused(prevChildBucket, 0);
+
+ long long unused = 0;
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(1, unused);
+
+ const BSONObj k = BSON("" << "c");
+ ASSERT(this->unindex(k));
+
+ unused = 0;
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(1, unused);
+
+            // checkStructure() does not discriminate between used and unused keys
+ builder.checkStructure("{a:null,b:null,d:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalReplaceRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,_:{b:null}}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << "a");
+ ASSERT(this->unindex(k));
+
+ unused = 0;
+ ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(2, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure("{b:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalPromoteKey : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(6, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << "y");
+ ASSERT(this->unindex(k));
+
+ unused = 0;
+ ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure("{a:null,e:{c:{b:null},d:null},z:null}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalPromoteRightKey : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,_:{e:{c:null},_:{f:null}}}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << "a");
+ ASSERT(this->unindex(k));
+
+ unused = 0;
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(3, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure("{c:null,_:{e:null,f:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalReplacementPrevNonNull : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,d:{c:{b:null}},e:null}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(5, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << "d");
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(1, unused);
+
+ builder.checkStructure("{a:null,d:{c:{b:null}},e:null}");
+
+ // Check 'unused' key
+ ASSERT(this->getKey(this->_helper.headManager.getHead(), 1).recordLoc.getOfs() & 1);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalReplacementNextNonNull : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{a:null,_:{c:null,_:{d:null}}}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << "a");
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(4, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(1, unused);
+
+ builder.checkStructure("{a:null,_:{c:null,_:{d:null}}}");
+
+ // Check 'unused' key
+ ASSERT(this->getKey(this->_helper.headManager.getHead(), 0).recordLoc.getOfs() & 1);
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalSplitPromoteLeft : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}},"
+ "$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << bigNumString(0x30, 0x10));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure("{$60:{$10:null,$20:null,"
+ "$27:{$23:null,$25:null},$40:null,$50:null},"
+ "_:{$70:null,$80:null,$90:null,$100:null}}");
+ }
+ };
+
+ template<class OnDiskFormat>
+ class DelInternalSplitPromoteRight : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ OperationContextNoop txn;
+ ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper);
+
+ builder.makeTree("{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,"
+ "$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}");
+
+ long long unused = 0;
+ ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ const BSONObj k = BSON("" << bigNumString(0x100, 0x10));
+ ASSERT(this->unindex(k));
+
+ ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0));
+
+ // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
+ ASSERT_EQUALS(5, this->_helper.recordStore.numRecords());
+ ASSERT_EQUALS(0, unused);
+
+ builder.checkStructure(
+ "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
+ "_:{$90:null,$97:{$93:null,$95:null}}}");
+ }
+ };
+
+    /* This test requires the entire server to be linked in, and it is better implemented using
+       the JS framework. It is disabled here and will be ported to jsCore.
+
+ template<class OnDiskFormat>
+ class SignedZeroDuplication : public BtreeLogicTestBase<OnDiskFormat> {
+ public:
+ void run() {
+ ASSERT_EQUALS(0.0, -0.0);
+ DBDirectClient c;
+
+ static const string ns("unittests.SignedZeroDuplication");
+
+ c.ensureIndex(ns, BSON("b" << 1), true);
+ c.insert(ns, BSON("b" << 0.0));
+ c.insert(ns, BSON("b" << 1.0));
+ c.update(ns, BSON("b" << 1.0), BSON("b" << -0.0));
+
+ ASSERT_EQUALS(1U, c.count(ns, BSON("b" << 0.0)));
+ }
+ };
+ */
+
+/*
+// QUERY_MIGRATION: port later
+ class PackUnused : public Base {
+ public:
+ void run() {
+ for ( long long i = 0; i < 1000000; i += 1000 ) {
+ insert( i );
+ }
+ string orig, after;
+ {
+ stringstream ss;
+ bt()->shape( ss );
+ orig = ss.str();
+ }
+ vector< string > toDel;
+ vector< string > other;
+ BSONObjBuilder start;
+ start.appendMinKey( "a" );
+ BSONObjBuilder end;
+ end.appendMaxKey( "a" );
+ auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ),
+ id(),
+ start.done(),
+ end.done(),
+ false,
+ 1 ) );
+ while( c->ok() ) {
+                bool hasNoChild =
+                    c->getBucket().btree()->keyNode(c->getKeyOfs()).prevChildBucket.isNull();
+
+                if (hasNoChild) {
+ toDel.push_back( c->currKey().firstElement().valuestr() );
+ }
+ else {
+ other.push_back( c->currKey().firstElement().valuestr() );
+ }
+ c->advance();
+ }
+ ASSERT( toDel.size() > 0 );
+ for( vector< string >::const_iterator i = toDel.begin(); i != toDel.end(); ++i ) {
+ BSONObj o = BSON( "a" << *i );
+ this->unindex( o );
+ }
+ ASSERT( other.size() > 0 );
+ for( vector< string >::const_iterator i = other.begin(); i != other.end(); ++i ) {
+ BSONObj o = BSON( "a" << *i );
+ this->unindex( o );
+ }
+
+ long long unused = 0;
+ ASSERT_EQUALS( 0, bt()->fullValidate(&txn, dl(), order(), &unused, true ) );
+
+ for ( long long i = 50000; i < 50100; ++i ) {
+ insert( i );
+ }
+
+ long long unused2 = 0;
+ ASSERT_EQUALS( 100, bt()->fullValidate(&txn, dl(), order(), &unused2, true ) );
+
+// log() << "old unused: " << unused << ", new unused: " << unused2 << endl;
+//
+ ASSERT( unused2 <= unused );
+ }
+ protected:
+ void insert( long long n ) {
+ string val = bigNumString( n );
+ BSONObj k = BSON( "a" << val );
+ Base::insert( k );
+ }
+ };
+
+ class DontDropReferenceKey : public PackUnused {
+ public:
+ void run() {
+ // with 80 root node is full
+ for ( long long i = 0; i < 80; i += 1 ) {
+ insert( i );
+ }
+
+ BSONObjBuilder start;
+ start.appendMinKey( "a" );
+ BSONObjBuilder end;
+ end.appendMaxKey( "a" );
+ BSONObj l = bt()->keyNode( 0 ).key.toBson();
+ string toInsert;
+ auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ),
+ id(),
+ start.done(),
+ end.done(),
+ false,
+ 1 ) );
+ while( c->ok() ) {
+ if ( c->currKey().woCompare( l ) > 0 ) {
+ toInsert = c->currKey().firstElement().valuestr();
+ break;
+ }
+ c->advance();
+ }
+ // too much work to try to make this happen through inserts and deletes
+ // we are intentionally manipulating the btree bucket directly here
+ BtreeBucket::Loc* L = const_cast< BtreeBucket::Loc* >( &bt()->keyNode( 1 ).prevChildBucket );
+ getDur().writing(L)->Null();
+ getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused
+ BSONObj k = BSON( "a" << toInsert );
+ Base::insert( k );
+ }
+ };
+ */
+
+ //
+ // TEST SUITE DEFINITION
+ //
+
+ template<class OnDiskFormat>
+ class BtreeLogicTestSuite : public unittest::Suite {
+ public:
+        BtreeLogicTestSuite(const std::string& name) : Suite(name) { }
+
+ void setupTests() {
+ add< SimpleCreate<OnDiskFormat> >();
+ add< SimpleInsertDelete<OnDiskFormat> >();
+ add< SplitRightHeavyBucket<OnDiskFormat> >();
+ add< SplitLeftHeavyBucket<OnDiskFormat> >();
+ add< MissingLocate<OnDiskFormat> >();
+ add< MissingLocateMultiBucket<OnDiskFormat> >();
+ add< SERVER983<OnDiskFormat> >();
+ add< DontReuseUnused<OnDiskFormat> >();
+ add< MergeBucketsLeft<OnDiskFormat> >();
+ add< MergeBucketsRight<OnDiskFormat> >();
+ add< MergeBucketsDontReplaceHead<OnDiskFormat> >();
+ add< MergeBucketsDelInternal<OnDiskFormat> >();
+ add< MergeBucketsRightNull<OnDiskFormat> >();
+ add< DontMergeSingleBucket<OnDiskFormat> >();
+ add< ParentMergeNonRightToLeft<OnDiskFormat> >();
+ add< ParentMergeNonRightToRight<OnDiskFormat> >();
+ add< CantMergeRightNoMerge<OnDiskFormat> >();
+ add< CantMergeLeftNoMerge<OnDiskFormat> >();
+ add< MergeOption<OnDiskFormat> >();
+ add< ForceMergeLeft<OnDiskFormat> >();
+ add< ForceMergeRight<OnDiskFormat> >();
+ add< RecursiveMerge<OnDiskFormat> >();
+ add< RecursiveMergeRightBucket<OnDiskFormat> >();
+ add< RecursiveMergeDoubleRightBucket<OnDiskFormat> >();
+
+ add< MergeSizeJustRightRight<OnDiskFormat> >();
+ add< MergeSizeJustRightLeft<OnDiskFormat> >();
+ add< MergeSizeRight<OnDiskFormat> >();
+ add< MergeSizeLeft<OnDiskFormat> >();
+ add< NoMergeBelowMarkRight<OnDiskFormat> >();
+ add< NoMergeBelowMarkLeft<OnDiskFormat> >();
+ add< MergeSizeRightTooBig<OnDiskFormat> >();
+ add< MergeSizeLeftTooBig<OnDiskFormat> >();
+ add< MergeRightEmpty<OnDiskFormat> >();
+ add< MergeMinRightEmpty<OnDiskFormat> >();
+ add< MergeLeftEmpty<OnDiskFormat> >();
+ add< MergeMinLeftEmpty<OnDiskFormat> >();
+ add< BalanceRightEmpty<OnDiskFormat> >();
+ add< BalanceLeftEmpty<OnDiskFormat> >();
+
+ add< BalanceOneLeftToRight<OnDiskFormat> >();
+ add< BalanceOneRightToLeft<OnDiskFormat> >();
+ add< BalanceThreeLeftToRight<OnDiskFormat> >();
+ add< BalanceThreeRightToLeft<OnDiskFormat> >();
+ add< BalanceSingleParentKey<OnDiskFormat> >();
+
+ add< PackEmptyBucket<OnDiskFormat> >();
+ add< PackedDataSizeEmptyBucket<OnDiskFormat> >();
+
+ add< BalanceSingleParentKeyPackParent<OnDiskFormat> >();
+ add< BalanceSplitParent<OnDiskFormat> >();
+ add< EvenRebalanceLeft<OnDiskFormat> >();
+ add< EvenRebalanceLeftCusp<OnDiskFormat> >();
+ add< EvenRebalanceRight<OnDiskFormat> >();
+ add< EvenRebalanceRightCusp<OnDiskFormat> >();
+ add< EvenRebalanceCenter<OnDiskFormat> >();
+ add< OddRebalanceLeft<OnDiskFormat> >();
+ add< OddRebalanceRight<OnDiskFormat> >();
+ add< OddRebalanceCenter<OnDiskFormat> >();
+ add< RebalanceEmptyRight<OnDiskFormat> >();
+ add< RebalanceEmptyLeft<OnDiskFormat> >();
+
+ add< NoMoveAtLowWaterMarkRight<OnDiskFormat> >();
+ add< MoveBelowLowWaterMarkRight<OnDiskFormat> >();
+ add< NoMoveAtLowWaterMarkLeft<OnDiskFormat> >();
+ add< MoveBelowLowWaterMarkLeft<OnDiskFormat> >();
+
+ add< PreferBalanceLeft<OnDiskFormat> >();
+ add< PreferBalanceRight<OnDiskFormat> >();
+ add< RecursiveMergeThenBalance<OnDiskFormat> >();
+ add< DelEmptyNoNeighbors<OnDiskFormat> >();
+ add< DelEmptyEmptyNeighbors<OnDiskFormat> >();
+ add< DelInternal<OnDiskFormat> >();
+ add< DelInternalReplaceWithUnused<OnDiskFormat> >();
+ add< DelInternalReplaceRight<OnDiskFormat> >();
+ add< DelInternalPromoteKey<OnDiskFormat> >();
+ add< DelInternalPromoteRightKey<OnDiskFormat> >();
+ add< DelInternalReplacementPrevNonNull<OnDiskFormat> >();
+ add< DelInternalReplacementNextNonNull<OnDiskFormat> >();
+ add< DelInternalSplitPromoteLeft<OnDiskFormat> >();
+ add< DelInternalSplitPromoteRight<OnDiskFormat> >();
+ }
+ };
+
+ // Test suite for both V0 and V1
+ static BtreeLogicTestSuite<BtreeLayoutV0> SUITE_V0("BTreeLogicTests_V0");
+ static BtreeLogicTestSuite<BtreeLayoutV1> SUITE_V1("BTreeLogicTests_V1");
+}
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h
new file mode 100644
index 00000000000..7f91cd2fb27
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h
@@ -0,0 +1,380 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+#include "mongo/db/jsobj.h"
+#include "mongo/db/storage/mmap_v1/btree/key.h"
+
+namespace mongo {
+
+ const int OldBucketSize = 8192;
+
+ //
+ // On-disk index format
+ //
+
+#pragma pack(1)
+ /**
+     * This is the fixed width data component for storage of a key within a bucket. It contains an
+     * offset pointer to the variable width bson data component. A key may be marked 'unused'; see
+     * below.
+     *
+     * Why is this templated on Loc? Because V0 and V1 use DiskLocs of different sizes but
+     * otherwise have the same layout.
+ */
+ template <class LocType>
+ struct FixedWidthKey {
+ //
+ // Data
+ //
+
+ /**
+ * The 'left' child bucket of this key. If this is the i-th key, it points to the i index
+ * child bucket.
+ */
+ LocType prevChildBucket;
+
+ /**
+ * The location of the record associated with this key.
+ */
+ LocType recordLoc;
+
+ /**
+         * Offset within the current bucket of the variable width bson key for this FixedWidthKey.
+ */
+ unsigned short _kdo;
+
+ //
+ // Accessors / mutators
+ //
+
+ short keyDataOfs() const {
+ return static_cast<short>(_kdo);
+ }
+
+        void setKeyDataOfs(short s) {
+            invariant(s >= 0);
+            _kdo = s;
+        }
+
+ void setKeyDataOfsSavingUse(short s) {
+ // XXX kill this func
+ setKeyDataOfs(s);
+ }
+
+ /**
+ * Unused keys are not returned by read operations. Keys may be marked
+ * as unused in cases where it is difficult to delete them while
+ * maintaining the constraints required of a btree.
+ *
+         * Setting ofs to an odd value is the sentinel for unused, as real
+         * recordLocs are always even numbers. Note the value must otherwise be
+         * kept the same, because the recordLoc is used as part of the key in
+         * the index (to handle duplicate keys efficiently).
+ *
+ * Flagging keys as unused is a feature that is being phased out in favor
+ * of deleting the keys outright. The current btree implementation is
+         * not expected to mark a key as unused in a non-legacy btree.
+ */
+ void setUnused() {
+ recordLoc.GETOFS() |= 1;
+ }
+
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
+
+ int isUnused() const {
+ return recordLoc.getOfs() & 1;
+ }
+
+ int isUsed() const {
+ return !isUnused();
+ }
+ };
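+
+    /**
+     * Worked example of the 'unused' flag above: for a key whose recordLoc has
+     * ofs == 4096, isUnused() evaluates 4096 & 1 == 0, so the key is used.
+     * setUnused() ORs in the low bit (ofs becomes 4097, odd) and setUsed()
+     * clears it again; the remaining bits, and hence the key's position among
+     * duplicates, are left untouched.
+     */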
+
+ /**
+ * This structure represents header data for a btree bucket. An object of
+     * this type is typically allocated inside a buffer of size BucketSize,
+     * resulting in a full bucket with an appropriate header.
+     *
+     * The body of a btree bucket contains an array of FixedWidthKey objects
+     * starting from its lowest indexed bytes and growing toward higher indexed
+     * bytes. The body also contains variable width bson keys, which are
+     * allocated from the highest indexed bytes toward lower indexed bytes.
+ *
+ * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
+ * h = header data
+ * k = KeyNode data
+ * - = empty space
+ * b = bson key data
+ * u = unused (old) bson key data, that may be garbage collected
+ */
+ struct BtreeBucketV0 {
+ /**
+ * Parent bucket of this bucket, which isNull() for the root bucket.
+ */
+ DiskLoc parent;
+
+ /**
+ * Given that there are n keys, this is the n index child.
+ */
+ DiskLoc nextChild;
+
+ /**
+         * Can be reused. The value is 8192 in the current pdfile version (Apr 2010).
+ */
+ unsigned short _wasSize;
+
+ /**
+ * zero
+ */
+ unsigned short _reserved1;
+
+ int flags;
+
+ /** basicInsert() assumes the next three members are consecutive and in this order: */
+
+ /** Size of the empty region. */
+ int emptySize;
+
+ /** Size used for bson storage, including storage of old keys. */
+ int topSize;
+
+ /* Number of keys in the bucket. */
+ int n;
+
+ int reserved;
+
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ // Precalculated size constants
+ enum { HeaderSize = 40 };
+ };
+
+ // BtreeBucketV0 is part of the on-disk format, so it should never be changed
+ BOOST_STATIC_ASSERT(
+ sizeof(BtreeBucketV0) - sizeof(reinterpret_cast<BtreeBucketV0*>(NULL)->data)
+ == BtreeBucketV0::HeaderSize);
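+
+    // Worked size check: with #pragma pack(1) in effect, the V0 header is
+    // 8 (parent) + 8 (nextChild) + 2 (_wasSize) + 2 (_reserved1) + 4 (flags) +
+    // 4 (emptySize) + 4 (topSize) + 4 (n) + 4 (reserved) = 40 bytes, matching
+    // BtreeBucketV0::HeaderSize.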
+
+ /**
+     * A variant of DiskLoc used by the V1 bucket type.
+ */
+ struct DiskLoc56Bit {
+ //
+ // Data
+ //
+
+ int ofs;
+
+ unsigned char _a[3];
+
+ //
+ // Accessors XXX rename these, this is terrible
+ //
+
+ int& GETOFS() { return ofs; }
+
+ int getOfs() const { return ofs; }
+
+ //
+ // Comparison
+ //
+
+ bool isNull() const { return ofs < 0; }
+
+ unsigned long long toLongLong() const {
+            // Endian-sensitive: on a little-endian host the low 4 bytes of the
+            // result come from 'ofs' and bytes 4-6 from the 3-byte file number in _a.
+ unsigned long long result = ofs;
+ char* cursor = reinterpret_cast<char *>(&result);
+ *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]);
+ *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]);
+ *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0);
+ return result;
+ }
+
+ bool operator<(const DiskLoc56Bit& rhs) const {
+            // The ordering of dup keys in btrees isn't too critical, but we'd like to put items
+            // that are close together on disk close together in the tree, so we do want the file #
+            // to be the most significant bytes
+ return toLongLong() < rhs.toLongLong();
+ }
+
+ int compare(const DiskLoc56Bit& rhs) const {
+ unsigned long long a = toLongLong();
+ unsigned long long b = rhs.toLongLong();
+ if ( a < b ) {
+ return -1;
+ }
+ else {
+ return a == b ? 0 : 1;
+ }
+ }
+
+ bool operator==(const DiskLoc56Bit& rhs) const {
+ return toLongLong() == rhs.toLongLong();
+ }
+
+ bool operator!=(const DiskLoc56Bit& rhs) const {
+ return toLongLong() != rhs.toLongLong();
+ }
+
+ bool operator==(const DiskLoc& rhs) const {
+ return DiskLoc(*this) == rhs;
+ }
+
+ bool operator!=(const DiskLoc& rhs) const {
+ return !(*this==rhs);
+ }
+
+ //
+ // Mutation
+ //
+
+ enum {
+            // The low bit of an offset is used as the 'unused' flag in
+            // FixedWidthKey, so we don't use -1 (an odd value) here.
+ OurNullOfs = -2
+ };
+
+ void Null() {
+ ofs = OurNullOfs;
+ _a[0] = _a[1] = _a[2] = 0;
+ }
+
+ void operator=(const DiskLoc& loc) {
+ ofs = loc.getOfs();
+ int la = loc.a();
+ invariant( la <= 0xffffff ); // must fit in 3 bytes
+ if( la < 0 ) {
+ if ( la != -1 ) {
+ log() << "btree diskloc isn't negative 1: " << la << std::endl;
+ invariant ( la == -1 );
+ }
+ la = 0;
+ ofs = OurNullOfs;
+ }
+            memcpy(_a, &la, 3); // endian: stores the low 3 bytes of 'la' on a little-endian host
+ }
+
+ //
+ // Type Conversion
+ //
+
+ operator const DiskLoc() const {
+            // Endian-sensitive: read 4 bytes starting one byte before _a (overlapping
+            // the last byte of ofs) and shift off that byte to recover the file number.
+ if( isNull() ) return DiskLoc();
+ unsigned a = *((unsigned *) (_a-1));
+ return DiskLoc(a >> 8, ofs);
+ }
+
+ std::string toString() const { return DiskLoc(*this).toString(); }
+ };
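+
+    /**
+     * Worked example: assuming the little-endian layout the code above relies
+     * on, assigning a DiskLoc with file number 2 and offset 4096 stores the
+     * offset in the low 32 bits and the 3-byte file number above it, so
+     * toLongLong() == (2ULL << 32) | 4096. With the file number in the most
+     * significant bytes, operator< orders locations file-first and then by
+     * offset, which is the "close on disk, close in the tree" property noted
+     * above.
+     */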
+
+ struct BtreeBucketV1 {
+ /** Parent bucket of this bucket, which isNull() for the root bucket. */
+ DiskLoc56Bit parent;
+
+ /** Given that there are n keys, this is the n index child. */
+ DiskLoc56Bit nextChild;
+
+ unsigned short flags;
+
+ /** Size of the empty region. */
+ unsigned short emptySize;
+
+ /** Size used for bson storage, including storage of old keys. */
+ unsigned short topSize;
+
+ /* Number of keys in the bucket. */
+ unsigned short n;
+
+ /* Beginning of the bucket's body */
+ char data[4];
+
+ // Precalculated size constants
+ enum { HeaderSize = 22 };
+ };
+
+ // BtreeBucketV1 is part of the on-disk format, so it should never be changed
+ BOOST_STATIC_ASSERT(
+ sizeof(BtreeBucketV1) - sizeof(reinterpret_cast<BtreeBucketV1*>(NULL)->data)
+ == BtreeBucketV1::HeaderSize);
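+
+    // Worked size check: DiskLoc56Bit packs into 4 + 3 = 7 bytes, so the V1
+    // header is 7 (parent) + 7 (nextChild) + 2 (flags) + 2 (emptySize) +
+    // 2 (topSize) + 2 (n) = 22 bytes, matching BtreeBucketV1::HeaderSize.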
+
+ enum Flags {
+ Packed = 1
+ };
+
+ struct BtreeLayoutV0 {
+ typedef FixedWidthKey<DiskLoc> FixedWidthKeyType;
+ typedef DiskLoc LocType;
+ typedef KeyBson KeyType;
+ typedef KeyBson KeyOwnedType;
+ typedef BtreeBucketV0 BucketType;
+
+ enum { BucketSize = 8192,
+ BucketBodySize = BucketSize - BucketType::HeaderSize
+ };
+
+        // Largest key size we allow. Note we very much need to support bigger keys (somehow) in
+        // the future.
+
+ static const int KeyMax = OldBucketSize / 10;
+
+ // A sentinel value sometimes used to identify a deallocated bucket.
+ static const int INVALID_N_SENTINEL = -1;
+
+ static void initBucket(BucketType* bucket) {
+ bucket->_reserved1 = 0;
+ bucket->_wasSize = BucketSize;
+ bucket->reserved = 0;
+ }
+ };
+
+ struct BtreeLayoutV1 {
+ typedef FixedWidthKey<DiskLoc56Bit> FixedWidthKeyType;
+ typedef KeyV1 KeyType;
+ typedef KeyV1Owned KeyOwnedType;
+ typedef DiskLoc56Bit LocType;
+ typedef BtreeBucketV1 BucketType;
+
+ enum { BucketSize = 8192 - 16, // The -16 is to leave room for the Record header
+ BucketBodySize = BucketSize - BucketType::HeaderSize
+ };
+
+ static const int KeyMax = 1024;
+
+ // A sentinel value sometimes used to identify a deallocated bucket.
+ static const unsigned short INVALID_N_SENTINEL = 0xffff;
+
+ static void initBucket(BucketType* bucket) { }
+ };
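+
+    // Illustrative compile-time checks (not load-bearing): they restate the
+    // arithmetic implied by the definitions above, where
+    // BucketBodySize = BucketSize - HeaderSize for each layout.
+    BOOST_STATIC_ASSERT(BtreeLayoutV0::BucketBodySize == 8192 - 40);
+    BOOST_STATIC_ASSERT(BtreeLayoutV1::BucketBodySize == (8192 - 16) - 22);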
+
+#pragma pack()
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp
new file mode 100644
index 00000000000..99385d46e86
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp
@@ -0,0 +1,247 @@
+// btree_test_help.cpp : Helper functions for Btree unit-testing
+//
+
+/**
+ * Copyright (C) 2014 MongoDB
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h"
+
+#include "mongo/db/operation_context_noop.h"
+#include "mongo/unittest/unittest.h"
+
+
+namespace mongo {
+
+ string bigNumString(long long n, int len) {
+ char sub[17];
+ sprintf(sub, "%.16llx", n);
+ string val(len, ' ');
+ for (int i = 0; i < len; ++i) {
+ val[i] = sub[i % 16];
+ }
+ return val;
+ }
+
+ BSONObj simpleKey(char c, int n) {
+ BSONObjBuilder builder;
+ string val(n, c);
+ builder.append("a", val);
+ return builder.obj();
+ }
+
+ //
+ // BtreeLogicTestHelper
+ //
+
+ static BucketDeletionNotification dummyBucketDeletionNotification;
+
+ template <class OnDiskFormat>
+ BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order)
+ : recordStore("TestRecordStore"),
+ btree(&headManager,
+ &recordStore,
+ Ordering::make(order),
+ "TestIndex",
+ &dummyBucketDeletionNotification) {
+
+ static const string randomData("RandomStuff");
+
+        // Generate a valid record location for a "fake" record, which we will repeatedly use
+        // throughout the tests.
+ OperationContextNoop txn;
+ StatusWith<DiskLoc> s =
+ recordStore.insertRecord(&txn, randomData.c_str(), randomData.length(), false);
+
+ ASSERT_TRUE(s.isOK());
+ ASSERT_EQUALS(1, recordStore.numRecords());
+
+ dummyDiskLoc = s.getValue();
+ }
+
+
+ //
+ // ArtificialTreeBuilder
+ //
+
+ template <class OnDiskFormat>
+ void ArtificialTreeBuilder<OnDiskFormat>::makeTree(const string &spec) {
+ _helper->headManager.setHead(_txn, makeTree(fromjson(spec)));
+ }
+
+ template <class OnDiskFormat>
+ DiskLoc ArtificialTreeBuilder<OnDiskFormat>::makeTree(const BSONObj &spec) {
+ DiskLoc bucketLoc = _helper->btree._addBucket(_txn);
+ BucketType* bucket = _helper->btree.getBucket(bucketLoc);
+
+ BSONObjIterator i(spec);
+ while (i.more()) {
+ BSONElement e = i.next();
+ DiskLoc child;
+ if (e.type() == Object) {
+ child = makeTree(e.embeddedObject());
+ }
+
+ if (e.fieldName() == string("_")) {
+ bucket->nextChild = child;
+ }
+ else {
+ KeyDataOwnedType key(BSON("" << expectedKey(e.fieldName())));
+ _helper->btree._pushBack(bucket, _helper->dummyDiskLoc, key, child);
+ }
+ }
+
+ _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc);
+ return bucketLoc;
+ }
+
+ template <class OnDiskFormat>
+ void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const string &spec) const {
+ checkStructure(fromjson(spec), _helper->headManager.getHead());
+ }
+
+ template <class OnDiskFormat>
+ void ArtificialTreeBuilder<OnDiskFormat>::push(
+ const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child) {
+ KeyDataOwnedType k(key);
+ BucketType* bucket = _helper->btree.getBucket(bucketLoc);
+
+ _helper->btree._pushBack(bucket, _helper->dummyDiskLoc, k, child);
+ _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc);
+ }
+
+ template <class OnDiskFormat>
+ void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(
+ const BSONObj &spec, const DiskLoc node) const {
+ BucketType* bucket = _helper->btree.getBucket(node);
+
+ BSONObjIterator j(spec);
+ for (int i = 0; i < bucket->n; ++i) {
+ ASSERT(j.more());
+ BSONElement e = j.next();
+ KeyHeaderType kn = BtreeLogic<OnDiskFormat>::getKeyHeader(bucket, i);
+ string expected = expectedKey(e.fieldName());
+ ASSERT(isPresent(BSON("" << expected), 1));
+ ASSERT(isPresent(BSON("" << expected), -1));
+
+ // ASSERT_EQUALS(expected, kn.key.toBson().firstElement().valuestr());
+ if (kn.prevChildBucket.isNull()) {
+ ASSERT(e.type() == jstNULL);
+ }
+ else {
+ ASSERT(e.type() == Object);
+ checkStructure(e.embeddedObject(), kn.prevChildBucket);
+ }
+ }
+ if (bucket->nextChild.isNull()) {
+ // maybe should allow '_' field with null value?
+ ASSERT(!j.more());
+ }
+ else {
+ BSONElement e = j.next();
+ ASSERT_EQUALS(string("_"), e.fieldName());
+ ASSERT(e.type() == Object);
+ checkStructure(e.embeddedObject(), bucket->nextChild);
+ }
+ ASSERT(!j.more());
+ }
+
+ template <class OnDiskFormat>
+ bool ArtificialTreeBuilder<OnDiskFormat>::isPresent(const BSONObj &key, int direction) const {
+ int pos;
+ DiskLoc loc;
+ OperationContextNoop txn;
+ return _helper->btree.locate(&txn, key, _helper->dummyDiskLoc, direction, &pos, &loc);
+ }
+
+ // Static
+ template <class OnDiskFormat>
+ string ArtificialTreeBuilder<OnDiskFormat>::expectedKey(const char *spec) {
+ if (spec[0] != '$') {
+ return spec;
+ }
+ char *endPtr;
+
+ // parsing a long long is a pain, so just allow shorter keys for now
+ unsigned long long num = strtol(spec + 1, &endPtr, 16);
+ int len = 800;
+ if (*endPtr == '$') {
+ len = strtol(endPtr + 1, 0, 16);
+ }
+
+ return bigNumString(num, len);
+ }
+
+ template <class OnDiskFormat>
+ int ArtificialTreeBuilder<OnDiskFormat>::fillBucketToExactSize(
+ const DiskLoc bucketLoc, int targetSize, char startKey) {
+ ASSERT_FALSE(bucketLoc.isNull());
+
+ BucketType* bucket = _helper->btree.getBucket(bucketLoc);
+ ASSERT_EQUALS(0, bucket->n);
+
+ static const int bigSize = KeyDataOwnedType(simpleKey('a', 801)).dataSize();
+
+ int size = 0;
+ int keyCount = 0;
+ while (size < targetSize) {
+ int space = targetSize - size;
+ int nextSize = space - sizeof(FixedWidthKeyType);
+ verify(nextSize > 0);
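+            // (bigSize - 801) is the encoding overhead a key built by simpleKey() carries
+            // beyond its raw string length, so the shorter key below is sized to make its
+            // dataSize() come out to exactly nextSize.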
+
+ BSONObj newKey;
+ if (nextSize >= bigSize) {
+ newKey = simpleKey(startKey++, 801);
+ }
+ else {
+ newKey = simpleKey(startKey++, nextSize - (bigSize - 801));
+ }
+
+ push(bucketLoc, newKey, DiskLoc());
+
+ size += KeyDataOwnedType(newKey).dataSize() +
+ sizeof(FixedWidthKeyType);
+ keyCount += 1;
+ }
+
+ ASSERT_EQUALS(_helper->btree._packedDataSize(bucket, 0), targetSize);
+
+ return keyCount;
+ }
+
+ //
+ // This causes actual code to be generated for the usages of the templates in this file.
+ //
+
+ // V0 format.
+ template struct BtreeLogicTestHelper<BtreeLayoutV0>;
+ template class ArtificialTreeBuilder<BtreeLayoutV0>;
+
+ // V1 format.
+ template struct BtreeLogicTestHelper<BtreeLayoutV1>;
+ template class ArtificialTreeBuilder<BtreeLayoutV1>;
+}
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h
new file mode 100644
index 00000000000..52d468f053a
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h
@@ -0,0 +1,154 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "mongo/db/json.h"
+#include "mongo/db/storage/heap1/record_store_heap.h" // XXX why is this here?
+#include "mongo/db/storage/mmap_v1//btree/btree_logic.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h"
+
+
+namespace mongo {
+
+ /**
+ * Generates a string of the specified length containing repeated concatenation of the
+ * hexadecimal representation of the input value.
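+     *
+     * For example, bigNumString(0xabc, 20) yields "0000000000000abc0000".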
+ */
+ std::string bigNumString(long long n, int len);
+
+ /**
+     * Generates a key on field 'a' whose value is the character c repeated n times.
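+     *
+     * For example, simpleKey('x', 3) yields { "a": "xxx" }.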
+ */
+ BSONObj simpleKey(char c, int n = 1);
+
+ /**
+ * Simple head manager, which performs no validity checking or persistence.
+ */
+ class TestHeadManager : public HeadManager {
+ public:
+ virtual const DiskLoc getHead() const {
+ return _head;
+ }
+
+ virtual void setHead(OperationContext* txn, const DiskLoc newHead) {
+ _head = newHead;
+ }
+
+ private:
+ DiskLoc _head;
+ };
+
+
+ /**
+ * This structure encapsulates a Btree and all the infrastructure needed by it (head manager,
+ * record store and a valid disk location to use by the tests).
+ */
+ template <class OnDiskFormat>
+ struct BtreeLogicTestHelper {
+ BtreeLogicTestHelper(const BSONObj& order);
+
+ // Everything needed for a fully-functional Btree logic
+ TestHeadManager headManager;
+ HeapRecordStore recordStore;
+ BtreeLogic<OnDiskFormat> btree;
+ DiskLoc dummyDiskLoc;
+ };
+
+
+ /**
+ * Tool to construct custom tree shapes for tests.
+ */
+ template <class OnDiskFormat>
+ class ArtificialTreeBuilder {
+ public:
+
+ typedef typename BtreeLogic<OnDiskFormat>::BucketType BucketType;
+ typedef typename BtreeLogic<OnDiskFormat>::KeyDataOwnedType KeyDataOwnedType;
+ typedef typename BtreeLogic<OnDiskFormat>::KeyHeaderType KeyHeaderType;
+
+ typedef typename OnDiskFormat::FixedWidthKeyType FixedWidthKeyType;
+
+ /**
+ * The tree builder wraps around the passed-in helper and will invoke methods on it. It
+ * does not do any cleanup, so constructing multiple trees over the same helper will
+ * cause leaked records.
+ */
+ ArtificialTreeBuilder(OperationContext* txn,
+ BtreeLogicTestHelper<OnDiskFormat>* helper)
+ : _txn(txn), _helper(helper) {
+
+ }
+
+ /**
+ * Causes the specified tree shape to be built on the associated helper and the tree's
+ * root installed as the head. Uses a custom JSON-based language with the following
+ * syntax:
+ *
+ * Btree := BTreeBucket
+ * BtreeBucket := { Child_1_Key: <BtreeBucket | null>,
+ * Child_2_Key: <BtreeBucket | null>,
+ * ...,
+ * _: <BtreeBucket | null> }
+ *
+ * The _ key name specifies the content of the nextChild pointer. The value null means
+ * use a fixed disk loc.
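+         *
+         * Example (illustrative): makeTree("{b: {a: null}, _: {c: null}}") builds a
+         * root bucket with the single key 'b', whose left child bucket holds 'a' and
+         * whose nextChild bucket holds 'c'.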
+ */
+ void makeTree(const std::string& spec);
+
+ /**
+ * Validates that the structure of the Btree in the helper matches the specification.
+ */
+ void checkStructure(const std::string& spec) const;
+
+ /**
+         * Adds the given key to the bucket and fixes up the child pointers.
+ */
+ void push(const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child);
+
+ /**
+ * @return The number of keys inserted.
+ */
+ int fillBucketToExactSize(const DiskLoc bucketLoc, int targetSize, char startKey);
+
+ private:
+ DiskLoc makeTree(const BSONObj& spec);
+
+ void checkStructure(const BSONObj& spec, const DiskLoc node) const;
+
+ bool isPresent(const BSONObj& key, int direction) const;
+
+ static string expectedKey(const char* spec);
+
+ OperationContext* _txn;
+ BtreeLogicTestHelper<OnDiskFormat>* _helper;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h b/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h
new file mode 100644
index 00000000000..5d6fa99434f
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h
@@ -0,0 +1,54 @@
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+
+namespace mongo {
+
+ /**
+ * Notifies interested parties before a bucket is about to be deleted. Currently used by
+ * the cursor manager, so the appropriate cursors can be invalidated.
+ *
+ * The default implementation is a no-op.
+ */
+ class BucketDeletionNotification {
+ public:
+
+ /**
+ * If the same object is passed in to different BtreeLogic implementations, this
+ * notification may be invoked on multiple threads, so it is up to the implementor
+ * to ensure thread-safety.
+ */
+ virtual void aboutToDeleteBucket(const DiskLoc& bucket) { }
+
+ virtual ~BucketDeletionNotification() { }
+ };
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/btree/key.cpp b/src/mongo/db/storage/mmap_v1/btree/key.cpp
new file mode 100644
index 00000000000..a6ccd61d2cf
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/key.cpp
@@ -0,0 +1,691 @@
+/**
+ * Copyright (C) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/btree/key.h"
+
+#include "mongo/bson/util/builder.h"
+#include "mongo/platform/float_utils.h"
+#include "mongo/util/startup_test.h"
+
+
+namespace mongo {
+
+ extern const Ordering nullOrdering = Ordering::make(BSONObj());
+
+ // KeyBson is for V0 (version #0) indexes
+
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);
+
+ // "old" = pre signed dates & such; i.e. btree V0
+ /* must be same canon type when called */
+ int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
+ dassert( l.canonicalType() == r.canonicalType() );
+ int f;
+ double x;
+
+ switch ( l.type() ) {
+ case EOO:
+ case Undefined: // EOO and Undefined are same canonicalType
+ case jstNULL:
+ case MaxKey:
+ case MinKey:
+ return 0;
+ case Bool:
+ return *l.value() - *r.value();
+ case Timestamp:
+ case Date:
+ // unsigned dates for old version
+ if ( l.date() < r.date() )
+ return -1;
+ return l.date() == r.date() ? 0 : 1;
+ case NumberLong:
+ if( r.type() == NumberLong ) {
+ long long L = l._numberLong();
+ long long R = r._numberLong();
+ if( L < R ) return -1;
+ if( L == R ) return 0;
+ return 1;
+ }
+ // else fall through
+ case NumberInt:
+ case NumberDouble: {
+ double left = l.number();
+ double right = r.number();
+ bool lNan = !( left <= numeric_limits< double >::max() &&
+ left >= -numeric_limits< double >::max() );
+ bool rNan = !( right <= numeric_limits< double >::max() &&
+ right >= -numeric_limits< double >::max() );
+ if ( lNan ) {
+ if ( rNan ) {
+ return 0;
+ }
+ else {
+ return -1;
+ }
+ }
+ else if ( rNan ) {
+ return 1;
+ }
+ x = left - right;
+ if ( x < 0 ) return -1;
+ return x == 0 ? 0 : 1;
+ }
+ case jstOID:
+ return memcmp(l.value(), r.value(), 12);
+ case Code:
+ case Symbol:
+ case String:
+ // nulls not allowed in the middle of strings in the old version
+ return strcmp(l.valuestr(), r.valuestr());
+ case Object:
+ case Array:
+ return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
+ case DBRef: {
+ int lsz = l.valuesize();
+ int rsz = r.valuesize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value(), r.value(), lsz);
+ }
+ case BinData: {
+ int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
+ int rsz = r.objsize();
+ if ( lsz - rsz != 0 ) return lsz - rsz;
+ return memcmp(l.value()+4, r.value()+4, lsz+1);
+ }
+ case RegEx: {
+ int c = strcmp(l.regex(), r.regex());
+ if ( c )
+ return c;
+ return strcmp(l.regexFlags(), r.regexFlags());
+ }
+ case CodeWScope : {
+ f = l.canonicalType() - r.canonicalType();
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
+ if ( f )
+ return f;
+ f = strcmp( l.codeWScopeScopeDataUnsafe() , r.codeWScopeScopeDataUnsafe() );
+ if ( f )
+ return f;
+ return 0;
+ }
+ default:
+ log() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
+ verify(false);
+ }
+ return -1;
+ }
+
+ int oldElemCompare(const BSONElement&l , const BSONElement& r) {
+ int lt = (int) l.canonicalType();
+ int rt = (int) r.canonicalType();
+ int x = lt - rt;
+ if( x )
+ return x;
+ return oldCompareElementValues(l, r);
+ }
+
+ // pre signed dates & such
+ int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
+ BSONObjIterator i(l);
+ BSONObjIterator j(r);
+ unsigned mask = 1;
+ while ( 1 ) {
+ // so far, equal...
+
+ BSONElement l = i.next();
+ BSONElement r = j.next();
+ if ( l.eoo() )
+ return r.eoo() ? 0 : -1;
+ if ( r.eoo() )
+ return 1;
+
+ int x;
+ {
+ x = oldElemCompare(l, r);
+ if( o.descending(mask) )
+ x = -x;
+ }
+ if ( x != 0 )
+ return x;
+ mask <<= 1;
+ }
+ return -1;
+ }
+
+ /* old style compares:
+ - dates are unsigned
+ - strings no nulls
+ */
+ int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
+ return oldCompare(_o, r._o, o);
+ }
+
+    // woEqual could be made faster than woCompare, but this is for backward compatibility so it
+    // is not worth a big effort.
+ bool KeyBson::woEqual(const KeyBson& r) const {
+ return oldCompare(_o, r._o, nullOrdering) == 0;
+ }
+
+ // [ ][HASMORE][x][y][canontype_4bits]
+ enum CanonicalsEtc {
+ cminkey=1,
+ cnull=2,
+ cdouble=4,
+ cstring=6,
+ cbindata=7,
+ coid=8,
+ cfalse=10,
+ ctrue=11,
+ cdate=12,
+ cmaxkey=14,
+ cCANONTYPEMASK = 0xf,
+ cY = 0x10,
+ cint = cY | cdouble,
+ cX = 0x20,
+ clong = cX | cdouble,
+ cHASMORE = 0x40,
+ cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
+ };
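+
+    /* Worked example (illustrative): the one-element key {"": "ab"} encodes as
+         0x06  cstring (cHASMORE clear, since it is the last element)
+         0x02  string length (the terminating null is not stored)
+         'a' 'b'
+       for 4 bytes total, versus 14 bytes as a traditional BSONObj. */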
+
+ // bindata bson type
+    const unsigned BinDataLenMask = 0xf0; // high nibble encodes the length, via the tables below
+ const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
+ const int BinDataLenMax = 32;
+ const int BinDataLengthToCode[] = {
+ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
+ 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
+ 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
+ 0xf0/*32*/
+ };
+ const int BinDataCodeToLength[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
+ };
+
+ int binDataCodeToLength(int codeByte) {
+ return BinDataCodeToLength[codeByte >> 4];
+ }
+
+    /** The object cannot be represented in compact format, so store it in traditional BSON
+        format with a leading sentinel byte IsBSON to indicate that format.
+
+ Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
+ so that we don't have to do an extra malloc.
+ */
+ void KeyV1Owned::traditional(const BSONObj& obj) {
+ b.reset();
+ b.appendUChar(IsBSON);
+ b.appendBuf(obj.objdata(), obj.objsize());
+ _keyData = (const unsigned char *) b.buf();
+ }
+
+ KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
+ b.appendBuf( rhs.data(), rhs.dataSize() );
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ // fromBSON to Key format
+ KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
+ BSONObj::iterator i(obj);
+ unsigned char bits = 0;
+ while( 1 ) {
+ BSONElement e = i.next();
+ if( i.more() )
+ bits |= cHASMORE;
+ switch( e.type() ) {
+ case MinKey:
+ b.appendUChar(cminkey|bits);
+ break;
+ case jstNULL:
+ b.appendUChar(cnull|bits);
+ break;
+ case MaxKey:
+ b.appendUChar(cmaxkey|bits);
+ break;
+ case Bool:
+ b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
+ break;
+ case jstOID:
+ b.appendUChar(coid|bits);
+ b.appendBuf(&e.__oid(), sizeof(OID));
+ break;
+ case BinData:
+ {
+ int t = e.binDataType();
+ // 0-7 and 0x80 to 0x87 are supported by Key
+ if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
+ int len;
+ const char * d = e.binData(len);
+ if( len <= BinDataLenMax ) {
+ int code = BinDataLengthToCode[len];
+ if( code >= 0 ) {
+ if( t >= 128 )
+ t = (t-128) | 0x08;
+ dassert( (code&t) == 0 );
+ b.appendUChar( cbindata|bits );
+ b.appendUChar( code | t );
+ b.appendBuf(d, len);
+ break;
+ }
+ }
+ }
+ traditional(obj);
+ return;
+ }
+ case Date:
+ b.appendUChar(cdate|bits);
+ b.appendStruct(e.date());
+ break;
+ case String:
+ {
+ b.appendUChar(cstring|bits);
+ // note we do not store the terminating null, to save space.
+ unsigned x = (unsigned) e.valuestrsize() - 1;
+ if( x > 255 ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(x);
+ b.appendBuf(e.valuestr(), x);
+ break;
+ }
+ case NumberInt:
+ b.appendUChar(cint|bits);
+ b.appendNum((double) e._numberInt());
+ break;
+ case NumberLong:
+ {
+ long long n = e._numberLong();
+ long long m = 2LL << 52;
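+                    // m == 2^53; a double's 53-bit mantissa cannot exactly represent
+                    // every integer at or beyond this magnitude.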
+ DEV {
+ long long d = m-1;
+ verify( ((long long) ((double) -d)) == -d );
+ }
+ if( n >= m || n <= -m ) {
+ // can't represent exactly as a double
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(clong|bits);
+ b.appendNum((double) n);
+ break;
+ }
+ case NumberDouble:
+ {
+ double d = e._numberDouble();
+ if( isNaN(d) ) {
+ traditional(obj);
+ return;
+ }
+ b.appendUChar(cdouble|bits);
+ b.appendNum(d);
+ break;
+ }
+ default:
+ // if other types involved, store as traditional BSON
+ traditional(obj);
+ return;
+ }
+ if( !i.more() )
+ break;
+ bits = 0;
+ }
+ _keyData = (const unsigned char *) b.buf();
+ dassert( b.len() == dataSize() ); // check datasize method is correct
+ dassert( (*_keyData & cNOTUSED) == 0 );
+ }
+
+ BSONObj KeyV1::toBson() const {
+ verify( _keyData != 0 );
+ if( !isCompactFormat() )
+ return bson();
+
+ BSONObjBuilder b(512);
+ const unsigned char *p = _keyData;
+ while( 1 ) {
+ unsigned bits = *p++;
+
+ switch( bits & 0x3f ) {
+ case cminkey: b.appendMinKey(""); break;
+ case cnull: b.appendNull(""); break;
+ case cfalse: b.appendBool("", false); break;
+ case ctrue: b.appendBool("", true); break;
+ case cmaxkey:
+ b.appendMaxKey("");
+ break;
+ case cstring:
+ {
+ unsigned sz = *p++;
+ // we build the element ourself as we have to null terminate it
+ BufBuilder &bb = b.bb();
+ bb.appendNum((char) String);
+ bb.appendUChar(0); // fieldname ""
+ bb.appendNum(sz+1);
+ bb.appendBuf(p, sz);
+ bb.appendUChar(0); // null char at end of string
+ p += sz;
+ break;
+ }
+ case coid:
+ b.appendOID("", (OID *) p);
+ p += sizeof(OID);
+ break;
+ case cbindata:
+ {
+ int len = binDataCodeToLength(*p);
+ int subtype = (*p) & BinDataTypeMask;
+ if( subtype & 0x8 ) {
+ subtype = (subtype & 0x7) | 0x80;
+ }
+ b.appendBinData("", len, (BinDataType) subtype, ++p);
+ p += len;
+ break;
+ }
+ case cdate:
+ b.appendDate("", (Date_t&) *p);
+ p += 8;
+ break;
+ case cdouble:
+ b.append("", (double&) *p);
+ p += sizeof(double);
+ break;
+ case cint:
+ b.append("", static_cast< int >((reinterpret_cast< const PackedDouble& >(*p)).d));
+ p += sizeof(double);
+ break;
+ case clong:
+ b.append("", static_cast< long long>((reinterpret_cast< const PackedDouble& >(*p)).d));
+ p += sizeof(double);
+ break;
+ default:
+ verify(false);
+ }
+
+ if( (bits & cHASMORE) == 0 )
+ break;
+ }
+ return b.obj();
+ }
+
+ static int compare(const unsigned char *&l, const unsigned char *&r) {
+ int lt = (*l & cCANONTYPEMASK);
+ int rt = (*r & cCANONTYPEMASK);
+ int x = lt - rt;
+ if( x )
+ return x;
+
+ l++; r++;
+
+ // same type
+ switch( lt ) {
+ case cdouble:
+ {
+ double L = (reinterpret_cast< const PackedDouble* >(l))->d;
+ double R = (reinterpret_cast< const PackedDouble* >(r))->d;
+ if( L < R )
+ return -1;
+ if( L != R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case cstring:
+ {
+ int lsz = *l;
+ int rsz = *r;
+ int common = min(lsz, rsz);
+ l++; r++; // skip the size byte
+ // use memcmp as we (will) allow zeros in UTF8 strings
+ int res = memcmp(l, r, common);
+ if( res )
+ return res;
+ // longer string is the greater one
+ int diff = lsz-rsz;
+ if( diff )
+ return diff;
+ l += lsz; r += lsz;
+ break;
+ }
+ case cbindata:
+ {
+ int L = *l;
+ int R = *r;
+ int llen = binDataCodeToLength(L);
+ int diff = L-R; // checks length and subtype simultaneously
+ if( diff ) {
+                // unfortunately the nibbles are ordered the wrong way round to compare
+                // subtype and length in one check (we could bit-swap them...)
+ int rlen = binDataCodeToLength(R);
+ if( llen != rlen )
+ return llen - rlen;
+ return diff;
+ }
+ // same length, same type
+ l++; r++;
+ int res = memcmp(l, r, llen);
+ if( res )
+ return res;
+ l += llen; r += llen;
+ break;
+ }
+ case cdate:
+ {
+ long long L = *((long long *) l);
+ long long R = *((long long *) r);
+ if( L < R )
+ return -1;
+ if( L > R )
+ return 1;
+ l += 8; r += 8;
+ break;
+ }
+ case coid:
+ {
+ int res = memcmp(l, r, sizeof(OID));
+ if( res )
+ return res;
+ l += 12; r += 12;
+ break;
+ }
+ default:
+ // all the others are a match -- e.g. null == null
+ ;
+ }
+
+ return 0;
+ }
+
+    // At least one of 'this' and 'right' is in traditional BSON format.
+ int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
+ BSONObj L = toBson();
+ BSONObj R = right.toBson();
+ return L.woCompare(R, order, /*considerfieldname*/false);
+ }
+
+ int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
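+        // A compact-format leading byte never has the 0x80 (cNOTUSED) bit set, so the
+        // OR of the two leading bytes equals IsBSON (0xff) iff at least one side is in
+        // traditional BSON format.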
+ if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
+ return compareHybrid(right, order);
+
+ unsigned mask = 1;
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ {
+ int x = compare(l, r); // updates l and r pointers
+ if( x ) {
+ if( order.descending(mask) )
+ x = -x;
+ return x;
+ }
+ }
+
+ {
+ int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
+ if( x )
+ return x;
+ if( (lval & cHASMORE) == 0 )
+ break;
+ }
+
+ mask <<= 1;
+ }
+
+ return 0;
+ }
+
+ static unsigned sizes[] = {
+ 0,
+ 1, //cminkey=1,
+ 1, //cnull=2,
+ 0,
+ 9, //cdouble=4,
+ 0,
+ 0, //cstring=6,
+ 0,
+ 13, //coid=8,
+ 0,
+ 1, //cfalse=10,
+ 1, //ctrue=11,
+ 9, //cdate=12,
+ 0,
+ 1, //cmaxkey=14,
+ 0
+ };
+
+ inline unsigned sizeOfElement(const unsigned char *p) {
+ unsigned type = *p & cCANONTYPEMASK;
+ unsigned sz = sizes[type];
+ if( sz == 0 ) {
+ if( type == cstring ) {
+ sz = ((unsigned) p[1]) + 2;
+ }
+ else {
+ verify( type == cbindata );
+ sz = binDataCodeToLength(p[1]) + 2;
+ }
+ }
+ return sz;
+ }
+
+ int KeyV1::dataSize() const {
+ const unsigned char *p = _keyData;
+ if( !isCompactFormat() ) {
+ return bson().objsize() + 1;
+ }
+
+ bool more;
+ do {
+ unsigned z = sizeOfElement(p);
+ more = (*p & cHASMORE) != 0;
+ p += z;
+ } while( more );
+ return p - _keyData;
+ }
+
+ bool KeyV1::woEqual(const KeyV1& right) const {
+ const unsigned char *l = _keyData;
+ const unsigned char *r = right._keyData;
+
+ if( (*l|*r) == IsBSON ) {
+ return toBson().equal(right.toBson());
+ }
+
+ while( 1 ) {
+ char lval = *l;
+ char rval = *r;
+ if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
+ return false;
+ l++; r++;
+ switch( lval&cCANONTYPEMASK ) {
+ case coid:
+ if( *((unsigned*) l) != *((unsigned*) r) )
+ return false;
+ l += 4; r += 4;
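+                // fall through: an OID is 12 bytes, so compare the remaining 8 like a date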
+ case cdate:
+ if( *((unsigned long long *) l) != *((unsigned long long *) r) )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cdouble:
+ if( (reinterpret_cast< const PackedDouble* > (l))->d != (reinterpret_cast< const PackedDouble* >(r))->d )
+ return false;
+ l += 8; r += 8;
+ break;
+ case cstring:
+ {
+ if( *l != *r )
+ return false; // not same length
+ unsigned sz = ((unsigned) *l) + 1;
+ if( memcmp(l, r, sz) )
+ return false;
+ l += sz; r += sz;
+ break;
+ }
+ case cbindata:
+ {
+ if( *l != *r )
+ return false; // len or subtype mismatch
+ int len = binDataCodeToLength(*l) + 1;
+ if( memcmp(l, r, len) )
+ return false;
+ l += len; r += len;
+ break;
+ }
+ case cminkey:
+ case cnull:
+ case cfalse:
+ case ctrue:
+ case cmaxkey:
+ break;
+ default:
+ verify(false);
+ }
+ if( (lval&cHASMORE) == 0 )
+ break;
+ }
+ return true;
+ }
+
+ struct CmpUnitTest : public StartupTest {
+ void run() {
+ char a[2];
+ char b[2];
+ a[0] = -3;
+ a[1] = 0;
+ b[0] = 3;
+ b[1] = 0;
+ verify( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
+ }
+ } cunittest;
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/key.h b/src/mongo/db/storage/mmap_v1/btree/key.h
new file mode 100644
index 00000000000..83203b0fee2
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/key.h
@@ -0,0 +1,130 @@
+// @file key.h class(es) representing individual keys in a btree
+
+/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/db/jsobj.h"
+
+namespace mongo {
+
+ /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.
+
+ KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.
+
+ KeyV1 is the new implementation.
+ */
+ class KeyBson /* "KeyV0" */ {
+ public:
+ KeyBson() { }
+ explicit KeyBson(const char *keyData) : _o(keyData) { }
+ explicit KeyBson(const BSONObj& obj) : _o(obj) { }
+ int woCompare(const KeyBson& r, const Ordering &o) const;
+ BSONObj toBson() const { return _o; }
+ std::string toString() const { return _o.toString(); }
+ int dataSize() const { return _o.objsize(); }
+ const char * data() const { return _o.objdata(); }
+ BSONElement _firstElement() const { return _o.firstElement(); }
+ bool isCompactFormat() const { return false; }
+ bool woEqual(const KeyBson& r) const;
+ void assign(const KeyBson& rhs) { *this = rhs; }
+ bool isValid() const { return true; }
+ private:
+ BSONObj _o;
+ };
+
+ class KeyV1Owned;
+
+ // corresponding to BtreeData_V1
+ class KeyV1 {
+ void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
+ KeyV1(const KeyV1Owned&); // disallowed as this is not a great idea as KeyV1Owned likely will go out of scope
+ public:
+ KeyV1() { _keyData = 0; }
+ ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }
+
+ KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
+ dassert( _keyData > (const unsigned char *) 1 );
+ }
+
+ // explicit version of operator= to be safe
+ void assign(const KeyV1& rhs) {
+ _keyData = rhs._keyData;
+ }
+
+ /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
+ when BSON, we are just a wrapper
+ */
+ explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }
+
+ int woCompare(const KeyV1& r, const Ordering &o) const;
+ bool woEqual(const KeyV1& r) const;
+ BSONObj toBson() const;
+ std::string toString() const { return toBson().toString(); }
+
+ /** get the key data we want to store in the btree bucket */
+ const char * data() const { return (const char *) _keyData; }
+
+ /** @return size of data() */
+ int dataSize() const;
+
+ /** only used by geo, which always has bson keys */
+ BSONElement _firstElement() const { return bson().firstElement(); }
+ bool isCompactFormat() const { return *_keyData != IsBSON; }
+
+ bool isValid() const { return _keyData > (const unsigned char*)1; }
+ protected:
+ enum { IsBSON = 0xff };
+ const unsigned char *_keyData;
+ BSONObj bson() const {
+ dassert( !isCompactFormat() );
+ return BSONObj((const char *) _keyData+1);
+ }
+ private:
+ int compareHybrid(const KeyV1& right, const Ordering& order) const;
+ };
+
+ class KeyV1Owned : public KeyV1 {
+ void operator=(const KeyV1Owned&);
+ public:
+        /** @param obj a BSON object to be translated to KeyV1 format. If the object isn't
+ representable in KeyV1 format (which happens, intentionally, at times)
+ it will stay as bson herein.
+ */
+ KeyV1Owned(const BSONObj& obj);
+
+ /** makes a copy (memcpy's the whole thing) */
+ KeyV1Owned(const KeyV1& rhs);
+
+ private:
+ StackBufBuilder b;
+ void traditional(const BSONObj& obj); // store as traditional bson not as compact format
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
new file mode 100644
index 00000000000..07916dc873d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
@@ -0,0 +1,180 @@
+/* hashtab.h
+
+ Simple, fixed size hash table. Darn simple.
+
+ Uses a contiguous block of memory, so you can put it in a memory mapped file very easily.
+*/
+
+/* Copyright 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/pch.h"
+#include <map>
+#include "mongo/db/storage/mmap_v1/dur.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/stdx/functional.h"
+
+namespace mongo {
+
+#pragma pack(1)
+
+    /* The Key type must define:
+
+         int hash() const;           // must always return > 0
+         bool operator==(const Key&) const;
+         void kill();                // marks the key as removed
+    */
+
+ template <class Key,class Type>
+ class HashTable : boost::noncopyable {
+ public:
+ const char *name;
+ struct Node {
+ int hash;
+ Key k;
+ Type value;
+ bool inUse() {
+ return hash != 0;
+ }
+ void setUnused() {
+ hash = 0;
+ }
+ };
+ void* _buf;
+ int n; // number of hashtable buckets
+ int maxChain;
+
+ Node& nodes(int i) {
+ Node *nodes = (Node *) _buf;
+ return nodes[i];
+ }
+
+ int _find(const Key& k, bool& found) {
+ found = false;
+ int h = k.hash();
+ int i = h % n;
+ int start = i;
+ int chain = 0;
+ int firstNonUsed = -1;
+ while ( 1 ) {
+ if ( !nodes(i).inUse() ) {
+ if ( firstNonUsed < 0 )
+ firstNonUsed = i;
+ }
+
+ if ( nodes(i).hash == h && nodes(i).k == k ) {
+ if ( chain >= 200 )
+ log() << "warning: hashtable " << name << " long chain " << std::endl;
+ found = true;
+ return i;
+ }
+ chain++;
+ i = (i+1) % n;
+ if ( i == start ) {
+ // shouldn't get here / defensive for infinite loops
+ log() << "error: hashtable " << name << " is full n:" << n << std::endl;
+ return -1;
+ }
+ if( chain >= maxChain ) {
+ if ( firstNonUsed >= 0 )
+ return firstNonUsed;
+ log() << "error: hashtable " << name << " max chain reached:" << maxChain << std::endl;
+ return -1;
+ }
+ }
+ }
+
+ public:
+ /* buf must be all zeroes on initialization. */
+ HashTable(void* buf, int buflen, const char *_name) : name(_name) {
+ int m = sizeof(Node);
+ // log() << "hashtab init, buflen:" << buflen << " m:" << m << std::endl;
+ n = buflen / m;
+ if ( (n & 1) == 0 )
+ n--;
+ maxChain = (int) (n * 0.05);
+ _buf = buf;
+ //nodes = (Node *) buf;
+
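+            // In practice the only instantiation is HashTable<Namespace, NamespaceDetails>
+            // (the .ns file format), whose Node is int hash (4 bytes) + Namespace key (128)
+            // + NamespaceDetails value (496) = 628 bytes.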
+ if ( sizeof(Node) != 628 ) {
+ log() << "HashTable() " << _name << " sizeof(node):" << sizeof(Node) << " n:" << n << " sizeof(Key): " << sizeof(Key) << " sizeof(Type):" << sizeof(Type) << std::endl;
+ verify( sizeof(Node) == 628 );
+ }
+
+ }
+
+ Type* get(const Key& k) {
+ bool found;
+ int i = _find(k, found);
+ if ( found )
+ return &nodes(i).value;
+ return 0;
+ }
+
+ void kill(OperationContext* txn, const Key& k) {
+ bool found;
+ int i = _find(k, found);
+ if ( i >= 0 && found ) {
+ Node* n = &nodes(i);
+ n = txn->recoveryUnit()->writing(n);
+ n->k.kill();
+ n->setUnused();
+ }
+ }
+
+ /** returns false if too full */
+ bool put(OperationContext* txn, const Key& k, const Type& value) {
+ bool found;
+ int i = _find(k, found);
+ if ( i < 0 )
+ return false;
+ Node* n = txn->recoveryUnit()->writing( &nodes(i) );
+ if ( !found ) {
+ n->k = k;
+ n->hash = k.hash();
+ }
+ else {
+ verify( n->hash == k.hash() );
+ }
+ n->value = value;
+ return true;
+ }
+
+ typedef stdx::function< void ( const Key& k , Type& v ) > IteratorCallback;
+ void iterAll( IteratorCallback callback ) {
+ for ( int i=0; i<n; i++ ) {
+ if ( nodes(i).inUse() ) {
+ callback( nodes(i).k , nodes(i).value );
+ }
+ }
+ }
+
+ };
+
+#pragma pack()
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
new file mode 100644
index 00000000000..bc9cc3ee791
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
@@ -0,0 +1,40 @@
+// index_details.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/db/storage/mmap_v1/catalog/index_details.h"
+
+namespace mongo {
+
+ void IndexDetails::_reset() {
+ head.setInvalid();
+ info.setInvalid();
+ }
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.h b/src/mongo/db/storage/mmap_v1/catalog/index_details.h
new file mode 100644
index 00000000000..b2f34ec0681
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.h
@@ -0,0 +1,69 @@
+// index_details.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+
+namespace mongo {
+
+    /* Details about a particular index. There is one of these effectively for each object in
+       system.indexes (although this also includes the head pointer, which is not in that
+       collection).
+
+ This is an internal part of the catalog. Nothing outside of the catalog should use this.
+
+ ** MemoryMapped Record ** (i.e., this is on disk data)
+ */
+ struct IndexDetails {
+ /**
+ * btree head disk location
+ */
+ DiskLoc head;
+
+ /* Location of index info object. Format:
+
+ { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
+ [, unique: <bool>, background: <bool>, v:<version>]
+ }
+
+ This object is in the system.indexes collection. Note that since we
+ have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
+ */
+ DiskLoc info;
+
+ /**
+ * makes head and info invalid
+ */
+ void _reset();
+
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h b/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h
new file mode 100644
index 00000000000..5cd45963f1f
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace-inl.h
@@ -0,0 +1,74 @@
+// namespace-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ inline Namespace& Namespace::operator=(const StringData& ns) {
+        // We fill the remaining space with zeroes here. Because the full Namespace struct is
+        // in the datafiles (the .ns files specifically), this keeps the bytes deterministic
+        // for a given sequence of operations, which makes testing and debugging the data
+        // files easier.
+        //
+        // If profiling indicates this method is a significant bottleneck, we could have a
+        // version used for reads that does not fill with zeroes, and keep the zeroing
+        // behavior on writes.
+ //
+ memset( buf, 0, sizeof(buf) );
+ uassert( 10080 , "ns name too long, max size is 127 bytes", ns.size() <= MaxNsLen);
+ uassert( 17380 , "ns name can't contain embedded '\0' byte", ns.find('\0') == std::string::npos);
+ ns.copyTo( buf, true );
+ return *this;
+ }
+
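+    // For example, for "db.coll": extraName(0) == "db.coll$extra" and
+    // extraName(1) == "db.coll$extrb".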
+ inline std::string Namespace::extraName(int i) const {
+ char ex[] = "$extra";
+ ex[5] += i;
+ std::string s = std::string(buf) + ex;
+ massert( 10348 , "$extra: ns name too long", s.size() <= MaxNsLen);
+ return s;
+ }
+
+ inline bool Namespace::isExtra() const {
+ const char *p = strstr(buf, "$extr");
+        // The p[6] == 0 check matters in case an index uses a name like "$extra_1".
+        return p && p[5] && p[6] == 0;
+ }
+
+ inline int Namespace::hash() const {
+ unsigned x = 0;
+ const char *p = buf;
+ while ( *p ) {
+ x = x * 131 + *p;
+ p++;
+ }
+ return (x & 0x7fffffff) | 0x8000000; // must be > 0
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp
new file mode 100644
index 00000000000..822ed26dedb
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp
@@ -0,0 +1,49 @@
+// namespace.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
+
+#include <boost/static_assert.hpp>
+
+#include "mongo/db/namespace_string.h"
+
+namespace mongo {
+ namespace {
+ BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
+ BOOST_STATIC_ASSERT( Namespace::MaxNsLenWithNUL == MaxDatabaseNameLen );
+ BOOST_STATIC_ASSERT((int)Namespace::MaxNsLenWithNUL == (int)NamespaceString::MaxNsLenWithNUL);
+ BOOST_STATIC_ASSERT((int)Namespace::MaxNsLen == (int)NamespaceString::MaxNsLen);
+ // Note the typo.
+ BOOST_STATIC_ASSERT((int)Namespace::MaxNsColletionLen == (int)NamespaceString::MaxNsCollectionLen);
+ }
+}
+
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.h b/src/mongo/db/storage/mmap_v1/catalog/namespace.h
new file mode 100644
index 00000000000..40e70ac9857
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace.h
@@ -0,0 +1,92 @@
+// namespace.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include <cstring>
+#include <string>
+
+#include "mongo/base/string_data.h"
+
+namespace mongo {
+
+#pragma pack(1)
+ /**
+     * This is used for storing a namespace on disk in a fixed-width form. It should
+     * only be used for that; for passing namespaces around internally, please use
+     * NamespaceString.
+ */
+ class Namespace {
+ public:
+ Namespace(const StringData& ns) { *this = ns; }
+ Namespace& operator=(const StringData& ns);
+
+ void kill() { buf[0] = 0x7f; }
+
+ bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
+ bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
+ bool operator!=(const char *r) const { return strcmp(buf, r) != 0; }
+ bool operator!=(const Namespace& r) const { return strcmp(buf, r.buf) != 0; }
+
+ bool hasDollarSign() const { return strchr( buf , '$' ) != NULL; }
+
+ int hash() const; // value returned is always > 0
+
+ size_t size() const { return strlen( buf ); }
+
+ std::string toString() const { return buf; }
+ operator std::string() const { return buf; }
+
+        /* NamespaceDetails::Extra was added after the fact to allow chaining of data blocks to
+           support more than 10 indexes (more than 10 IndexDetails). It's a bit hacky because of
+           this late addition combined with backward file-format support. */
+ std::string extraName(int i) const;
+        bool isExtra() const; /* ends with $extr... -- when true, this is an extra block, not a normal NamespaceDetails block */
+
+ enum MaxNsLenValue {
+            // Maximum possible length of the name of any namespace, including special ones
+            // like $extra. This includes room for the NUL byte so it can be used when sizing
+            // buffers.
+ MaxNsLenWithNUL = 128,
+
+ // MaxNsLenWithNUL excluding the NUL byte. Use this when comparing std::string lengths.
+ MaxNsLen = MaxNsLenWithNUL - 1,
+
+ // Maximum allowed length of fully qualified namespace name of any real collection.
+ // Does not include NUL so it can be directly compared to std::string lengths.
+ MaxNsColletionLen = MaxNsLen - 7/*strlen(".$extra")*/,
+ };
+ private:
+ char buf[MaxNsLenWithNUL];
+ };
+#pragma pack()
+
+} // namespace mongo
+
+#include "mongo/db/storage/mmap_v1/catalog/namespace-inl.h"
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp
new file mode 100644
index 00000000000..2fe3d226e5d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp
@@ -0,0 +1,244 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
+
+#include <algorithm>
+#include <list>
+
+#include "mongo/base/counter.h"
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/catalog/collection_options.h"
+#include "mongo/db/clientcursor.h"
+#include "mongo/db/commands/server_status.h"
+#include "mongo/db/db.h"
+#include "mongo/db/index_legacy.h"
+#include "mongo/db/json.h"
+#include "mongo/db/ops/delete.h"
+#include "mongo/db/ops/update.h"
+#include "mongo/db/storage/mmap_v1/catalog/hashtab.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/scripting/engine.h"
+#include "mongo/util/startup_test.h"
+
+
+namespace mongo {
+
+
+ BSONObj idKeyPattern = fromjson("{\"_id\":1}");
+
+ NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool capped ) {
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );
+
+ /* be sure to initialize new fields here -- this memory doesn't default to zeroes the way we use it */
+ firstExtent = lastExtent = capExtent = loc;
+ stats.datasize = stats.nrecords = 0;
+ lastExtentSize = 0;
+ nIndexes = 0;
+ isCapped = capped;
+ maxDocsInCapped = 0x7fffffff; // no limit (value is for pre-v2.3.2 compatibility)
+ paddingFactor = 1.0;
+ systemFlagsOldDoNotUse = 0;
+ userFlags = 0;
+ capFirstNewRecord = DiskLoc();
+ // Signal that we are on first allocation iteration through extents.
+ capFirstNewRecord.setInvalid();
+ // For capped case, signal that we are doing initial extent allocation.
+ if ( capped ) {
+ // WAS: cappedLastDelRecLastExtent().setInvalid();
+ deletedList[1].setInvalid();
+ }
+ verify( sizeof(_dataFileVersion) == 2 );
+ _dataFileVersion = 0;
+ _indexFileVersion = 0;
+ multiKeyIndexBits = 0;
+ _reservedA = 0;
+ _extraOffset = 0;
+ indexBuildsInProgress = 0;
+ memset(_reserved, 0, sizeof(_reserved));
+ }
+
+ NamespaceDetails::Extra* NamespaceDetails::allocExtra( OperationContext* txn,
+ const StringData& ns,
+ NamespaceIndex& ni,
+ int nindexessofar) {
+ txn->lockState()->assertWriteLocked(ns);
+
+ int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
+ verify( i >= 0 && i <= 1 );
+
+ Namespace fullns( ns );
+ Namespace extrans( fullns.extraName(i) ); // throws UserException if ns name too long
+
+ massert( 10350, "allocExtra: base ns missing?", this );
+ massert( 10351, "allocExtra: extra already exists", ni.details(extrans) == 0 );
+
+ Extra temp;
+ temp.init();
+
+ ni.add_ns( txn, extrans, reinterpret_cast<NamespaceDetails*>( &temp ) );
+ Extra* e = reinterpret_cast<NamespaceDetails::Extra*>( ni.details( extrans ) );
+
+ long ofs = e->ofsFrom(this);
+ if( i == 0 ) {
+ verify( _extraOffset == 0 );
+ *txn->recoveryUnit()->writing(&_extraOffset) = ofs;
+ verify( extra() == e );
+ }
+ else {
+ Extra *hd = extra();
+ verify( hd->next(this) == 0 );
+ hd->setNext(txn, ofs);
+ }
+ return e;
+ }
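+ // Editor's note -- illustrative sketch, not part of the original change. Because the .ns
+ // file is memory mapped, Extra blocks are linked by byte offsets relative to the owning
+ // NamespaceDetails rather than by raw pointers; ofsFrom()/next() compute exactly this:
+ //
+ //     Extra* e = ...;                                  // block elsewhere in the mapped file
+ //     long ofs = (char*)e - (char*)this;               // e->ofsFrom( this )
+ //     Extra* back = (Extra*)( (char*)this + ofs );     // what extra()/next() recompute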
+
+ IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) {
+ if( idxNo < NIndexesBase ) {
+ IndexDetails& id = _indexes[idxNo];
+ return id;
+ }
+ Extra *e = extra();
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 13283 , "Missing Extra" );
+ massert(14045, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if( i >= NIndexesExtra ) {
+ e = e->next(this);
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 14823 , "missing extra" );
+ massert(14824, "missing Extra", e);
+ }
+ i -= NIndexesExtra;
+ }
+ return e->details[i];
+ }
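+ // Editor's note -- worked example, not part of the original change. With NIndexesBase = 10
+ // and NIndexesExtra = 30: idx(12) lands in the first Extra block at details[2], while
+ // idx(41) follows extra()->next(this) to the second Extra block and reads details[1].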
+
+
+ const IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) const {
+ if( idxNo < NIndexesBase ) {
+ const IndexDetails& id = _indexes[idxNo];
+ return id;
+ }
+ const Extra *e = extra();
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 17421 , "Missing Extra" );
+ massert(17422, "missing Extra", e);
+ }
+ int i = idxNo - NIndexesBase;
+ if( i >= NIndexesExtra ) {
+ e = e->next(this);
+ if ( ! e ) {
+ if ( missingExpected )
+ throw MsgAssertionException( 17423 , "missing extra" );
+ massert(17424, "missing Extra", e);
+ }
+ i -= NIndexesExtra;
+ }
+ return e->details[i];
+ }
+
+ NamespaceDetails::IndexIterator::IndexIterator(const NamespaceDetails *_d,
+ bool includeBackgroundInProgress) {
+ d = _d;
+ i = 0;
+ n = d->nIndexes;
+ if ( includeBackgroundInProgress )
+ n += d->indexBuildsInProgress;
+ }
+
+ // must be called when renaming a NS to fix up extra
+ void NamespaceDetails::copyingFrom( OperationContext* txn,
+ const StringData& thisns,
+ NamespaceIndex& ni,
+ NamespaceDetails* src) {
+ _extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
+ Extra *se = src->extra();
+ int n = NIndexesBase;
+ if( se ) {
+ Extra *e = allocExtra(txn, thisns, ni, n);
+ while( 1 ) {
+ n += NIndexesExtra;
+ e->copy(this, *se);
+ se = se->next(src);
+ if( se == 0 ) break;
+ Extra *nxt = allocExtra(txn, thisns, ni, n);
+ e->setNext( txn, nxt->ofsFrom(this) );
+ e = nxt;
+ }
+ verify( _extraOffset );
+ }
+ }
+
+ NamespaceDetails* NamespaceDetails::writingWithoutExtra( OperationContext* txn ) {
+ return txn->recoveryUnit()->writing( this );
+ }
+
+
+ // XXX - this method should go away
+ NamespaceDetails *NamespaceDetails::writingWithExtra( OperationContext* txn ) {
+ for( Extra *e = extra(); e; e = e->next( this ) ) {
+ txn->recoveryUnit()->writing( e );
+ }
+ return writingWithoutExtra( txn );
+ }
+
+ void NamespaceDetails::setMaxCappedDocs( OperationContext* txn, long long max ) {
+ massert( 16499,
+ "max in a capped collection has to be < 2^31 or -1",
+ CollectionOptions::validMaxCappedDocs( &max ) );
+ maxDocsInCapped = max;
+ }
+
+ /* ------------------------------------------------------------------------- */
+
+
+ int NamespaceDetails::_catalogFindIndexByName(const Collection* coll,
+ const StringData& name,
+ bool includeBackgroundInProgress) const {
+ IndexIterator i = ii(includeBackgroundInProgress);
+ while( i.more() ) {
+ const BSONObj obj = coll->docFor(i.next().info);
+ if ( name == obj.getStringField("name") )
+ return i.pos()-1;
+ }
+ return -1;
+ }
+
+ void NamespaceDetails::Extra::setNext( OperationContext* txn,
+ long ofs ) {
+ *txn->recoveryUnit()->writing(&_next) = ofs;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h
new file mode 100644
index 00000000000..0a6734e7d9d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h
@@ -0,0 +1,229 @@
+/**
+ * Copyright (C) 2008 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+#include "mongo/db/namespace_string.h"
+#include "mongo/db/storage/mmap_v1/catalog/index_details.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h"
+
+namespace mongo {
+
+ class Collection;
+ class OperationContext;
+
+ /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
+ so you can look for a deleted record of about the right size.
+ */
+ const int Buckets = 19;
+ const int MaxBucket = 18;
+
+ extern int bucketSizes[];
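+ // Editor's note -- illustrative sketch, not part of the original change. bucketSizes[] is
+ // defined elsewhere; conceptually, a deleted record is filed under the first bucket large
+ // enough to hold it, roughly:
+ //
+ //     int bucketFor( int recordSize ) {
+ //         for ( int i = 0; i < Buckets; i++ )
+ //             if ( bucketSizes[i] >= recordSize )
+ //                 return i;
+ //         return MaxBucket;
+ //     }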
+
+#pragma pack(1)
+ /* NamespaceDetails : this is the "header" for a collection that has all its details.
+ It's in the .ns file and this is a memory mapped region (thus the pack pragma above).
+ */
+ class NamespaceDetails {
+ public:
+ enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 };
+
+
+
+ /*-------- data fields, as present on disk : */
+
+ DiskLoc firstExtent;
+ DiskLoc lastExtent;
+
+ /* NOTE: capped collections v1 override the meaning of deletedList.
+ deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
+ the capped namespace.
+ deletedList[1] points to the last record in the previous extent. When the "current extent"
+ changes, this value is updated. !deletedList[1].isValid() when this value is not
+ yet computed.
+ */
+ DiskLoc deletedList[Buckets];
+
+ // ofs 168 (8 byte aligned)
+ struct Stats {
+ // datasize and nrecords MUST be adjacent -- the code assumes this!
+ long long datasize; // this includes padding, but not record headers
+ long long nrecords;
+ } stats;
+
+
+ int lastExtentSize;
+
+ int nIndexes;
+
+ // ofs 192
+ IndexDetails _indexes[NIndexesBase];
+
+ public:
+ // ofs 352 (16 byte aligned)
+ int isCapped; // there is wasted space here if I'm right (ERH)
+
+ int maxDocsInCapped; // max # of objects for a capped table, -1 for inf.
+
+ double paddingFactor; // 1.0 = no padding.
+ // ofs 368 (16)
+ int systemFlagsOldDoNotUse; // things that the system sets/cares about
+
+ DiskLoc capExtent; // the "current" extent we're writing to for a capped collection
+ DiskLoc capFirstNewRecord;
+
+ unsigned short _dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h
+ unsigned short _indexFileVersion;
+
+ unsigned long long multiKeyIndexBits;
+
+ // ofs 400 (16)
+ unsigned long long _reservedA;
+ long long _extraOffset; // where the $extra info is located (bytes relative to this)
+
+ public:
+ int indexBuildsInProgress; // Number of indexes currently being built
+
+ int userFlags;
+
+ char _reserved[72];
+ /*-------- end data 496 bytes */
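+ // Editor's note -- worked layout check, not part of the original change, assuming an
+ // 8-byte DiskLoc: firstExtent + lastExtent = 16 bytes and deletedList = 19 * 8 = 152
+ // bytes, putting Stats at offset 168 as annotated above; Stats (16) + lastExtentSize (4)
+ // + nIndexes (4) then puts _indexes at offset 192, matching the other offset comments.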
+ public:
+ explicit NamespaceDetails( const DiskLoc &loc, bool _capped );
+
+ class Extra {
+ long long _next;
+ public:
+ IndexDetails details[NIndexesExtra];
+ private:
+ unsigned reserved2;
+ unsigned reserved3;
+ Extra(const Extra&) { verify(false); }
+ Extra& operator=(const Extra& r) { verify(false); return *this; }
+ public:
+ Extra() { }
+ long ofsFrom(NamespaceDetails *d) {
+ return ((char *) this) - ((char *) d);
+ }
+ void init() { memset(this, 0, sizeof(Extra)); }
+ Extra* next(const NamespaceDetails *d) const {
+ if( _next == 0 ) return 0;
+ return (Extra*) (((char *) d) + _next);
+ }
+ void setNext(OperationContext* txn, long ofs);
+ void copy(NamespaceDetails *d, const Extra& e) {
+ memcpy(this, &e, sizeof(Extra));
+ _next = 0;
+ }
+ };
+ Extra* extra() const {
+ if( _extraOffset == 0 ) return 0;
+ return (Extra *) (((char *) this) + _extraOffset);
+ }
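+ // Editor's note -- illustrative sketch, not part of the original change. Code that
+ // needs every Extra block (as writingWithExtra() does) walks the chain like this:
+ //
+ //     for ( Extra* e = extra(); e; e = e->next( this ) ) {
+ //         // each block contributes NIndexesExtra more IndexDetails slots
+ //     }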
+ /* add extra space for indexes when more than 10 */
+ Extra* allocExtra( OperationContext* txn,
+ const StringData& ns,
+ NamespaceIndex& ni,
+ int nindexessofar );
+
+ void copyingFrom( OperationContext* txn,
+ const StringData& thisns,
+ NamespaceIndex& ni,
+ NamespaceDetails *src); // must be called when renaming a NS to fix up extra
+
+ public:
+ void setMaxCappedDocs( OperationContext* txn, long long max );
+
+ enum UserFlags {
+ Flag_UsePowerOf2Sizes = 1 << 0
+ };
+
+ IndexDetails& idx(int idxNo, bool missingExpected = false );
+ const IndexDetails& idx(int idxNo, bool missingExpected = false ) const;
+
+ class IndexIterator {
+ public:
+ int pos() { return i; } // note this is the next one to come
+ bool more() { return i < n; }
+ const IndexDetails& next() { return d->idx(i++); }
+ private:
+ friend class NamespaceDetails;
+ int i, n;
+ const NamespaceDetails *d;
+ IndexIterator(const NamespaceDetails *_d, bool includeBackgroundInProgress);
+ };
+
+ IndexIterator ii( bool includeBackgroundInProgress = false ) const {
+ return IndexIterator(this, includeBackgroundInProgress);
+ }
+
+ /**
+ * This fetches the IndexDetails for the next empty index slot. The caller must populate
+ * returned object. This handles allocating extra index space, if necessary.
+ */
+ IndexDetails& getNextIndexDetails(OperationContext* txn, Collection* collection);
+
+ NamespaceDetails *writingWithoutExtra( OperationContext* txn );
+
+ /** Make all linked Extra objects writeable as well */
+ NamespaceDetails *writingWithExtra( OperationContext* txn );
+
+ /**
+ * Returns the offset of the specified index name within the array of indexes. Must be
+ * passed-in the owning collection to resolve the index record entries to objects.
+ *
+ * @return >= 0 (the index's position) if the index name was found, -1 otherwise.
+ */
+ int _catalogFindIndexByName(const Collection* coll,
+ const StringData& name,
+ bool includeBackgroundInProgress) const;
+
+ private:
+
+ /**
+ * swaps all meta data for 2 indexes
+ * a and b are 2 index ids, whose contents will be swapped
+ * must have a lock on the entire collection to do this
+ */
+ void swapIndex( OperationContext* txn, int a, int b );
+
+ friend class IndexCatalog;
+ friend class IndexCatalogEntry;
+
+ /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
+ void cappedTruncateLastDelUpdate();
+ BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
+ BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
+ }; // NamespaceDetails
+ BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
+#pragma pack()
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
new file mode 100644
index 00000000000..27957a297a5
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
@@ -0,0 +1,333 @@
+// namespace_details_collection_entry.cpp
+
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h"
+
+#include "mongo/db/index/index_descriptor.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
+#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h"
+#include "mongo/db/storage/record_store.h"
+#include "mongo/util/startup_test.h"
+
+namespace mongo {
+ NamespaceDetailsCollectionCatalogEntry::NamespaceDetailsCollectionCatalogEntry( const StringData& ns,
+ NamespaceDetails* details,
+ RecordStore* indexRecordStore,
+ MMAPV1DatabaseCatalogEntry* db )
+ : CollectionCatalogEntry( ns ),
+ _details( details ),
+ _indexRecordStore( indexRecordStore ),
+ _db( db ) {
+ }
+
+ CollectionOptions NamespaceDetailsCollectionCatalogEntry::getCollectionOptions(OperationContext* txn) const {
+ return _db->getCollectionOptions( txn, ns().ns() );
+ }
+
+ int NamespaceDetailsCollectionCatalogEntry::getTotalIndexCount() const {
+ return _details->nIndexes + _details->indexBuildsInProgress;
+ }
+
+ int NamespaceDetailsCollectionCatalogEntry::getCompletedIndexCount() const {
+ return _details->nIndexes;
+ }
+
+ int NamespaceDetailsCollectionCatalogEntry::getMaxAllowedIndexes() const {
+ return NamespaceDetails::NIndexesMax;
+ }
+
+ void NamespaceDetailsCollectionCatalogEntry::getAllIndexes( std::vector<std::string>* names ) const {
+ NamespaceDetails::IndexIterator i = _details->ii( true );
+ while ( i.more() ) {
+ const IndexDetails& id = i.next();
+ const BSONObj obj = _indexRecordStore->dataFor( id.info ).toBson();
+ names->push_back( obj.getStringField("name") );
+ }
+ }
+
+ bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(const StringData& idxName) const {
+ int idxNo = _findIndexNumber( idxName );
+ invariant( idxNo >= 0 );
+ return isIndexMultikey( idxNo );
+ }
+
+ bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(int idxNo) const {
+ return (_details->multiKeyIndexBits & (((unsigned long long) 1) << idxNo)) != 0;
+ }
+
+ bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
+ const StringData& indexName,
+ bool multikey ) {
+
+ int idxNo = _findIndexNumber( indexName );
+ invariant( idxNo >= 0 );
+ return setIndexIsMultikey( txn, idxNo, multikey );
+ }
+
+ bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
+ int idxNo,
+ bool multikey ) {
+ unsigned long long mask = 1ULL << idxNo;
+
+ if (multikey) {
+ // Shortcut if the bit is already set correctly
+ if (_details->multiKeyIndexBits & mask) {
+ return false;
+ }
+
+ *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) |= mask;
+ }
+ else {
+ // Shortcut if the bit is already set correctly
+ if (!(_details->multiKeyIndexBits & mask)) {
+ return false;
+ }
+
+ // Invert mask: all 1's except a 0 at the ith bit
+ mask = ~mask;
+ *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) &= mask;
+ }
+
+ return true;
+ }
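+ // Editor's note -- the bit manipulation above in isolation, for a hypothetical index
+ // number 3 (sketch only, not part of the original change):
+ //
+ //     unsigned long long bits = 0;
+ //     bits |=  ( 1ULL << 3 );    // mark index 3 as multikey
+ //     bits &= ~( 1ULL << 3 );    // clear it again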
+
+ DiskLoc NamespaceDetailsCollectionCatalogEntry::getIndexHead( const StringData& idxName ) const {
+ int idxNo = _findIndexNumber( idxName );
+ invariant( idxNo >= 0 );
+ return _details->idx( idxNo ).head;
+ }
+
+ BSONObj NamespaceDetailsCollectionCatalogEntry::getIndexSpec( const StringData& idxName ) const {
+ int idxNo = _findIndexNumber( idxName );
+ invariant( idxNo >= 0 );
+ const IndexDetails& id = _details->idx( idxNo );
+ return _indexRecordStore->dataFor( id.info ).toBson();
+ }
+
+ void NamespaceDetailsCollectionCatalogEntry::setIndexHead( OperationContext* txn,
+ const StringData& idxName,
+ const DiskLoc& newHead ) {
+ int idxNo = _findIndexNumber( idxName );
+ invariant( idxNo >= 0 );
+ *txn->recoveryUnit()->writing( &_details->idx( idxNo ).head) = newHead;
+ }
+
+ bool NamespaceDetailsCollectionCatalogEntry::isIndexReady( const StringData& idxName ) const {
+ int idxNo = _findIndexNumber( idxName );
+ invariant( idxNo >= 0 );
+ return idxNo < getCompletedIndexCount();
+ }
+
+ int NamespaceDetailsCollectionCatalogEntry::_findIndexNumber( const StringData& idxName ) const {
+ NamespaceDetails::IndexIterator i = _details->ii( true );
+ while ( i.more() ) {
+ const IndexDetails& id = i.next();
+ int idxNo = i.pos() - 1;
+ const BSONObj obj = _indexRecordStore->dataFor( id.info ).toBson();
+ if ( idxName == obj.getStringField("name") )
+ return idxNo;
+ }
+ return -1;
+ }
+
+ /* removes a bit from a bit array -- it actually removes the bit's slot rather than just clearing it
+ note: this function does not work with x == 63 -- that is ok
+ but keep in mind in the future if max indexes were extended to
+ exactly 64 it would be a problem
+ */
+ unsigned long long removeAndSlideBit(unsigned long long b, int x) {
+ unsigned long long tmp = b;
+ return
+ (tmp & ((((unsigned long long) 1) << x)-1)) |
+ ((tmp >> (x+1)) << x);
+ }
+
+ class IndexUpdateTest : public StartupTest {
+ public:
+ void run() {
+ verify( removeAndSlideBit(1, 0) == 0 );
+ verify( removeAndSlideBit(2, 0) == 1 );
+ verify( removeAndSlideBit(2, 1) == 0 );
+ verify( removeAndSlideBit(255, 1) == 127 );
+ verify( removeAndSlideBit(21, 2) == 9 );
+ verify( removeAndSlideBit(0x4000000000000001ULL, 62) == 1 );
+ }
+ } iu_unittest;
+
+ Status NamespaceDetailsCollectionCatalogEntry::removeIndex( OperationContext* txn,
+ const StringData& indexName ) {
+ int idxNo = _findIndexNumber( indexName );
+ if ( idxNo < 0 )
+ return Status( ErrorCodes::NamespaceNotFound, "index not found to remove" );
+
+ DiskLoc infoLocation = _details->idx( idxNo ).info;
+
+ { // sanity check
+ BSONObj info = _indexRecordStore->dataFor( infoLocation ).toBson();
+ invariant( info["name"].String() == indexName );
+ }
+
+ { // drop the namespace
+ string indexNamespace = IndexDescriptor::makeIndexNamespace( ns().ns(), indexName );
+ Status status = _db->dropCollection( txn, indexNamespace );
+ if ( !status.isOK() ) {
+ return status;
+ }
+ }
+
+ { // all info in the .ns file
+ NamespaceDetails* d = _details->writingWithExtra( txn );
+
+ // fix the _multiKeyIndexBits, by moving all bits above me down one
+ d->multiKeyIndexBits = removeAndSlideBit(d->multiKeyIndexBits, idxNo);
+
+ if ( idxNo >= d->nIndexes )
+ d->indexBuildsInProgress--;
+ else
+ d->nIndexes--;
+
+ for ( int i = idxNo; i < getTotalIndexCount(); i++ )
+ d->idx(i) = d->idx(i+1);
+
+ d->idx( getTotalIndexCount() ) = IndexDetails();
+ }
+
+ // remove from system.indexes
+ _indexRecordStore->deleteRecord( txn, infoLocation );
+
+ return Status::OK();
+ }
+
+ Status NamespaceDetailsCollectionCatalogEntry::prepareForIndexBuild( OperationContext* txn,
+ const IndexDescriptor* desc ) {
+ BSONObj spec = desc->infoObj();
+ // 1) entry in system.indexes
+ StatusWith<DiskLoc> systemIndexesEntry = _indexRecordStore->insertRecord( txn,
+ spec.objdata(),
+ spec.objsize(),
+ -1 );
+ if ( !systemIndexesEntry.isOK() )
+ return systemIndexesEntry.getStatus();
+
+ // 2) NamespaceDetails mods
+ IndexDetails *id;
+ try {
+ id = &_details->idx(getTotalIndexCount(), true);
+ }
+ catch( DBException& ) {
+ _details->allocExtra(txn,
+ ns().ns(),
+ _db->_namespaceIndex,
+ getTotalIndexCount());
+ id = &_details->idx(getTotalIndexCount(), false);
+ }
+
+ *txn->recoveryUnit()->writing( &id->info ) = systemIndexesEntry.getValue();
+ *txn->recoveryUnit()->writing( &id->head ) = DiskLoc();
+
+ txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) += 1;
+
+ // 3) indexes entry in .ns file
+ NamespaceIndex& nsi = _db->_namespaceIndex;
+ invariant( nsi.details( desc->indexNamespace() ) == NULL );
+ nsi.add_ns( txn, desc->indexNamespace(), DiskLoc(), false );
+
+ // 4) system.namespaces entry index ns
+ _db->_addNamespaceToNamespaceCollection( txn, desc->indexNamespace(), NULL);
+
+ return Status::OK();
+ }
+
+ void NamespaceDetailsCollectionCatalogEntry::indexBuildSuccess( OperationContext* txn,
+ const StringData& indexName ) {
+ int idxNo = _findIndexNumber( indexName );
+ fassert( 17202, idxNo >= 0 );
+
+ // Make sure the newly created index is relocated to nIndexes, if it isn't already there
+ if ( idxNo != getCompletedIndexCount() ) {
+ int toIdxNo = getCompletedIndexCount();
+
+ //_details->swapIndex( txn, idxNo, toIdxNo );
+
+ // flip main meta data
+ IndexDetails temp = _details->idx(idxNo);
+ *txn->recoveryUnit()->writing(&_details->idx(idxNo)) = _details->idx(toIdxNo);
+ *txn->recoveryUnit()->writing(&_details->idx(toIdxNo)) = temp;
+
+ // flip multi key bits
+ bool tempMultikey = isIndexMultikey(idxNo);
+ setIndexIsMultikey( txn, idxNo, isIndexMultikey(toIdxNo) );
+ setIndexIsMultikey( txn, toIdxNo, tempMultikey );
+
+ idxNo = toIdxNo;
+ invariant( idxNo == _findIndexNumber( indexName ) );
+ }
+
+ txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) -= 1;
+ txn->recoveryUnit()->writingInt( _details->nIndexes ) += 1;
+
+ invariant( isIndexReady( indexName ) );
+ }
+
+ void NamespaceDetailsCollectionCatalogEntry::updateTTLSetting( OperationContext* txn,
+ const StringData& idxName,
+ long long newExpireSeconds ) {
+ int idx = _findIndexNumber( idxName );
+ invariant( idx >= 0 );
+
+ IndexDetails& indexDetails = _details->idx( idx );
+
+ BSONObj obj = _indexRecordStore->dataFor( indexDetails.info ).toBson();
+ const BSONElement oldExpireSecs = obj.getField("expireAfterSeconds");
+
+ // Important that we set the new value in-place. We are writing directly to the
+ // object here so must be careful not to overwrite with a longer numeric type.
+
+ char* nonConstPtr = const_cast<char*>(oldExpireSecs.value());
+ switch( oldExpireSecs.type() ) {
+ case EOO:
+ massert( 16631, "index does not have an 'expireAfterSeconds' field", false );
+ break;
+ case NumberInt:
+ *txn->recoveryUnit()->writing(reinterpret_cast<int*>(nonConstPtr)) = newExpireSeconds;
+ break;
+ case NumberDouble:
+ *txn->recoveryUnit()->writing(reinterpret_cast<double*>(nonConstPtr)) = newExpireSeconds;
+ break;
+ case NumberLong:
+ *txn->recoveryUnit()->writing(reinterpret_cast<long long*>(nonConstPtr)) = newExpireSeconds;
+ break;
+ default:
+ massert( 16632, "current 'expireAfterSeconds' is not a number", false );
+ }
+ }
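+ // Editor's note -- worked example, not part of the original change. For a hypothetical
+ // spec { expireAfterSeconds: 3600 } stored as NumberInt, the value occupies 4 bytes inside
+ // the mapped record; writing an 8-byte long long at that address would clobber the bytes
+ // of whatever field follows, which is why the switch above matches the stored width.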
+
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
new file mode 100644
index 00000000000..78a5b96f181
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
@@ -0,0 +1,109 @@
+// namespace_details_collection_entry.h
+
+#pragma once
+
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/base/string_data.h"
+#include "mongo/bson/bsonobj.h"
+#include "mongo/db/catalog/collection_catalog_entry.h"
+#include "mongo/db/diskloc.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+
+ class MMAPV1DatabaseCatalogEntry;
+ class RecordStore;
+ class OperationContext;
+
+ class NamespaceDetailsCollectionCatalogEntry : public CollectionCatalogEntry {
+ public:
+ NamespaceDetailsCollectionCatalogEntry( const StringData& ns,
+ NamespaceDetails* details,
+ RecordStore* indexRecordStore,
+ MMAPV1DatabaseCatalogEntry* db );
+
+ virtual ~NamespaceDetailsCollectionCatalogEntry(){}
+
+ virtual CollectionOptions getCollectionOptions(OperationContext* txn) const;
+
+ virtual int getTotalIndexCount() const;
+
+ virtual int getCompletedIndexCount() const;
+
+ virtual int getMaxAllowedIndexes() const;
+
+ virtual void getAllIndexes( std::vector<std::string>* names ) const;
+
+ virtual BSONObj getIndexSpec( const StringData& idxName ) const;
+
+ virtual bool isIndexMultikey(const StringData& indexName) const;
+ virtual bool isIndexMultikey(int idxNo) const;
+
+ virtual bool setIndexIsMultikey(OperationContext* txn,
+ int idxNo,
+ bool multikey = true);
+ virtual bool setIndexIsMultikey(OperationContext* txn,
+ const StringData& indexName,
+ bool multikey = true);
+
+ virtual DiskLoc getIndexHead( const StringData& indexName ) const;
+
+ virtual void setIndexHead( OperationContext* txn,
+ const StringData& indexName,
+ const DiskLoc& newHead );
+
+ virtual bool isIndexReady( const StringData& indexName ) const;
+
+ virtual Status removeIndex( OperationContext* txn,
+ const StringData& indexName );
+
+ virtual Status prepareForIndexBuild( OperationContext* txn,
+ const IndexDescriptor* spec );
+
+ virtual void indexBuildSuccess( OperationContext* txn,
+ const StringData& indexName );
+
+ virtual void updateTTLSetting( OperationContext* txn,
+ const StringData& idxName,
+ long long newExpireSeconds );
+
+ // not part of interface, but available to my storage engine
+
+ int _findIndexNumber( const StringData& indexName) const;
+
+ private:
+ NamespaceDetails* _details;
+ RecordStore* _indexRecordStore;
+ MMAPV1DatabaseCatalogEntry* _db;
+
+ friend class MMAPV1DatabaseCatalogEntry;
+ };
+}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp
new file mode 100644
index 00000000000..2f168bd19a6
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp
@@ -0,0 +1,225 @@
+// namespace_details_rsv1_metadata.cpp
+
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/ops/update.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h"
+
+namespace mongo {
+ NamespaceDetailsRSV1MetaData::NamespaceDetailsRSV1MetaData( const StringData& ns,
+ NamespaceDetails* details,
+ RecordStore* namespaceRecordStore )
+ : _ns( ns.toString() ),
+ _details( details ),
+ _namespaceRecordStore( namespaceRecordStore ) {
+ }
+
+ const DiskLoc& NamespaceDetailsRSV1MetaData::capExtent() const {
+ return _details->capExtent;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setCapExtent( OperationContext* txn, const DiskLoc& loc ) {
+ *txn->recoveryUnit()->writing( &_details->capExtent ) = loc;
+ }
+
+ const DiskLoc& NamespaceDetailsRSV1MetaData::capFirstNewRecord() const {
+ return _details->capFirstNewRecord;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setCapFirstNewRecord( OperationContext* txn,
+ const DiskLoc& loc ) {
+ *txn->recoveryUnit()->writing( &_details->capFirstNewRecord ) = loc;
+ }
+
+ bool NamespaceDetailsRSV1MetaData::capLooped() const {
+ return _details->capFirstNewRecord.isValid();
+ }
+
+ long long NamespaceDetailsRSV1MetaData::dataSize() const {
+ return _details->stats.datasize;
+ }
+ long long NamespaceDetailsRSV1MetaData::numRecords() const {
+ return _details->stats.nrecords;
+ }
+
+ void NamespaceDetailsRSV1MetaData::incrementStats( OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement ) {
+ // durability todo : this could be a bit annoying / slow to record constantly
+ NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats );
+ s->datasize += dataSizeIncrement;
+ s->nrecords += numRecordsIncrement;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setStats( OperationContext* txn,
+ long long dataSize,
+ long long numRecords ) {
+ NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats );
+ s->datasize = dataSize;
+ s->nrecords = numRecords;
+ }
+
+ const DiskLoc& NamespaceDetailsRSV1MetaData::deletedListEntry( int bucket ) const {
+ return _details->deletedList[ bucket ];
+ }
+
+ void NamespaceDetailsRSV1MetaData::setDeletedListEntry( OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc ) {
+ *txn->recoveryUnit()->writing( &_details->deletedList[bucket] ) = loc;
+ }
+
+ void NamespaceDetailsRSV1MetaData::orphanDeletedList( OperationContext* txn ) {
+ for( int i = 0; i < Buckets; i++ ) {
+ setDeletedListEntry( txn, i, DiskLoc() );
+ }
+ }
+
+ const DiskLoc& NamespaceDetailsRSV1MetaData::firstExtent( OperationContext* txn ) const {
+ return _details->firstExtent;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setFirstExtent( OperationContext* txn, const DiskLoc& loc ) {
+ *txn->recoveryUnit()->writing( &_details->firstExtent ) = loc;
+ }
+
+ const DiskLoc& NamespaceDetailsRSV1MetaData::lastExtent( OperationContext* txn ) const {
+ return _details->lastExtent;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setLastExtent( OperationContext* txn, const DiskLoc& loc ) {
+ *txn->recoveryUnit()->writing( &_details->lastExtent ) = loc;
+ }
+
+ bool NamespaceDetailsRSV1MetaData::isCapped() const {
+ return _details->isCapped;
+ }
+
+ bool NamespaceDetailsRSV1MetaData::isUserFlagSet( int flag ) const {
+ return _details->userFlags & flag;
+ }
+
+ int NamespaceDetailsRSV1MetaData::userFlags() const {
+ return _details->userFlags;
+ }
+
+ bool NamespaceDetailsRSV1MetaData::setUserFlag( OperationContext* txn, int flag ) {
+ if ( ( _details->userFlags & flag ) == flag )
+ return false;
+
+ txn->recoveryUnit()->writingInt( _details->userFlags) |= flag;
+ _syncUserFlags( txn );
+ return true;
+ }
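+ // Editor's note -- illustrative usage, not part of the original change, for the one flag
+ // currently defined (NamespaceDetails::Flag_UsePowerOf2Sizes == 1 << 0); md is a
+ // hypothetical NamespaceDetailsRSV1MetaData instance:
+ //
+ //     md.setUserFlag( txn, NamespaceDetails::Flag_UsePowerOf2Sizes );
+ //     bool on = md.isUserFlagSet( NamespaceDetails::Flag_UsePowerOf2Sizes );   // true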
+
+ bool NamespaceDetailsRSV1MetaData::clearUserFlag( OperationContext* txn, int flag ) {
+ if ( ( _details->userFlags & flag ) == 0 )
+ return false;
+
+ txn->recoveryUnit()->writingInt(_details->userFlags) &= ~flag;
+ _syncUserFlags( txn );
+ return true;
+ }
+
+ bool NamespaceDetailsRSV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) {
+ if ( _details->userFlags == flags )
+ return false;
+
+ txn->recoveryUnit()->writingInt(_details->userFlags) = flags;
+ _syncUserFlags( txn );
+ return true;
+ }
+
+ int NamespaceDetailsRSV1MetaData::lastExtentSize( OperationContext* txn ) const {
+ return _details->lastExtentSize;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) {
+ if ( _details->lastExtentSize == newMax )
+ return;
+ txn->recoveryUnit()->writingInt(_details->lastExtentSize) = newMax;
+ }
+
+ long long NamespaceDetailsRSV1MetaData::maxCappedDocs() const {
+ invariant( _details->isCapped );
+ if ( _details->maxDocsInCapped == 0x7fffffff )
+ return numeric_limits<long long>::max();
+ return _details->maxDocsInCapped;
+ }
+
+ double NamespaceDetailsRSV1MetaData::paddingFactor() const {
+ return _details->paddingFactor;
+ }
+
+ void NamespaceDetailsRSV1MetaData::setPaddingFactor( OperationContext* txn, double paddingFactor ) {
+ if ( paddingFactor == _details->paddingFactor )
+ return;
+
+ if ( _details->isCapped )
+ return;
+
+ *txn->recoveryUnit()->writing(&_details->paddingFactor) = paddingFactor;
+ }
+
+ void NamespaceDetailsRSV1MetaData::_syncUserFlags( OperationContext* txn ) {
+ if ( !_namespaceRecordStore )
+ return;
+
+ scoped_ptr<RecordIterator> iterator( _namespaceRecordStore->getIterator( txn,
+ DiskLoc(),
+ false,
+ CollectionScanParams::FORWARD ) );
+ while ( !iterator->isEOF() ) {
+ DiskLoc loc = iterator->getNext();
+
+ BSONObj oldEntry = iterator->dataFor( loc ).toBson();
+ BSONElement e = oldEntry["name"];
+ if ( e.type() != String )
+ continue;
+
+ if ( e.String() != _ns )
+ continue;
+
+ BSONObj newEntry = applyUpdateOperators( oldEntry,
+ BSON( "$set" << BSON( "options.flags" << userFlags() ) ) );
+
+ StatusWith<DiskLoc> result = _namespaceRecordStore->updateRecord( txn,
+ loc,
+ newEntry.objdata(),
+ newEntry.objsize(),
+ -1,
+ NULL );
+ fassert( 17486, result.isOK() );
+ return;
+ }
+
+ fassertFailed( 17488 );
+ }
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h
new file mode 100644
index 00000000000..9f933d003e5
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h
@@ -0,0 +1,111 @@
+// namespace_details_rsv1_metadata.h
+
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "mongo/base/string_data.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_base.h"
+
+namespace mongo {
+
+ class RecordStore;
+
+ /*
+ * NOTE: NamespaceDetails will eventually become a plain struct;
+ * all the durability handling, etc. will move here
+ */
+ class NamespaceDetailsRSV1MetaData : public RecordStoreV1MetaData {
+ public:
+ explicit NamespaceDetailsRSV1MetaData( const StringData& ns,
+ NamespaceDetails* details,
+ RecordStore* namespaceRecordStore );
+
+ virtual ~NamespaceDetailsRSV1MetaData(){}
+
+ virtual const DiskLoc& capExtent() const;
+ virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc );
+
+ virtual const DiskLoc& capFirstNewRecord() const;
+ virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc );
+
+ virtual bool capLooped() const;
+
+ virtual long long dataSize() const;
+ virtual long long numRecords() const;
+
+ virtual void incrementStats( OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement );
+
+ virtual void setStats( OperationContext* txn,
+ long long dataSize,
+ long long numRecords );
+
+ virtual const DiskLoc& deletedListEntry( int bucket ) const;
+ virtual void setDeletedListEntry( OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc );
+ virtual void orphanDeletedList(OperationContext* txn);
+
+ virtual const DiskLoc& firstExtent( OperationContext* txn ) const;
+ virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc );
+
+ virtual const DiskLoc& lastExtent( OperationContext* txn ) const;
+ virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc );
+
+ virtual bool isCapped() const;
+
+ virtual bool isUserFlagSet( int flag ) const;
+ virtual int userFlags() const;
+ virtual bool setUserFlag( OperationContext* txn, int flag );
+ virtual bool clearUserFlag( OperationContext* txn, int flag );
+ virtual bool replaceUserFlags( OperationContext* txn, int flags );
+
+ virtual int lastExtentSize( OperationContext* txn ) const;
+ virtual void setLastExtentSize( OperationContext* txn, int newMax );
+
+ virtual long long maxCappedDocs() const;
+
+ virtual double paddingFactor() const;
+ virtual void setPaddingFactor( OperationContext* txn, double paddingFactor );
+
+ private:
+
+ void _syncUserFlags( OperationContext* txn );
+
+ std::string _ns;
+ NamespaceDetails* _details;
+ RecordStore* _namespaceRecordStore;
+ };
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp
new file mode 100644
index 00000000000..9bbf8ef6303
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp
@@ -0,0 +1,205 @@
+// namespace_index.cpp
+
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h"
+
+#include <boost/filesystem/operations.hpp>
+
+#include "mongo/db/d_concurrency.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
+#include "mongo/util/exit.h"
+#include "mongo/util/log.h"
+
+namespace mongo {
+
+ MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kIndexing);
+
+ NamespaceDetails* NamespaceIndex::details(const StringData& ns) {
+ Namespace n(ns);
+ return details(n);
+ }
+
+ NamespaceDetails* NamespaceIndex::details(const Namespace& ns) {
+ if ( !_ht.get() )
+ return 0;
+ return _ht->get(ns);
+ }
+
+ void NamespaceIndex::add_ns( OperationContext* txn,
+ const StringData& ns, const DiskLoc& loc, bool capped) {
+ NamespaceDetails details( loc, capped );
+ add_ns( txn, ns, &details );
+ }
+
+ void NamespaceIndex::add_ns( OperationContext* txn,
+ const StringData& ns, const NamespaceDetails* details ) {
+ Namespace n(ns);
+ add_ns( txn, n, details );
+ }
+
+ void NamespaceIndex::add_ns( OperationContext* txn,
+ const Namespace& ns, const NamespaceDetails* details ) {
+ string nsString = ns.toString();
+ txn->lockState()->assertWriteLocked( nsString );
+ massert( 17315, "no . in ns", nsString.find( '.' ) != string::npos );
+ init( txn );
+ uassert( 10081, "too many namespaces/collections", _ht->put(txn, ns, *details));
+ }
+
+ void NamespaceIndex::kill_ns( OperationContext* txn, const StringData& ns) {
+ txn->lockState()->assertWriteLocked(ns);
+ if ( !_ht.get() )
+ return;
+ Namespace n(ns);
+ _ht->kill(txn, n);
+
+ if (ns.size() <= Namespace::MaxNsColletionLen) {
+ // Larger namespace names don't have room for $extras so they can't exist. The code
+ // below would cause an "$extra: ns too large" error and stacktrace to be printed to the
+ // log even though everything is fine.
+ for( int i = 0; i<=1; i++ ) {
+ try {
+ Namespace extra(n.extraName(i));
+ _ht->kill(txn, extra);
+ }
+ catch(DBException&) {
+ LOG(3) << "caught exception in kill_ns" << endl;
+ }
+ }
+ }
+ }
+
+ bool NamespaceIndex::pathExists() const {
+ return boost::filesystem::exists(path());
+ }
+
+ boost::filesystem::path NamespaceIndex::path() const {
+ boost::filesystem::path ret( _dir );
+ if (storageGlobalParams.directoryperdb)
+ ret /= _database;
+ ret /= ( _database + ".ns" );
+ return ret;
+ }
+
+ static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , list<string>* l ) {
+ if ( ! k.hasDollarSign() || k == "local.oplog.$main" ) {
+ // we call out local.oplog.$main specifically as it's the only "normal"
+ // collection that has a $ in its name, so we make sure it gets added
+ l->push_back( k.toString() );
+ }
+ }
+
+ void NamespaceIndex::getCollectionNamespaces( list<string>* tofill ) const {
+ if ( _ht.get() )
+ _ht->iterAll( stdx::bind( namespaceGetNamespacesCallback,
+ stdx::placeholders::_1, stdx::placeholders::_2, tofill) );
+ }
+
+ void NamespaceIndex::maybeMkdir() const {
+ if (!storageGlobalParams.directoryperdb)
+ return;
+ boost::filesystem::path dir( _dir );
+ dir /= _database;
+ if ( !boost::filesystem::exists( dir ) )
+ MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " );
+ }
+
+ NOINLINE_DECL void NamespaceIndex::_init( OperationContext* txn ) {
+ verify( !_ht.get() );
+
+ txn->lockState()->assertWriteLocked(_database);
+
+ /* if someone manually deleted the datafiles for a database,
+ we need to be sure to clear any cached info for the database in
+ local.*.
+ */
+ /*
+ if ( "local" != _database ) {
+ DBInfo i(_database.c_str());
+ i.dbDropped();
+ }
+ */
+
+ unsigned long long len = 0;
+ boost::filesystem::path nsPath = path();
+ string pathString = nsPath.string();
+ void *p = 0;
+ if ( boost::filesystem::exists(nsPath) ) {
+ if( _f.open(pathString, true) ) {
+ len = _f.length();
+ if ( len % (1024*1024) != 0 ) {
+ log() << "bad .ns file: " << pathString << endl;
+ uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
+ }
+ p = _f.getView();
+ }
+ }
+ else {
+ // use storageGlobalParams.lenForNewNsFiles, we are making a new database
+ massert(10343, "bad storageGlobalParams.lenForNewNsFiles",
+ storageGlobalParams.lenForNewNsFiles >= 1024*1024);
+ maybeMkdir();
+ unsigned long long l = storageGlobalParams.lenForNewNsFiles;
+ if ( _f.create(pathString, l, true) ) {
+ // The writes done in this function must not be rolled back. If the containing
+ // UnitOfWork rolls back it should roll back to the state *after* these writes. This
+ // will leave the file empty, but available for future use. That is why we go
+ // directly to the global dur dirty list rather than going through the
+ // OperationContext.
+ getDur().createdFile(pathString, l); // always a new file
+ len = l;
+ verify(len == storageGlobalParams.lenForNewNsFiles);
+ p = _f.getView();
+
+ if ( p ) {
+ // we do this so the durability system isn't mad at us for
+ // only initiating the file and not doing a write
+ // grep for 17388
+ getDur().writingPtr( p, 5 ); // throw away
+ }
+ }
+ }
+
+ if ( p == 0 ) {
+ /** TODO: this shouldn't terminate? */
+ log() << "error couldn't open file " << pathString << " terminating" << endl;
+ dbexit( EXIT_FS );
+ }
+
+
+ verify( len <= 0x7fffffff );
+ _ht.reset(new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index"));
+ }
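+ // Editor's note -- rough capacity arithmetic, not part of the original change, assuming a
+ // hash table node is roughly a Namespace key (128 bytes) plus a NamespaceDetails value
+ // (496 bytes) plus a small header: a default 16MB .ns file then holds on the order of
+ // 16MB / ~628 bytes, about 26k slots, and with the table's fill factor the practical limit
+ // is somewhat lower (historically quoted as roughly 24000 namespaces).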
+
+
+}
+
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
new file mode 100644
index 00000000000..3ce2c2e0194
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
@@ -0,0 +1,94 @@
+// namespace_index.h
+
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <list>
+#include <string>
+
+#include "mongo/base/disallow_copying.h"
+#include "mongo/db/diskloc.h"
+#include "mongo/db/storage/mmap_v1/catalog/hashtab.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
+
+namespace mongo {
+
+ class NamespaceDetails;
+ class OperationContext;
+
+ /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
+ if you will: at least the core parts. (Additional info in system.* collections.)
+ */
+ class NamespaceIndex {
+ MONGO_DISALLOW_COPYING(NamespaceIndex);
+ public:
+ NamespaceIndex(const std::string &dir, const std::string &database) :
+ _ht( 0 ), _dir( dir ), _database( database ) {}
+
+ /* returns true if the .ns file represented by this object exists on disk */
+ bool pathExists() const;
+
+ void init( OperationContext* txn ) {
+ if ( !_ht.get() )
+ _init( txn );
+ }
+
+ void add_ns( OperationContext* txn,
+ const StringData& ns, const DiskLoc& loc, bool capped);
+ void add_ns( OperationContext* txn,
+ const StringData& ns, const NamespaceDetails* details );
+ void add_ns( OperationContext* txn,
+ const Namespace& ns, const NamespaceDetails* details );
+
+ NamespaceDetails* details(const StringData& ns);
+ NamespaceDetails* details(const Namespace& ns);
+
+ void kill_ns( OperationContext* txn,
+ const StringData& ns);
+
+ bool allocated() const { return _ht.get() != 0; }
+
+ void getCollectionNamespaces( std::list<std::string>* tofill ) const;
+
+ boost::filesystem::path path() const;
+
+ unsigned long long fileLength() const { return _f.length(); }
+
+ private:
+ void _init( OperationContext* txn );
+ void maybeMkdir() const;
+
+ DurableMappedFile _f;
+ scoped_ptr<HashTable<Namespace,NamespaceDetails> > _ht;
+ std::string _dir;
+ std::string _database;
+ };
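+ // Editor's note -- illustrative usage sketch, not part of the original change, assuming a
+ // write-locked OperationContext* txn; the directory and namespace are hypothetical:
+ //
+ //     NamespaceIndex ni( "/data/db", "test" );
+ //     ni.init( txn );                                   // maps (or creates) test.ns
+ //     NamespaceDetails* d = ni.details( "test.foo" );   // NULL if not present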
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp
new file mode 100644
index 00000000000..7c50b86a5bf
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp
@@ -0,0 +1,67 @@
+// namespace_test.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/unittest/unittest.h"
+
+#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
+
+namespace mongo {
+
+ TEST( NamespaceTest, Basics ) {
+ Namespace foo( "foo.bar" );
+ Namespace bar( "bar.foo" );
+
+ ASSERT_EQUALS( foo.toString(), foo.toString() );
+ ASSERT_EQUALS( foo.hash(), foo.hash() );
+
+ ASSERT_NOT_EQUALS( foo.hash(), bar.hash() );
+
+ ASSERT( foo == foo );
+ ASSERT( !( foo != foo ) );
+ ASSERT( foo != bar );
+ ASSERT( !( foo == bar ) );
+ }
+
+ TEST( NamespaceTest, ExtraName ) {
+ Namespace foo( "foo.bar" );
+ ASSERT_FALSE( foo.isExtra() );
+
+ string str0 = foo.extraName( 0 );
+ ASSERT_EQUALS( "foo.bar$extra", str0 );
+ Namespace ex0( str0 );
+ ASSERT_TRUE( ex0.isExtra() );
+
+ string str1 = foo.extraName( 1 );
+ ASSERT_EQUALS( "foo.bar$extrb", str1 );
+ Namespace ex1( str1 );
+ ASSERT_TRUE( ex1.isExtra() );
+
+ }
+}
diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.cpp b/src/mongo/db/storage/mmap_v1/dur_recover.cpp
index 9d4e679808a..52836e7977f 100644
--- a/src/mongo/db/storage/mmap_v1/dur_recover.cpp
+++ b/src/mongo/db/storage/mmap_v1/dur_recover.cpp
@@ -40,6 +40,7 @@
#include "mongo/db/catalog/database.h"
#include "mongo/db/db.h"
#include "mongo/db/storage/storage_engine.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
#include "mongo/db/storage/mmap_v1/dur.h"
#include "mongo/db/storage/mmap_v1/dur_commitjob.h"
#include "mongo/db/storage/mmap_v1/dur_journal.h"
diff --git a/src/mongo/db/storage/mmap_v1/extent.h b/src/mongo/db/storage/mmap_v1/extent.h
index 8a27e271c04..f009e283380 100644
--- a/src/mongo/db/storage/mmap_v1/extent.h
+++ b/src/mongo/db/storage/mmap_v1/extent.h
@@ -34,7 +34,7 @@
#include <vector>
#include "mongo/db/diskloc.h"
-#include "mongo/db/structure/catalog/namespace.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
namespace mongo {
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp
index 303ac49e507..f8ca6265c5f 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp
@@ -42,11 +42,12 @@
#include "mongo/db/pdfile_version.h"
#include "mongo/db/server_parameters.h"
#include "mongo/db/storage/mmap_v1/data_file.h"
-#include "mongo/db/structure/catalog/namespace_details.h"
-#include "mongo/db/structure/catalog/namespace_details_collection_entry.h"
-#include "mongo/db/structure/catalog/namespace_details_rsv1_metadata.h"
-#include "mongo/db/structure/record_store_v1_capped.h"
-#include "mongo/db/structure/record_store_v1_simple.h"
+#include "mongo/db/storage/mmap_v1/btree/btree_interface.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
namespace mongo {
@@ -444,7 +445,7 @@ namespace mongo {
void MMAPV1DatabaseCatalogEntry::_lazyInit( OperationContext* txn ) {
// this is sort of insane
- // it's because the whole structure is highly recursive
+ // it's because the whole storage/mmap_v1 is highly recursive
_namespaceIndex.init( txn );
@@ -682,13 +683,13 @@ namespace mongo {
rs = entry->recordStore.get();
}
- std::auto_ptr<BtreeInterface> btree(
- BtreeInterface::getInterface(entry->headManager(),
- rs,
- entry->ordering(),
- entry->descriptor()->indexNamespace(),
- entry->descriptor()->version(),
- &BtreeBasedAccessMethod::invalidateCursors));
+ std::auto_ptr<SortedDataInterface> btree(
+ getMMAPV1Interface(entry->headManager(),
+ rs,
+ entry->ordering(),
+ entry->descriptor()->indexNamespace(),
+ entry->descriptor()->version(),
+ &BtreeBasedAccessMethod::invalidateCursors));
if (IndexNames::HASHED == type)
return new HashAccessMethod( entry, btree.release() );
diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h
index 16a88b84ede..fa5a5874061 100644
--- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h
+++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h
@@ -33,8 +33,8 @@
#include "mongo/base/status.h"
#include "mongo/base/string_data.h"
#include "mongo/db/catalog/database_catalog_entry.h"
+#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h"
#include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h"
-#include "mongo/db/structure/catalog/namespace_index.h"
namespace mongo {
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp
new file mode 100644
index 00000000000..3a1bed72dd9
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp
@@ -0,0 +1,974 @@
+// record_store_v1_base.cpp
+
+/**
+ * Copyright (C) 2013-2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_base.h"
+
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/concurrency/lock_mgr.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h"
+#include "mongo/util/progress_meter.h"
+#include "mongo/util/timer.h"
+#include "mongo/util/touch_pages.h"
+
+namespace mongo {
+
+ const int RecordStoreV1Base::Buckets = 19;
+ const int RecordStoreV1Base::MaxBucket = 18;
+
+ /* Deleted list buckets are used to quickly locate free space based on size. Each bucket
+ contains records up to that size. All records >= 4mb are placed into the 16mb bucket.
+ */
+ const int RecordStoreV1Base::bucketSizes[] = {
+ 0x20, 0x40, 0x80, 0x100, // 32, 64, 128, 256
+ 0x200, 0x400, 0x800, 0x1000, // 512, 1K, 2K, 4K
+ 0x2000, 0x4000, 0x8000, 0x10000, // 8K, 16K, 32K, 64K
+ 0x20000, 0x40000, 0x80000, 0x100000, // 128K, 256K, 512K, 1M
+ 0x200000, 0x400000, 0x1000000, // 2M, 4M, 16M (see above)
+ };
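+
+    // A worked mapping for the table above (sizes chosen for illustration):
+    // a 100-byte deleted record goes in the 0x80 (128) bucket, a 600-byte
+    // record in the 0x400 (1K) bucket, and anything >= 4MB lands in the
+    // final 0x1000000 (16MB) bucket; see bucket() below for the lookup.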
+
+
+ RecordStoreV1Base::RecordStoreV1Base( const StringData& ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes )
+ : RecordStore( ns ),
+ _details( details ),
+ _extentManager( em ),
+ _isSystemIndexes( isSystemIndexes ) {
+ }
+
+ RecordStoreV1Base::~RecordStoreV1Base() {
+ }
+
+
+ int64_t RecordStoreV1Base::storageSize( OperationContext* txn,
+ BSONObjBuilder* extraInfo,
+ int level ) const {
+ BSONArrayBuilder extentInfo;
+
+ int64_t total = 0;
+ int n = 0;
+
+ DiskLoc cur = _details->firstExtent(txn);
+
+ while ( !cur.isNull() ) {
+ Extent* e = _extentManager->getExtent( cur );
+
+ total += e->length;
+ n++;
+
+ if ( extraInfo && level > 0 ) {
+                extentInfo.append( BSON( "len" << e->length << "loc" << e->myLoc.toBSONObj() ) );
+ }
+ cur = e->xnext;
+ }
+
+ if ( extraInfo ) {
+ extraInfo->append( "numExtents", n );
+ if ( level > 0 )
+ extraInfo->append( "extents", extentInfo.arr() );
+ }
+
+ return total;
+ }
+
+ RecordData RecordStoreV1Base::dataFor( const DiskLoc& loc ) const {
+ return recordFor(loc)->toRecordData();
+ }
+
+ Record* RecordStoreV1Base::recordFor( const DiskLoc& loc ) const {
+ return _extentManager->recordForV1( loc );
+ }
+
+ const DeletedRecord* RecordStoreV1Base::deletedRecordFor( const DiskLoc& loc ) const {
+ invariant( loc.a() != -1 );
+ return reinterpret_cast<const DeletedRecord*>( recordFor( loc ) );
+ }
+
+ DeletedRecord* RecordStoreV1Base::drec( const DiskLoc& loc ) const {
+ invariant( loc.a() != -1 );
+ return reinterpret_cast<DeletedRecord*>( recordFor( loc ) );
+ }
+
+ Extent* RecordStoreV1Base::_getExtent( OperationContext* txn, const DiskLoc& loc ) const {
+ return _extentManager->getExtent( loc );
+ }
+
+ DiskLoc RecordStoreV1Base::_getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const {
+ return _extentManager->extentLocForV1( loc );
+ }
+
+
+ DiskLoc RecordStoreV1Base::getNextRecord( OperationContext* txn, const DiskLoc& loc ) const {
+ DiskLoc next = getNextRecordInExtent( txn, loc );
+ if ( !next.isNull() ) {
+ return next;
+ }
+
+ // now traverse extents
+
+ Extent* e = _getExtent( txn, _getExtentLocForRecord(txn, loc) );
+ while ( 1 ) {
+ if ( e->xnext.isNull() )
+ return DiskLoc(); // end of collection
+ e = _getExtent( txn, e->xnext );
+ if ( !e->firstRecord.isNull() )
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->firstRecord;
+ }
+
+ DiskLoc RecordStoreV1Base::getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const {
+ DiskLoc prev = getPrevRecordInExtent( txn, loc );
+ if ( !prev.isNull() ) {
+ return prev;
+ }
+
+ // now traverse extents
+
+ Extent *e = _getExtent(txn, _getExtentLocForRecord(txn, loc));
+ while ( 1 ) {
+ if ( e->xprev.isNull() )
+ return DiskLoc(); // end of collection
+ e = _getExtent( txn, e->xprev );
+ if ( !e->firstRecord.isNull() )
+ break;
+ // entire extent could be empty, keep looking
+ }
+ return e->lastRecord;
+
+ }
+
+ DiskLoc RecordStoreV1Base::_findFirstSpot( OperationContext* txn,
+ const DiskLoc& extDiskLoc,
+ Extent* e ) {
+ DiskLoc emptyLoc = extDiskLoc;
+ emptyLoc.inc( Extent::HeaderSize() );
+ int delRecLength = e->length - Extent::HeaderSize();
+ if ( delRecLength >= 32*1024 && _ns.find('$') != string::npos && !isCapped() ) {
+ // probably an index. so skip forward to keep its records page aligned
+ int& ofs = emptyLoc.GETOFS();
+ int newOfs = (ofs + 0xfff) & ~0xfff;
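+            // e.g. an offset of 0x12a8 becomes (0x12a8 + 0xfff) & ~0xfff = 0x2000,
+            // the next 4KB page boundary (illustrative value).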
+ delRecLength -= (newOfs-ofs);
+ dassert( delRecLength > 0 );
+ ofs = newOfs;
+ }
+
+ DeletedRecord* empty = txn->recoveryUnit()->writing(drec(emptyLoc));
+ empty->lengthWithHeaders() = delRecLength;
+ empty->extentOfs() = e->myLoc.getOfs();
+ empty->nextDeleted().Null();
+ return emptyLoc;
+
+ }
+
+ DiskLoc RecordStoreV1Base::getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const {
+ int nextOffset = recordFor( loc )->nextOfs();
+
+ if ( nextOffset == DiskLoc::NullOfs )
+ return DiskLoc();
+
+ fassert( 17441, abs(nextOffset) >= 8 ); // defensive
+ DiskLoc result( loc.a(), nextOffset );
+ return result;
+ }
+
+ DiskLoc RecordStoreV1Base::getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const {
+ int prevOffset = recordFor( loc )->prevOfs();
+
+ if ( prevOffset == DiskLoc::NullOfs )
+ return DiskLoc();
+
+ fassert( 17442, abs(prevOffset) >= 8 ); // defensive
+ DiskLoc result( loc.a(), prevOffset );
+ return result;
+ }
+
+
+ StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota ) {
+ int docSize = doc->documentSize();
+ if ( docSize < 4 ) {
+ return StatusWith<DiskLoc>( ErrorCodes::InvalidLength,
+ "record has to be >= 4 bytes" );
+ }
+ int lenWHdr = docSize + Record::HeaderSize;
+ if ( doc->addPadding() )
+ lenWHdr = getRecordAllocationSize( lenWHdr );
+
+ StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota );
+ if ( !loc.isOK() )
+ return loc;
+
+ Record *r = recordFor( loc.getValue() );
+ fassert( 17319, r->lengthWithHeaders() >= lenWHdr );
+
+ r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
+ doc->writeDocument( r->data() );
+
+ _addRecordToRecListInExtent(txn, r, loc.getValue());
+
+ _details->incrementStats( txn, r->netLength(), 1 );
+
+ _paddingFits( txn );
+
+ return loc;
+ }
+
+
+ StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota ) {
+ if ( len < 4 ) {
+ return StatusWith<DiskLoc>( ErrorCodes::InvalidLength,
+ "record has to be >= 4 bytes" );
+ }
+
+ StatusWith<DiskLoc> status = _insertRecord( txn, data, len, enforceQuota );
+ if ( status.isOK() )
+ _paddingFits( txn );
+
+ return status;
+ }
+
+ StatusWith<DiskLoc> RecordStoreV1Base::_insertRecord( OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota ) {
+
+ int lenWHdr = getRecordAllocationSize( len + Record::HeaderSize );
+ fassert( 17208, lenWHdr >= ( len + Record::HeaderSize ) );
+
+ StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota );
+ if ( !loc.isOK() )
+ return loc;
+
+ Record *r = recordFor( loc.getValue() );
+ fassert( 17210, r->lengthWithHeaders() >= lenWHdr );
+
+ // copy the data
+ r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
+ memcpy( r->data(), data, len );
+
+ _addRecordToRecListInExtent(txn, r, loc.getValue());
+
+ _details->incrementStats( txn, r->netLength(), 1 );
+
+ return loc;
+ }
+
+ StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn,
+ const DiskLoc& oldLocation,
+ const char* data,
+ int dataSize,
+ bool enforceQuota,
+ UpdateMoveNotifier* notifier ) {
+ Record* oldRecord = recordFor( oldLocation );
+ if ( oldRecord->netLength() >= dataSize ) {
+ // we fit
+ _paddingFits( txn );
+ memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize );
+ return StatusWith<DiskLoc>( oldLocation );
+ }
+
+ if ( isCapped() )
+ return StatusWith<DiskLoc>( ErrorCodes::InternalError,
+ "failing update: objects in a capped ns cannot grow",
+ 10003 );
+
+ // we have to move
+
+ _paddingTooSmall( txn );
+
+ StatusWith<DiskLoc> newLocation = _insertRecord( txn, data, dataSize, enforceQuota );
+ if ( !newLocation.isOK() )
+ return newLocation;
+
+ // insert worked, so we delete old record
+ if ( notifier ) {
+ Status moveStatus = notifier->recordStoreGoingToMove( txn,
+ oldLocation,
+ oldRecord->data(),
+ oldRecord->netLength() );
+ if ( !moveStatus.isOK() )
+ return StatusWith<DiskLoc>( moveStatus );
+ }
+
+ deleteRecord( txn, oldLocation );
+
+ return newLocation;
+ }
+
+
+ Status RecordStoreV1Base::updateWithDamages( OperationContext* txn,
+ const DiskLoc& loc,
+ const char* damageSource,
+ const mutablebson::DamageVector& damages ) {
+ _paddingFits( txn );
+
+ Record* rec = recordFor( loc );
+ char* root = rec->data();
+
+        // All damages are applied in place; route each write through the
+        // recovery unit's writingPtr so that it is journaled.
+ mutablebson::DamageVector::const_iterator where = damages.begin();
+ const mutablebson::DamageVector::const_iterator end = damages.end();
+ for( ; where != end; ++where ) {
+ const char* sourcePtr = damageSource + where->sourceOffset;
+ void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size);
+ std::memcpy(targetPtr, sourcePtr, where->size);
+ }
+
+ return Status::OK();
+ }
+
+ void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) {
+
+ Record* todelete = recordFor( dl );
+ invariant( todelete->netLength() >= 4 ); // this is required for defensive code
+
+ /* remove ourself from the record next/prev chain */
+ {
+ if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
+ DiskLoc prev = getPrevRecordInExtent( txn, dl );
+ Record* prevRecord = recordFor( prev );
+ txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
+ }
+
+ if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
+ DiskLoc next = getNextRecord( txn, dl );
+ Record* nextRecord = recordFor( next );
+ txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
+ }
+ }
+
+ /* remove ourself from extent pointers */
+ {
+ DiskLoc extentLoc = todelete->myExtentLoc(dl);
+ Extent *e = _getExtent( txn, extentLoc );
+ if ( e->firstRecord == dl ) {
+ txn->recoveryUnit()->writing(&e->firstRecord);
+ if ( todelete->nextOfs() == DiskLoc::NullOfs )
+ e->firstRecord.Null();
+ else
+ e->firstRecord.set(dl.a(), todelete->nextOfs() );
+ }
+ if ( e->lastRecord == dl ) {
+ txn->recoveryUnit()->writing(&e->lastRecord);
+ if ( todelete->prevOfs() == DiskLoc::NullOfs )
+ e->lastRecord.Null();
+ else
+ e->lastRecord.set(dl.a(), todelete->prevOfs() );
+ }
+ }
+
+ /* add to the free list */
+ {
+ _details->incrementStats( txn, -1 * todelete->netLength(), -1 );
+
+ if ( _isSystemIndexes ) {
+                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
+                   careful until this is validated more, as IndexDetails has pointers
+                   to this disk location, so an incorrectly done remove would cause
+                   a lot of problems.
+                */
+ memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ),
+ 0, todelete->lengthWithHeaders() );
+ }
+ else {
+ // this is defensive so we can detect if we are still using a location
+ // that was deleted
+ memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
+ addDeletedRec(txn, dl);
+ }
+ }
+
+ }
+
+ RecordIterator* RecordStoreV1Base::getIteratorForRepair(OperationContext* txn) const {
+ return new RecordStoreV1RepairIterator(txn, this);
+ }
+
+ void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn,
+ Record *r,
+ DiskLoc loc) {
+ dassert( recordFor(loc) == r );
+ DiskLoc extentLoc = _getExtentLocForRecord( txn, loc );
+ Extent *e = _getExtent( txn, extentLoc );
+ if ( e->lastRecord.isNull() ) {
+ *txn->recoveryUnit()->writing(&e->firstRecord) = loc;
+ *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
+ r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs;
+ }
+ else {
+ Record *oldlast = recordFor(e->lastRecord);
+ r->prevOfs() = e->lastRecord.getOfs();
+ r->nextOfs() = DiskLoc::NullOfs;
+ txn->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs();
+ *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
+ }
+ }
+
+ void RecordStoreV1Base::increaseStorageSize( OperationContext* txn,
+ int size,
+ bool enforceQuota ) {
+ DiskLoc eloc = _extentManager->allocateExtent( txn,
+ isCapped(),
+ size,
+ enforceQuota );
+ Extent *e = _extentManager->getExtent( eloc );
+ invariant( e );
+
+ *txn->recoveryUnit()->writing( &e->nsDiagnostic ) = _ns;
+
+ txn->recoveryUnit()->writing( &e->xnext )->Null();
+ txn->recoveryUnit()->writing( &e->xprev )->Null();
+ txn->recoveryUnit()->writing( &e->firstRecord )->Null();
+ txn->recoveryUnit()->writing( &e->lastRecord )->Null();
+
+ DiskLoc emptyLoc = _findFirstSpot( txn, eloc, e );
+
+ if ( _details->lastExtent(txn).isNull() ) {
+ invariant( _details->firstExtent(txn).isNull() );
+ _details->setFirstExtent( txn, eloc );
+ _details->setLastExtent( txn, eloc );
+ _details->setCapExtent( txn, eloc );
+ invariant( e->xprev.isNull() );
+ invariant( e->xnext.isNull() );
+ }
+ else {
+ invariant( !_details->firstExtent(txn).isNull() );
+ *txn->recoveryUnit()->writing(&e->xprev) = _details->lastExtent(txn);
+ *txn->recoveryUnit()->writing(&_extentManager->getExtent(_details->lastExtent(txn))->xnext) = eloc;
+ _details->setLastExtent( txn, eloc );
+ }
+
+ _details->setLastExtentSize( txn, e->length );
+
+ addDeletedRec(txn, emptyLoc);
+ }
+
+ Status RecordStoreV1Base::validate( OperationContext* txn,
+ bool full, bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results, BSONObjBuilder* output ) const {
+
+        // 1) basic stats that require no iteration
+ // 2) extent level info
+ // 3) check extent start and end
+ // 4) check each non-deleted record
+ // 5) check deleted list
+
+ // -------------
+
+ // 1111111111111111111
+ if ( isCapped() ){
+ output->appendBool("capped", true);
+ output->appendNumber("max", _details->maxCappedDocs());
+ }
+
+ output->appendNumber("datasize", _details->dataSize());
+ output->appendNumber("nrecords", _details->numRecords());
+ output->appendNumber("lastExtentSize", _details->lastExtentSize(txn));
+ output->appendNumber("padding", _details->paddingFactor());
+
+ if ( _details->firstExtent(txn).isNull() )
+ output->append( "firstExtent", "null" );
+ else
+ output->append( "firstExtent",
+ str::stream() << _details->firstExtent(txn).toString()
+ << " ns:"
+ << _getExtent( txn, _details->firstExtent(txn) )->nsDiagnostic.toString());
+ if ( _details->lastExtent(txn).isNull() )
+ output->append( "lastExtent", "null" );
+ else
+ output->append( "lastExtent", str::stream() << _details->lastExtent(txn).toString()
+ << " ns:"
+ << _getExtent( txn, _details->lastExtent(txn) )->nsDiagnostic.toString());
+
+ // 22222222222222222222222222
+ { // validate extent basics
+ BSONArrayBuilder extentData;
+ int extentCount = 0;
+ DiskLoc extentDiskLoc;
+ try {
+ if ( !_details->firstExtent(txn).isNull() ) {
+ _getExtent( txn, _details->firstExtent(txn) )->assertOk();
+ _getExtent( txn, _details->lastExtent(txn) )->assertOk();
+ }
+
+ extentDiskLoc = _details->firstExtent(txn);
+ while (!extentDiskLoc.isNull()) {
+ Extent* thisExtent = _getExtent( txn, extentDiskLoc );
+ if (full) {
+ extentData << thisExtent->dump();
+ }
+ if (!thisExtent->validates(extentDiskLoc, &results->errors)) {
+ results->valid = false;
+ }
+ DiskLoc nextDiskLoc = thisExtent->xnext;
+
+ if (extentCount > 0 && !nextDiskLoc.isNull()
+ && _getExtent( txn, nextDiskLoc )->xprev != extentDiskLoc) {
+ StringBuilder sb;
+ sb << "'xprev' pointer " << _getExtent( txn, nextDiskLoc )->xprev.toString()
+ << " in extent " << nextDiskLoc.toString()
+ << " does not point to extent " << extentDiskLoc.toString();
+ results->errors.push_back( sb.str() );
+ results->valid = false;
+ }
+ if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) {
+ StringBuilder sb;
+ sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString()
+ << " does not point to last extent in list " << extentDiskLoc.toString();
+ results->errors.push_back( sb.str() );
+ results->valid = false;
+ }
+ extentDiskLoc = nextDiskLoc;
+ extentCount++;
+ txn->checkForInterrupt();
+ }
+ }
+ catch (const DBException& e) {
+ StringBuilder sb;
+ sb << "exception validating extent " << extentCount
+ << ": " << e.what();
+ results->errors.push_back( sb.str() );
+ results->valid = false;
+ return Status::OK();
+ }
+ output->append("extentCount", extentCount);
+
+ if ( full )
+ output->appendArray( "extents" , extentData.arr() );
+
+ }
+
+ try {
+ // 333333333333333333333333333
+ bool testingLastExtent = false;
+ try {
+ DiskLoc firstExtentLoc = _details->firstExtent(txn);
+ if (firstExtentLoc.isNull()) {
+ // this is ok
+ }
+ else {
+ output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump());
+ if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) {
+ StringBuilder sb;
+ sb << "'xprev' pointer in 'firstExtent' " << _details->firstExtent(txn).toString()
+ << " is " << _getExtent(txn, firstExtentLoc)->xprev.toString()
+ << ", should be null";
+ results->errors.push_back( sb.str() );
+ results->valid = false;
+ }
+ }
+ testingLastExtent = true;
+ DiskLoc lastExtentLoc = _details->lastExtent(txn);
+ if (lastExtentLoc.isNull()) {
+ // this is ok
+ }
+ else {
+ if (firstExtentLoc != lastExtentLoc) {
+ output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump());
+ if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) {
+ StringBuilder sb;
+ sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString()
+ << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString()
+ << ", should be null";
+ results->errors.push_back( sb.str() );
+ results->valid = false;
+ }
+ }
+ }
+ }
+ catch (const DBException& e) {
+ StringBuilder sb;
+ sb << "exception processing '"
+ << (testingLastExtent ? "lastExtent" : "firstExtent")
+ << "': " << e.what();
+ results->errors.push_back( sb.str() );
+ results->valid = false;
+ }
+
+ // 4444444444444444444444444
+
+ set<DiskLoc> recs;
+ if( scanData ) {
+ int n = 0;
+ int nInvalid = 0;
+ long long nQuantizedSize = 0;
+ long long nPowerOf2QuantizedSize = 0;
+ long long len = 0;
+ long long nlen = 0;
+ long long bsonLen = 0;
+ int outOfOrder = 0;
+ DiskLoc cl_last;
+
+ scoped_ptr<RecordIterator> iterator( getIterator( txn,
+ DiskLoc(),
+ false,
+ CollectionScanParams::FORWARD ) );
+ DiskLoc cl;
+ while ( !( cl = iterator->getNext() ).isNull() ) {
+ n++;
+
+ if ( n < 1000000 )
+ recs.insert(cl);
+ if ( isCapped() ) {
+ if ( cl < cl_last )
+ outOfOrder++;
+ cl_last = cl;
+ }
+
+ Record *r = recordFor(cl);
+ len += r->lengthWithHeaders();
+ nlen += r->netLength();
+
+ if ( r->lengthWithHeaders() ==
+ quantizeAllocationSpace( r->lengthWithHeaders() ) ) {
+ // Count the number of records having a size consistent with
+ // the quantizeAllocationSpace quantization implementation.
+ ++nQuantizedSize;
+ }
+
+ if ( r->lengthWithHeaders() ==
+ quantizePowerOf2AllocationSpace( r->lengthWithHeaders() ) ) {
+ // Count the number of records having a size consistent with the
+ // quantizePowerOf2AllocationSpace quantization implementation.
+ ++nPowerOf2QuantizedSize;
+ }
+
+ if (full){
+ size_t dataSize = 0;
+ const Status status = adaptor->validate( r->toRecordData(), &dataSize );
+ if (!status.isOK()) {
+ results->valid = false;
+ if (nInvalid == 0) // only log once;
+ results->errors.push_back( "invalid object detected (see logs)" );
+
+ nInvalid++;
+ log() << "Invalid object detected in " << _ns
+ << ": " << status.reason();
+ }
+ else {
+ bsonLen += dataSize;
+ }
+ }
+ }
+
+ if ( isCapped() && !_details->capLooped() ) {
+ output->append("cappedOutOfOrder", outOfOrder);
+ if ( outOfOrder > 1 ) {
+ results->valid = false;
+ results->errors.push_back( "too many out of order records" );
+ }
+ }
+ output->append("objectsFound", n);
+
+ if (full) {
+ output->append("invalidObjects", nInvalid);
+ }
+
+ output->appendNumber("nQuantizedSize", nQuantizedSize);
+ output->appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize);
+ output->appendNumber("bytesWithHeaders", len);
+ output->appendNumber("bytesWithoutHeaders", nlen);
+
+ if (full) {
+ output->appendNumber("bytesBson", bsonLen);
+ }
+ } // end scanData
+
+ // 55555555555555555555555555
+ BSONArrayBuilder deletedListArray;
+ for ( int i = 0; i < Buckets; i++ ) {
+ deletedListArray << _details->deletedListEntry(i).isNull();
+ }
+
+ int ndel = 0;
+ long long delSize = 0;
+ BSONArrayBuilder delBucketSizes;
+ int incorrect = 0;
+ for ( int i = 0; i < Buckets; i++ ) {
+ DiskLoc loc = _details->deletedListEntry(i);
+ try {
+ int k = 0;
+ while ( !loc.isNull() ) {
+ if ( recs.count(loc) )
+ incorrect++;
+ ndel++;
+
+ if ( loc.questionable() ) {
+ if( isCapped() && !loc.isValid() && i == 1 ) {
+ /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
+ see comments in namespace.h
+ */
+ break;
+ }
+
+ string err( str::stream() << "bad pointer in deleted record list: "
+ << loc.toString()
+ << " bucket: " << i
+ << " k: " << k );
+ results->errors.push_back( err );
+ results->valid = false;
+ break;
+ }
+
+ const DeletedRecord* d = deletedRecordFor(loc);
+ delSize += d->lengthWithHeaders();
+ loc = d->nextDeleted();
+ k++;
+ txn->checkForInterrupt();
+ }
+ delBucketSizes << k;
+ }
+ catch (...) {
+ results->errors.push_back( (string)"exception in deleted chain for bucket " +
+ BSONObjBuilder::numStr(i) );
+ results->valid = false;
+ }
+ }
+ output->appendNumber("deletedCount", ndel);
+ output->appendNumber("deletedSize", delSize);
+ if ( full ) {
+ output->append( "delBucketSizes", delBucketSizes.arr() );
+ }
+
+ if ( incorrect ) {
+ results->errors.push_back( BSONObjBuilder::numStr(incorrect) +
+ " records from datafile are in deleted list" );
+ results->valid = false;
+ }
+
+ }
+        catch (const AssertionException&) {
+ results->errors.push_back( "exception during validate" );
+ results->valid = false;
+ }
+
+ return Status::OK();
+ }
+
+ void RecordStoreV1Base::appendCustomStats( OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale ) const {
+ result->append( "lastExtentSize", _details->lastExtentSize(txn) / scale );
+ result->append( "paddingFactor", _details->paddingFactor() );
+ result->append( "userFlags", _details->userFlags() );
+
+ if ( isCapped() ) {
+ result->appendBool( "capped", true );
+ result->appendNumber( "max", _details->maxCappedDocs() );
+ }
+ }
+
+
+ namespace {
+ struct touch_location {
+ const char* root;
+ size_t length;
+ };
+ }
+
+ Status RecordStoreV1Base::touch( OperationContext* txn, BSONObjBuilder* output ) const {
+ Timer t;
+
+ std::vector<touch_location> ranges;
+ {
+ DiskLoc nextLoc = _details->firstExtent(txn);
+ Extent* ext = _getExtent( txn, nextLoc );
+ while ( ext ) {
+ touch_location tl;
+ tl.root = reinterpret_cast<const char*>(ext);
+ tl.length = ext->length;
+ ranges.push_back(tl);
+
+ nextLoc = ext->xnext;
+ if ( nextLoc.isNull() )
+ ext = NULL;
+ else
+ ext = _getExtent( txn, nextLoc );
+ }
+ }
+
+ std::string progress_msg = "touch " + std::string(txn->getNS()) + " extents";
+ ProgressMeterHolder pm(*txn->setMessage(progress_msg.c_str(),
+ "Touch Progress",
+ ranges.size()));
+
+ for ( std::vector<touch_location>::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
+ touch_pages( it->root, it->length );
+ pm.hit();
+ txn->checkForInterrupt();
+ }
+ pm.finished();
+
+ if ( output ) {
+ output->append( "numRanges", static_cast<int>( ranges.size() ) );
+ output->append( "millis", t.millis() );
+ }
+
+ return Status::OK();
+ }
+
+ int RecordStoreV1Base::getRecordAllocationSize( int minRecordSize ) const {
+
+ if ( isCapped() )
+ return minRecordSize;
+
+ invariant( _details->paddingFactor() >= 1 );
+
+ if ( _details->isUserFlagSet( Flag_UsePowerOf2Sizes ) ) {
+ // quantize to the nearest bucketSize (or nearest 1mb boundary for large sizes).
+ return quantizePowerOf2AllocationSpace(minRecordSize);
+ }
+
+ // adjust for padding factor
+ return static_cast<int>(minRecordSize * _details->paddingFactor());
+ }
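+
+    // Worked example of the two strategies above (numbers are illustrative):
+    // minRecordSize = 300 with paddingFactor = 1.5 yields 450, while the same
+    // request with Flag_UsePowerOf2Sizes set quantizes up to the 512-byte
+    // bucket via quantizePowerOf2AllocationSpace().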
+
+ DiskLoc RecordStoreV1Base::IntraExtentIterator::getNext() {
+ if (_curr.isNull())
+ return DiskLoc();
+
+ const DiskLoc out = _curr; // we always return where we were, not where we will be.
+ const Record* rec = recordFor(_curr);
+ const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs();
+ _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs));
+ return out;
+ }
+
+ void RecordStoreV1Base::IntraExtentIterator::invalidate(const DiskLoc& dl) {
+ if (dl == _curr) {
+ getNext();
+ }
+ }
+
+ /* @return the size for an allocated record quantized to 1/16th of the BucketSize
+ @param allocSize requested size to allocate
+ */
+ int RecordStoreV1Base::quantizeAllocationSpace(int allocSize) {
+ const int bucketIdx = bucket(allocSize);
+ int bucketSize = bucketSizes[bucketIdx];
+ int quantizeUnit = bucketSize / 16;
+ if (allocSize >= (1 << 22)) // 4mb
+            // all allocations >= 4mb result in 4mb/16 quantization units, even if >= 8mb. The idea
+            // is to reduce the quantization overhead of large records at the cost of increasing the
+            // DeletedRecord size distribution in the largest bucket by a factor of 4.
+ quantizeUnit = (1 << 18); // 256k
+ if (allocSize % quantizeUnit == 0)
+ // size is already quantized
+ return allocSize;
+ const int quantizedSpace = (allocSize | (quantizeUnit - 1)) + 1;
+ fassert(16484, quantizedSpace >= allocSize);
+ return quantizedSpace;
+ }
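+
+    // Worked example (illustrative values): allocSize = 1000 falls in the 1024
+    // bucket, so quantizeUnit = 1024/16 = 64 and the result is
+    // (1000 | 63) + 1 = 1024; allocSize = 1050 falls in the 2048 bucket, so
+    // quantizeUnit = 128 and the result is (1050 | 127) + 1 = 1152.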
+
+ int RecordStoreV1Base::quantizePowerOf2AllocationSpace(int allocSize) {
+ for ( int i = 0; i < MaxBucket; i++ ) { // skips the largest (16MB) bucket
+ if ( bucketSizes[i] >= allocSize ) {
+ // Return the size of the first bucket sized >= the requested size.
+ return bucketSizes[i];
+ }
+ }
+
+ // if we get here, it means we're allocating more than 4mb, so round up
+ // to the nearest megabyte >= allocSize
+ const int MB = 1024*1024;
+ invariant(allocSize > 4*MB);
+ return (allocSize + (MB - 1)) & ~(MB - 1); // round up to MB alignment
+ }
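+
+    // Worked example (illustrative values): allocSize = 1000 returns the first
+    // bucket >= 1000, i.e. 1024. allocSize = 6MB exceeds every scanned bucket
+    // and is rounded up to a 1MB boundary: 6MB stays 6MB, while 6MB + 1 byte
+    // becomes 7MB.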
+
+ int RecordStoreV1Base::bucket(int size) {
+ for ( int i = 0; i < Buckets; i++ ) {
+ if ( bucketSizes[i] > size ) {
+ // Return the first bucket sized _larger_ than the requested size.
+ return i;
+ }
+ }
+ return MaxBucket;
+ }
+
+ void RecordStoreV1Base::_paddingFits( OperationContext* txn ) {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+ double x = max(1.0, _details->paddingFactor() - 0.001 );
+ _details->setPaddingFactor( txn, x );
+ }
+ }
+
+ void RecordStoreV1Base::_paddingTooSmall( OperationContext* txn ) {
+ MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
+            /* the more indexes we have, the higher the cost of a move, so we take that
+               into account here. note that on a move, insert() calls _paddingFits(), so
+               for example with no other inserts and nIndexes = 1 we have
+               .001*4 - .001, i.e. a 3:1 ratio of non-moves to moves -> 75% non-moves.
+               insert-heavy workloads can push this down considerably. further tweaking
+               will be a good idea, but this should be an adequate starting point.
+            */
+ double N = 4; // magic
+ double x = min(2.0,_details->paddingFactor() + (0.001 * N));
+ _details->setPaddingFactor( txn, x );
+ }
+ }
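+
+    // Worked example of the arithmetic in the comment above (illustrative):
+    // starting at paddingFactor = 1.0, one move adds 0.001 * 4 = 0.004 and the
+    // accompanying _paddingFits() subtracts 0.001, so roughly three further
+    // in-place updates bring the factor back to 1.0 -- the 3:1 equilibrium
+    // ratio of non-moves to moves. Both adjustments are sampled via
+    // MONGO_SOMETIMES, so the factor converges statistically rather than on
+    // every operation.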
+
+ Status RecordStoreV1Base::setCustomOption( OperationContext* txn,
+ const BSONElement& option,
+ BSONObjBuilder* info ) {
+ if ( str::equals( "usePowerOf2Sizes", option.fieldName() ) ) {
+ bool oldPowerOf2 = _details->isUserFlagSet( Flag_UsePowerOf2Sizes );
+ bool newPowerOf2 = option.trueValue();
+
+ if ( oldPowerOf2 != newPowerOf2 ) {
+ // change userFlags
+ info->appendBool( "usePowerOf2Sizes_old", oldPowerOf2 );
+
+ if ( newPowerOf2 )
+ _details->setUserFlag( txn, Flag_UsePowerOf2Sizes );
+ else
+ _details->clearUserFlag( txn, Flag_UsePowerOf2Sizes );
+
+ info->appendBool( "usePowerOf2Sizes_new", newPowerOf2 );
+ }
+
+ return Status::OK();
+ }
+
+ return Status( ErrorCodes::InvalidOptions,
+ str::stream() << "no such option: " << option.fieldName() );
+ }
+}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
new file mode 100644
index 00000000000..72466c2b645
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
@@ -0,0 +1,303 @@
+// record_store_v1_base.h
+
+/**
+* Copyright (C) 2013-2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+#include "mongo/db/storage/record_store.h"
+
+namespace mongo {
+
+ class DeletedRecord;
+ class DocWriter;
+ class ExtentManager;
+ class Record;
+ class OperationContext;
+
+ struct Extent;
+
+ class RecordStoreV1MetaData {
+ public:
+ virtual ~RecordStoreV1MetaData(){}
+
+ virtual const DiskLoc& capExtent() const = 0;
+ virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ) = 0;
+
+ virtual const DiskLoc& capFirstNewRecord() const = 0;
+ virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ) = 0;
+
+ bool capLooped() const { return capFirstNewRecord().isValid(); }
+
+ virtual long long dataSize() const = 0;
+ virtual long long numRecords() const = 0;
+
+ virtual void incrementStats( OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement ) = 0;
+
+        virtual void setStats( OperationContext* txn,
+                               long long dataSize,
+                               long long numRecords ) = 0;
+
+ virtual const DiskLoc& deletedListEntry( int bucket ) const = 0;
+ virtual void setDeletedListEntry( OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc ) = 0;
+ virtual void orphanDeletedList(OperationContext* txn) = 0;
+
+ virtual const DiskLoc& firstExtent( OperationContext* txn ) const = 0;
+ virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ) = 0;
+
+ virtual const DiskLoc& lastExtent( OperationContext* txn ) const = 0;
+ virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ) = 0;
+
+ virtual bool isCapped() const = 0;
+
+ virtual bool isUserFlagSet( int flag ) const = 0;
+ virtual int userFlags() const = 0;
+ virtual bool setUserFlag( OperationContext* txn, int flag ) = 0;
+ virtual bool clearUserFlag( OperationContext* txn, int flag ) = 0;
+ virtual bool replaceUserFlags( OperationContext* txn, int flags ) = 0;
+
+ virtual int lastExtentSize( OperationContext* txn) const = 0;
+ virtual void setLastExtentSize( OperationContext* txn, int newMax ) = 0;
+
+ virtual long long maxCappedDocs() const = 0;
+
+ virtual double paddingFactor() const = 0;
+
+ virtual void setPaddingFactor( OperationContext* txn, double paddingFactor ) = 0;
+
+ };
+
+ class RecordStoreV1Base : public RecordStore {
+ public:
+
+ static const int Buckets;
+ static const int MaxBucket;
+
+ static const int bucketSizes[];
+
+ enum UserFlags {
+ Flag_UsePowerOf2Sizes = 1 << 0
+ };
+
+ // ------------
+
+ class IntraExtentIterator;
+
+ /**
+ * @param details - takes ownership
+ * @param em - does NOT take ownership
+ */
+ RecordStoreV1Base( const StringData& ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes );
+
+ virtual ~RecordStoreV1Base();
+
+ virtual long long dataSize() const { return _details->dataSize(); }
+ virtual long long numRecords() const { return _details->numRecords(); }
+
+ virtual int64_t storageSize( OperationContext* txn,
+ BSONObjBuilder* extraInfo = NULL,
+ int level = 0 ) const;
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+
+ void deleteRecord( OperationContext* txn,
+ const DiskLoc& dl );
+
+ StatusWith<DiskLoc> insertRecord( OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota );
+
+ StatusWith<DiskLoc> insertRecord( OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota );
+
+ virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn,
+ const DiskLoc& oldLocation,
+ const char* data,
+ int len,
+ bool enforceQuota,
+ UpdateMoveNotifier* notifier );
+
+ virtual Status updateWithDamages( OperationContext* txn,
+ const DiskLoc& loc,
+                                          const char* damageSource,
+ const mutablebson::DamageVector& damages );
+
+ virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const;
+
+ void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota );
+
+ virtual Status validate( OperationContext* txn,
+ bool full, bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results, BSONObjBuilder* output ) const;
+
+ virtual void appendCustomStats( OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale ) const;
+
+ virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const;
+
+ const RecordStoreV1MetaData* details() const { return _details.get(); }
+
+ /**
+ * @return the actual size to create
+         *         will be >= minRecordSize
+ * based on padding and any other flags
+ */
+ int getRecordAllocationSize( int minRecordSize ) const;
+
+ DiskLoc getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const;
+
+ DiskLoc getNextRecord( OperationContext* txn, const DiskLoc& loc ) const;
+ DiskLoc getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const;
+
+ DiskLoc getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
+ DiskLoc getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
+
+ /* @return the size for an allocated record quantized to 1/16th of the BucketSize.
+ @param allocSize requested size to allocate
+ The returned size will be greater than or equal to 'allocSize'.
+ */
+ static int quantizeAllocationSpace(int allocSize);
+
+ /**
+ * Quantize 'allocSize' to the nearest bucketSize (or nearest 1mb boundary for large sizes).
+ */
+ static int quantizePowerOf2AllocationSpace(int allocSize);
+
+ /* return which "deleted bucket" for this size object */
+ static int bucket(int size);
+
+ virtual Status setCustomOption( OperationContext* txn,
+ const BSONElement& option,
+ BSONObjBuilder* info = NULL );
+ protected:
+
+ virtual Record* recordFor( const DiskLoc& loc ) const;
+
+ const DeletedRecord* deletedRecordFor( const DiskLoc& loc ) const;
+
+ virtual bool isCapped() const = 0;
+
+ virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota ) = 0;
+
+ // TODO: document, remove, what have you
+ virtual void addDeletedRec( OperationContext* txn, const DiskLoc& dloc) = 0;
+
+ // TODO: another sad one
+ virtual DeletedRecord* drec( const DiskLoc& loc ) const;
+
+ // just a wrapper for _extentManager->getExtent( loc );
+ Extent* _getExtent( OperationContext* txn, const DiskLoc& loc ) const;
+
+ DiskLoc _getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const;
+
+ DiskLoc _getNextRecord( OperationContext* txn, const DiskLoc& loc ) const;
+ DiskLoc _getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const;
+
+ DiskLoc _getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
+ DiskLoc _getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const;
+
+ /**
+ * finds the first suitable DiskLoc for data
+ * will return the DiskLoc of a newly created DeletedRecord
+ */
+ DiskLoc _findFirstSpot( OperationContext* txn, const DiskLoc& extDiskLoc, Extent* e );
+
+ /** add a record to the end of the linked list chain within this extent.
+ require: you must have already declared write intent for the record header.
+ */
+ void _addRecordToRecListInExtent(OperationContext* txn, Record* r, DiskLoc loc);
+
+ void _paddingTooSmall( OperationContext* txn );
+ void _paddingFits( OperationContext* txn );
+
+ /**
+ * internal
+ * doesn't check inputs or change padding
+ */
+ StatusWith<DiskLoc> _insertRecord( OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota );
+
+ scoped_ptr<RecordStoreV1MetaData> _details;
+ ExtentManager* _extentManager;
+ bool _isSystemIndexes;
+
+ friend class RecordStoreV1RepairIterator;
+ };
+
+ /**
+ * Iterates over all records within a single extent.
+ *
+ * EOF at end of extent, even if there are more extents.
+ */
+ class RecordStoreV1Base::IntraExtentIterator : public RecordIterator {
+ public:
+ IntraExtentIterator(OperationContext* txn,
+ DiskLoc start,
+ const RecordStoreV1Base* rs,
+ bool forward = true)
+ : _txn(txn), _curr(start), _rs(rs), _forward(forward) {}
+
+ virtual bool isEOF() { return _curr.isNull(); }
+
+ virtual DiskLoc curr() { return _curr; }
+
+        virtual DiskLoc getNext();
+
+ virtual void invalidate(const DiskLoc& dl);
+
+ virtual void prepareToYield() {}
+
+ virtual bool recoverFromYield() { return true; }
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const { return _rs->dataFor(loc); }
+
+ private:
+ virtual const Record* recordFor( const DiskLoc& loc ) const { return _rs->recordFor(loc); }
+ OperationContext* _txn;
+ DiskLoc _curr;
+ const RecordStoreV1Base* _rs;
+ bool _forward;
+ };
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp
new file mode 100644
index 00000000000..c8524c76e22
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp
@@ -0,0 +1,717 @@
+// record_store_v1_capped.cpp
+
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h"
+
+#include "mongo/db/operation_context_impl.h"
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h"
+#include "mongo/util/mmap.h"
+#include "mongo/util/mongoutils/str.h"
+
+/*
+ capped collection layout
+
+ d's below won't exist if things align perfectly:
+
+ extent1 -> extent2 -> extent3
+ ------------------- ----------------------- ---------------------
+ d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d
+ ^ ^
+ oldest newest
+
+ ^cappedFirstDeletedInCurExtent()
+ ^cappedLastDelRecLastExtent()
+ ^cappedListOfAllDeletedRecords()
+*/
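+
+/* Reading the pointers above: cappedListOfAllDeletedRecords() heads the single
+   chain of every 'd' across all extents; cappedLastDelRecLastExtent() marks
+   the last 'd' in the extent preceding the current capExtent; and
+   cappedFirstDeletedInCurExtent() is the link right after it -- the first 'd'
+   inside capExtent itself (see the accessors below).
+*/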
+
+#define DDD(x)
+
+namespace mongo {
+
+ CappedRecordStoreV1::CappedRecordStoreV1( OperationContext* txn,
+ CappedDocumentDeleteCallback* collection,
+ const StringData& ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes )
+ : RecordStoreV1Base( ns, details, em, isSystemIndexes ),
+ _deleteCallback( collection ) {
+
+ DiskLoc extentLoc = details->firstExtent(txn);
+ while ( !extentLoc.isNull() ) {
+ _extentAdvice.push_back( _extentManager->cacheHint( extentLoc,
+ ExtentManager::Sequential ) );
+ Extent* extent = em->getExtent( extentLoc );
+ extentLoc = extent->xnext;
+ }
+
+ // this is for VERY VERY old versions of capped collections
+ cappedCheckMigrate(txn);
+ }
+
+ CappedRecordStoreV1::~CappedRecordStoreV1() {
+ }
+
+ StatusWith<DiskLoc> CappedRecordStoreV1::allocRecord( OperationContext* txn,
+ int lenToAlloc,
+ bool enforceQuota ) {
+ {
+ // align very slightly.
+ lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;
+ }
+
+ if ( lenToAlloc > theCapExtent()->length ) {
+ // the extent check is a way to try and improve performance
+ // since we have to iterate all the extents (for now) to get
+ // storage size
+ if ( lenToAlloc > storageSize(txn) ) {
+ return StatusWith<DiskLoc>( ErrorCodes::BadValue,
+ mongoutils::str::stream()
+ << "document is larger than capped size "
+ << lenToAlloc << " > " << storageSize(txn),
+ 16328 );
+ }
+
+ }
+ DiskLoc loc;
+ { // do allocation
+
+ // signal done allocating new extents.
+ if ( !cappedLastDelRecLastExtent().isValid() )
+ setLastDelRecLastExtent( txn, DiskLoc() );
+
+ invariant( lenToAlloc < 400000000 );
+ int passes = 0;
+ int maxPasses = ( lenToAlloc / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
+ if ( maxPasses < 5000 ) {
+                // this is for backwards safety since 5000 was the old value
+ maxPasses = 5000;
+ }
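+            // e.g. lenToAlloc = 300000 gives maxPasses = 10002, while a 1KB
+            // allocation computes 36 and is clamped up to 5000 (illustrative).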
+
+ // delete records until we have room and the max # objects limit achieved.
+
+            /* this invariant fails after a rename -- that is ok, but it must stay commented out */
+ //invariant( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ DiskLoc firstEmptyExtent;
+ while ( 1 ) {
+ if ( _details->numRecords() < _details->maxCappedDocs() ) {
+ loc = __capAlloc( txn, lenToAlloc );
+ if ( !loc.isNull() )
+ break;
+ }
+
+ // If on first iteration through extents, don't delete anything.
+ if ( !_details->capFirstNewRecord().isValid() ) {
+ advanceCapExtent( txn, _ns );
+
+ if ( _details->capExtent() != _details->firstExtent(txn) )
+ _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
+ // else signal done with first iteration through extents.
+ continue;
+ }
+
+ if ( !_details->capFirstNewRecord().isNull() &&
+ theCapExtent()->firstRecord == _details->capFirstNewRecord() ) {
+ // We've deleted all records that were allocated on the previous
+ // iteration through this extent.
+ advanceCapExtent( txn, _ns );
+ continue;
+ }
+
+ if ( theCapExtent()->firstRecord.isNull() ) {
+ if ( firstEmptyExtent.isNull() )
+ firstEmptyExtent = _details->capExtent();
+ advanceCapExtent( txn, _ns );
+ if ( firstEmptyExtent == _details->capExtent() ) {
+ _maybeComplain( txn, lenToAlloc );
+ return StatusWith<DiskLoc>( ErrorCodes::InternalError,
+ "no space in capped collection" );
+ }
+ continue;
+ }
+
+ DiskLoc fr = theCapExtent()->firstRecord;
+ Status status = _deleteCallback->aboutToDeleteCapped( txn, fr );
+ if ( !status.isOK() )
+ return StatusWith<DiskLoc>( status );
+ deleteRecord( txn, fr );
+
+ compact(txn);
+ if( ++passes > maxPasses ) {
+ StringBuilder sb;
+ sb << "passes >= maxPasses in CappedRecordStoreV1::cappedAlloc: ns: " << _ns
+ << ", lenToAlloc: " << lenToAlloc
+ << ", maxPasses: " << maxPasses
+ << ", _maxDocsInCapped: " << _details->maxCappedDocs()
+ << ", nrecords: " << _details->numRecords()
+ << ", datasize: " << _details->dataSize();
+
+ return StatusWith<DiskLoc>( ErrorCodes::InternalError, sb.str() );
+ }
+ }
+
+ // Remember first record allocated on this iteration through capExtent.
+ if ( _details->capFirstNewRecord().isValid() && _details->capFirstNewRecord().isNull() )
+ _details->setCapFirstNewRecord( txn, loc );
+ }
+
+ invariant( !loc.isNull() );
+
+ // possibly slice up if we've allocated too much space
+
+ DeletedRecord *r = drec( loc );
+
+ /* note we want to grab from the front so our next pointers on disk tend
+ to go in a forward direction which is important for performance. */
+ int regionlen = r->lengthWithHeaders();
+ invariant( r->extentOfs() < loc.getOfs() );
+
+ int left = regionlen - lenToAlloc;
+
+ /* split off some for further use. */
+ txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
+ DiskLoc newDelLoc = loc;
+ newDelLoc.inc(lenToAlloc);
+ DeletedRecord* newDel = drec( newDelLoc );
+ DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
+ newDelW->extentOfs() = r->extentOfs();
+ newDelW->lengthWithHeaders() = left;
+ newDelW->nextDeleted().Null();
+
+ addDeletedRec(txn, newDelLoc);
+
+ return StatusWith<DiskLoc>( loc );
+ }
+
+ Status CappedRecordStoreV1::truncate(OperationContext* txn) {
+ setLastDelRecLastExtent( txn, DiskLoc() );
+ setListOfAllDeletedRecords( txn, DiskLoc() );
+
+ // preserve firstExtent/lastExtent
+ _details->setCapExtent( txn, _details->firstExtent(txn) );
+ _details->setStats( txn, 0, 0 );
+ // preserve lastExtentSize
+ // nIndexes preserve 0
+ // capped preserve true
+ // max preserve
+ _details->setPaddingFactor( txn, 1.0 );
+ _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
+ setLastDelRecLastExtent( txn, DiskLoc().setInvalid() );
+ // dataFileVersion preserve
+ // indexFileVersion preserve
+
+ // Reset all existing extents and recreate the deleted list.
+ Extent* ext;
+ for( DiskLoc extLoc = _details->firstExtent(txn);
+ !extLoc.isNull();
+ extLoc = ext->xnext ) {
+ ext = _extentManager->getExtent(extLoc);
+
+ txn->recoveryUnit()->writing( &ext->firstRecord )->Null();
+ txn->recoveryUnit()->writing( &ext->lastRecord )->Null();
+
+ addDeletedRec( txn, _findFirstSpot( txn, extLoc, ext ) );
+ }
+
+ return Status::OK();
+ }
+
+ void CappedRecordStoreV1::temp_cappedTruncateAfter( OperationContext* txn,
+ DiskLoc end,
+ bool inclusive ) {
+ cappedTruncateAfter( txn, _ns.c_str(), end, inclusive );
+ }
+
+ /* combine adjacent deleted records *for the current extent* of the capped collection
+
+ this is O(n^2) but we call it for capped tables where typically n==1 or 2!
+ (or 3...there will be a little unused sliver at the end of the extent.)
+ */
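+    // Merge example (illustrative offsets): adjacent deleted records at
+    // { a:0, ofs:0x5000, len:0x100 } and { a:0, ofs:0x5100, len:0x80 } satisfy
+    // a.getOfs() + len(a) == b.getOfs(), so they are folded into one
+    // DeletedRecord at 0x5000 with lengthWithHeaders 0x180 before being
+    // re-added to the deleted list.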
+ void CappedRecordStoreV1::compact(OperationContext* txn) {
+ DDD( "CappedRecordStoreV1::compact enter" );
+
+ vector<DiskLoc> drecs;
+
+ // Pull out capExtent's DRs from deletedList
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && inCapExtent( i ); i = deletedRecordFor( i )->nextDeleted() ) {
+ DDD( "\t" << i );
+ drecs.push_back( i );
+ }
+
+ setFirstDeletedInCurExtent( txn, i );
+
+ std::sort( drecs.begin(), drecs.end() );
+ DDD( "\t drecs.size(): " << drecs.size() );
+
+ vector<DiskLoc>::const_iterator j = drecs.begin();
+ invariant( j != drecs.end() );
+ DiskLoc a = *j;
+ while ( 1 ) {
+ j++;
+ if ( j == drecs.end() ) {
+ DDD( "\t compact adddelrec" );
+ addDeletedRec(txn, a);
+ break;
+ }
+ DiskLoc b = *j;
+ while ( a.a() == b.a() &&
+ a.getOfs() + drec( a )->lengthWithHeaders() == b.getOfs() ) {
+
+ // a & b are adjacent. merge.
+ txn->recoveryUnit()->writingInt( drec(a)->lengthWithHeaders() ) += drec(b)->lengthWithHeaders();
+ j++;
+ if ( j == drecs.end() ) {
+ DDD( "\t compact adddelrec2" );
+ addDeletedRec(txn, a);
+ return;
+ }
+ b = *j;
+ }
+ DDD( "\t compact adddelrec3" );
+ addDeletedRec(txn, a);
+ a = b;
+ }
+
+ }
+
+ const DiskLoc &CappedRecordStoreV1::cappedFirstDeletedInCurExtent() const {
+ if ( cappedLastDelRecLastExtent().isNull() )
+ return cappedListOfAllDeletedRecords();
+ else
+ return drec(cappedLastDelRecLastExtent())->nextDeleted();
+ }
+
+ void CappedRecordStoreV1::setFirstDeletedInCurExtent( OperationContext* txn,
+ const DiskLoc& loc ) {
+ if ( cappedLastDelRecLastExtent().isNull() )
+ setListOfAllDeletedRecords( txn, loc );
+ else
+ *txn->recoveryUnit()->writing( &drec(cappedLastDelRecLastExtent())->nextDeleted() ) = loc;
+ }
+
+ void CappedRecordStoreV1::cappedCheckMigrate(OperationContext* txn) {
+ // migrate old RecordStoreV1MetaData format
+ if ( _details->capExtent().a() == 0 && _details->capExtent().getOfs() == 0 ) {
+ _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
+ // put all the DeletedRecords in cappedListOfAllDeletedRecords()
+ for ( int i = 1; i < Buckets; ++i ) {
+ DiskLoc first = _details->deletedListEntry( i );
+ if ( first.isNull() )
+ continue;
+ DiskLoc last = first;
+ for (; !drec(last)->nextDeleted().isNull(); last = drec(last)->nextDeleted() );
+ *txn->recoveryUnit()->writing(&drec(last)->nextDeleted()) = cappedListOfAllDeletedRecords();
+ setListOfAllDeletedRecords( txn, first );
+ _details->setDeletedListEntry(txn, i, DiskLoc());
+ }
+ // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above
+
+ // Last, in case we're killed before getting here
+ _details->setCapExtent( txn, _details->firstExtent(txn) );
+ }
+ }
+
+ bool CappedRecordStoreV1::inCapExtent( const DiskLoc &dl ) const {
+ invariant( !dl.isNull() );
+
+ if ( dl.a() != _details->capExtent().a() )
+ return false;
+
+ if ( dl.getOfs() < _details->capExtent().getOfs() )
+ return false;
+
+ const Extent* e = theCapExtent();
+ int end = _details->capExtent().getOfs() + e->length;
+ return dl.getOfs() <= end;
+ }
+
+ bool CappedRecordStoreV1::nextIsInCapExtent( const DiskLoc &dl ) const {
+ invariant( !dl.isNull() );
+ DiskLoc next = drec(dl)->nextDeleted();
+ if ( next.isNull() )
+ return false;
+ return inCapExtent( next );
+ }
+
+ void CappedRecordStoreV1::advanceCapExtent( OperationContext* txn, const StringData& ns ) {
+ // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent
+ // (or DiskLoc() if new capExtent == firstExtent)
+ if ( _details->capExtent() == _details->lastExtent(txn) )
+ setLastDelRecLastExtent( txn, DiskLoc() );
+ else {
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ for (; !i.isNull() && nextIsInCapExtent( i ); i = drec(i)->nextDeleted() );
+ setLastDelRecLastExtent( txn, i );
+ }
+
+ _details->setCapExtent( txn,
+ theCapExtent()->xnext.isNull() ? _details->firstExtent(txn)
+ : theCapExtent()->xnext );
+
+        /* this isn't true if a collection has been renamed -- that is ok; it is just used for diagnostics */
+ //dassert( theCapExtent()->ns == ns );
+
+ theCapExtent()->assertOk();
+ _details->setCapFirstNewRecord( txn, DiskLoc() );
+ }
+
+ DiskLoc CappedRecordStoreV1::__capAlloc( OperationContext* txn, int len ) {
+ DiskLoc prev = cappedLastDelRecLastExtent();
+ DiskLoc i = cappedFirstDeletedInCurExtent();
+ DiskLoc ret;
+ for (; !i.isNull() && inCapExtent( i ); prev = i, i = drec(i)->nextDeleted() ) {
+ // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
+ // so make sure there's space to create a DR at the end.
+ if ( drec(i)->lengthWithHeaders() >= len + 24 ) {
+ ret = i;
+ break;
+ }
+ }
+
+ /* unlink ourself from the deleted list */
+ if ( !ret.isNull() ) {
+ if ( prev.isNull() )
+ setListOfAllDeletedRecords( txn, drec(ret)->nextDeleted() );
+ else
+ *txn->recoveryUnit()->writing(&drec(prev)->nextDeleted()) = drec(ret)->nextDeleted();
+ *txn->recoveryUnit()->writing(&drec(ret)->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
+ invariant( drec(ret)->extentOfs() < ret.getOfs() );
+ }
+
+ return ret;
+ }
+
+ void CappedRecordStoreV1::cappedTruncateLastDelUpdate(OperationContext* txn) {
+ if ( _details->capExtent() == _details->firstExtent(txn) ) {
+ // Only one extent of the collection is in use, so there
+ // is no deleted record in a previous extent, so nullify
+ // cappedLastDelRecLastExtent().
+ setLastDelRecLastExtent( txn, DiskLoc() );
+ }
+ else {
+ // Scan through all deleted records in the collection
+ // until the last deleted record for the extent prior
+ // to the new capExtent is found. Then set
+ // cappedLastDelRecLastExtent() to that deleted record.
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for( ;
+ !drec(i)->nextDeleted().isNull() &&
+ !inCapExtent( drec(i)->nextDeleted() );
+ i = drec(i)->nextDeleted() );
+ // In our capped storage model, every extent must have at least one
+ // deleted record. Here we check that 'i' is not the last deleted
+ // record. (We expect that there will be deleted records in the new
+ // capExtent as well.)
+ invariant( !drec(i)->nextDeleted().isNull() );
+ setLastDelRecLastExtent( txn, i );
+ }
+ }
+
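+ // Implements truncation by repeatedly deleting the newest record until 'end'
+ // is reached (deleting 'end' too when 'inclusive' is set), moving capExtent
+ // and capFirstNewRecord backwards as extents empty out. The collection may
+ // never be emptied completely; uassert 13415 fires if that is attempted.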
+ void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* txn,
+ const char* ns,
+ DiskLoc end,
+ bool inclusive) {
+ invariant( cappedLastDelRecLastExtent().isValid() );
+
+ // We iteratively remove the newest document until the newest document
+ // is 'end', then we remove 'end' if requested.
+ bool foundLast = false;
+ while( 1 ) {
+ if ( foundLast ) {
+ // 'end' has been found and removed, so break.
+ break;
+ }
+ txn->recoveryUnit()->commitIfNeeded();
+ // 'curr' will point to the newest document in the collection.
+ DiskLoc curr = theCapExtent()->lastRecord;
+ invariant( !curr.isNull() );
+ if ( curr == end ) {
+ if ( inclusive ) {
+ // 'end' has been found, so break next iteration.
+ foundLast = true;
+ }
+ else {
+ // 'end' has been found, so break.
+ break;
+ }
+ }
+
+ // TODO The algorithm used in this function cannot generate an
+ // empty collection, but we could call emptyCappedCollection() in
+ // this case instead of asserting.
+ uassert( 13415, "emptying the collection is not allowed", _details->numRecords() > 1 );
+
+ // Delete the newest record, and coalesce the new deleted
+ // record with existing deleted records.
+ Status status = _deleteCallback->aboutToDeleteCapped( txn, curr );
+ uassertStatusOK( status );
+ deleteRecord( txn, curr );
+ compact(txn);
+
+ // This is the case where we have not yet had to remove any
+ // documents to make room for other documents, and we are allocating
+ // documents from free space in fresh extents instead of reusing
+ // space from familiar extents.
+ if ( !_details->capLooped() ) {
+
+ // We just removed the last record from the 'capExtent', and
+ // the 'capExtent' can't be empty, so we set 'capExtent' to
+ // capExtent's prev extent.
+ if ( theCapExtent()->lastRecord.isNull() ) {
+ invariant( !theCapExtent()->xprev.isNull() );
+ // NOTE Because we didn't delete the last document, and
+ // capLooped() is false, capExtent is not the first extent
+ // so xprev will be nonnull.
+ _details->setCapExtent( txn, theCapExtent()->xprev );
+ theCapExtent()->assertOk();
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate(txn);
+ }
+ continue;
+ }
+
+ // This is the case where capLooped() is true, and we just deleted
+ // from capExtent, and we just deleted capFirstNewRecord, which was
+ // the last record on the fresh side of capExtent.
+ // NOTE In this comparison, curr and potentially capFirstNewRecord
+ // may point to invalid data, but we can still compare the
+ // references themselves.
+ if ( curr == _details->capFirstNewRecord() ) {
+
+ // Set 'capExtent' to the first nonempty extent prior to the
+ // initial capExtent. There must be such an extent because we
+ // have not deleted the last document in the collection. It is
+ // possible that all extents other than the capExtent are empty.
+ // In this case we will keep the initial capExtent and specify
+ // that all records contained within are on the fresh rather than
+ // stale side of the extent.
+ DiskLoc newCapExtent = _details->capExtent();
+ do {
+ // Find the previous extent, looping if necessary.
+ newCapExtent = ( newCapExtent == _details->firstExtent(txn) ) ?
+ _details->lastExtent(txn) :
+ _extentManager->getExtent(newCapExtent)->xprev;
+ _extentManager->getExtent(newCapExtent)->assertOk();
+ }
+ while ( _extentManager->getExtent(newCapExtent)->firstRecord.isNull() );
+ _details->setCapExtent( txn, newCapExtent );
+
+ // Place all documents in the new capExtent on the fresh side
+ // of the capExtent by setting capFirstNewRecord to the first
+ // document in the new capExtent.
+ _details->setCapFirstNewRecord( txn, theCapExtent()->firstRecord );
+
+ // update cappedLastDelRecLastExtent()
+ cappedTruncateLastDelUpdate(txn);
+ }
+ }
+ }
+
+ const DiskLoc& CappedRecordStoreV1::cappedListOfAllDeletedRecords() const {
+ return _details->deletedListEntry(0);
+ }
+
+ void CappedRecordStoreV1::setListOfAllDeletedRecords( OperationContext* txn,
+ const DiskLoc& loc ) {
+ return _details->setDeletedListEntry(txn, 0, loc);
+ }
+
+ const DiskLoc& CappedRecordStoreV1::cappedLastDelRecLastExtent() const {
+ return _details->deletedListEntry(1);
+ }
+
+ void CappedRecordStoreV1::setLastDelRecLastExtent( OperationContext* txn,
+ const DiskLoc& loc ) {
+ return _details->setDeletedListEntry(txn, 1, loc);
+ }
+
+ Extent* CappedRecordStoreV1::theCapExtent() const {
+ return _extentManager->getExtent(_details->capExtent());
+ }
+
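+ // While the collection has not yet looped (cappedLastDelRecLastExtent() is
+ // still invalid) new deleted records are appended to the end of the global
+ // deleted list; afterwards they are pushed onto the front of the current
+ // extent's portion of the list, since compact() always runs next and reorders
+ // it anyway.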
+ void CappedRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) {
+ DeletedRecord* d = txn->recoveryUnit()->writing( drec( dloc ) );
+
+ DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl;
+ if ( !cappedLastDelRecLastExtent().isValid() ) {
+ // Initial extent allocation. Insert at end.
+ d->nextDeleted() = DiskLoc();
+ if ( cappedListOfAllDeletedRecords().isNull() )
+ setListOfAllDeletedRecords( txn, dloc );
+ else {
+ DiskLoc i = cappedListOfAllDeletedRecords();
+ for (; !drec(i)->nextDeleted().isNull(); i = drec(i)->nextDeleted() )
+ ;
+ *txn->recoveryUnit()->writing(&drec(i)->nextDeleted()) = dloc;
+ }
+ }
+ else {
+ d->nextDeleted() = cappedFirstDeletedInCurExtent();
+ setFirstDeletedInCurExtent( txn, dloc );
+ // always compact() after this so order doesn't matter
+ }
+ }
+
+ RecordIterator* CappedRecordStoreV1::getIterator( OperationContext* txn,
+ const DiskLoc& start,
+ bool tailable,
+ const CollectionScanParams::Direction& dir) const {
+ return new CappedRecordStoreV1Iterator( txn, this, start, tailable, dir );
+ }
+
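+ // Returns one iterator per contiguous run of records. Once the collection has
+ // looped, the runs are emitted oldest-first: the "old" portion of capExtent,
+ // then each subsequent extent (wrapping past the last extent back to the
+ // first), and finally the "new" records starting at capFirstNewRecord.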
+ vector<RecordIterator*> CappedRecordStoreV1::getManyIterators( OperationContext* txn ) const {
+ OwnedPointerVector<RecordIterator> iterators;
+
+ if (!_details->capLooped()) {
+ // if we haven't looped yet, just spit out all extents (same as non-capped impl)
+ const Extent* ext;
+ for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
+ ext = _getExtent(txn, extLoc);
+ if (ext->firstRecord.isNull())
+ continue;
+
+ iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn,
+ ext->firstRecord,
+ this));
+ }
+ }
+ else {
+ // if we've looped we need to iterate the extents, starting and ending with the
+ // capExtent
+ const DiskLoc capExtent = details()->capExtent();
+ invariant(!capExtent.isNull());
+ invariant(capExtent.isValid());
+
+ // First do the "old" portion of capExtent if there is any
+ DiskLoc extLoc = capExtent;
+ {
+ const Extent* ext = _getExtent(txn, extLoc);
+ if (ext->firstRecord != details()->capFirstNewRecord()) {
+ // this means there is old data in capExtent
+ iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn,
+ ext->firstRecord,
+ this));
+ }
+
+ extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext;
+ }
+
+ // Next handle all the other extents
+ while (extLoc != capExtent) {
+ const Extent* ext = _getExtent(txn, extLoc);
+ iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn,
+ ext->firstRecord,
+ this));
+
+ extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext;
+ }
+
+ // Finally handle the "new" data in the capExtent
+ iterators.push_back(
+ new RecordStoreV1Base::IntraExtentIterator(txn,
+ details()->capFirstNewRecord(),
+ this));
+ }
+
+ return iterators.release();
+ }
+
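+ // Capped record stores report compactSupported() == false, so this entry
+ // point should never be reached.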
+ Status CappedRecordStoreV1::compact( OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats ) {
+ invariant(false);
+ }
+
+ void CappedRecordStoreV1::_maybeComplain( OperationContext* txn, int len ) const {
+ RARELY {
+ std::stringstream buf;
+ buf << "couldn't make room for record len: " << len << " in capped ns " << _ns << '\n';
+ buf << "numRecords: " << numRecords() << '\n';
+ int i = 0;
+ for ( DiskLoc e = _details->firstExtent(txn);
+ !e.isNull();
+ e = _extentManager->getExtent( e )->xnext, ++i ) {
+ buf << " Extent " << i;
+ if ( e == _details->capExtent() )
+ buf << " (capExtent)";
+ buf << ' ' << e;
+ buf << '\n';
+
+ buf << " magic: " << hex << _extentManager->getExtent( e )->magic << dec
+ << " extent->ns: " << _extentManager->getExtent( e )->nsDiagnostic.toString()
+ << '\n';
+ buf << " fr: " << _extentManager->getExtent( e )->firstRecord.toString()
+ << " lr: " << _extentManager->getExtent( e )->lastRecord.toString()
+ << " extent->len: " << _extentManager->getExtent( e )->length << '\n';
+ }
+
+ warning() << buf.str();
+
+ // assume it is unusually large record; if not, something is broken
+ fassert( 17438, len * 5 > _details->lastExtentSize(txn) );
+ }
+ }
+
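+ // Scans forward through the extent chain from 'startExtent' (or from the
+ // first extent when none is given) and returns the first record of the first
+ // non-empty extent; lastRecord below is the mirror image, scanning backwards.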
+ DiskLoc CappedRecordStoreV1::firstRecord( OperationContext* txn,
+ const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? _details->firstExtent(txn) : startExtent;
+ !i.isNull();
+ i = _extentManager->getExtent( i )->xnext ) {
+
+ Extent* e = _extentManager->getExtent( i );
+
+ if ( !e->firstRecord.isNull() )
+ return e->firstRecord;
+ }
+ return DiskLoc();
+ }
+
+ DiskLoc CappedRecordStoreV1::lastRecord( OperationContext* txn,
+ const DiskLoc &startExtent ) const {
+ for (DiskLoc i = startExtent.isNull() ? _details->lastExtent(txn) : startExtent;
+ !i.isNull();
+ i = _extentManager->getExtent( i )->xprev ) {
+
+ Extent* e = _extentManager->getExtent( i );
+ if ( !e->lastRecord.isNull() )
+ return e->lastRecord;
+ }
+ return DiskLoc();
+ }
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h
new file mode 100644
index 00000000000..4422b5d451b
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h
@@ -0,0 +1,139 @@
+// record_store_v1_capped.h
+
+/**
+* Copyright (C) 2013 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/base/owned_pointer_vector.h"
+#include "mongo/db/diskloc.h"
+#include "mongo/db/storage/capped_callback.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_base.h"
+
+namespace mongo {
+
+ class CappedRecordStoreV1 : public RecordStoreV1Base {
+ public:
+ CappedRecordStoreV1( OperationContext* txn,
+ CappedDocumentDeleteCallback* collection,
+ const StringData& ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes );
+
+ virtual ~CappedRecordStoreV1();
+
+ const char* name() const { return "CappedRecordStoreV1"; }
+
+ virtual Status truncate(OperationContext* txn);
+
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ * XXX: this will go away soon, just needed to move for now
+ */
+ virtual void temp_cappedTruncateAfter( OperationContext* txn, DiskLoc end, bool inclusive );
+
+ virtual RecordIterator* getIterator( OperationContext* txn,
+ const DiskLoc& start, bool tailable,
+ const CollectionScanParams::Direction& dir) const;
+
+ virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const;
+
+ virtual bool compactSupported() const { return false; }
+
+ virtual Status compact( OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats );
+
+ // Start from firstExtent by default.
+ DiskLoc firstRecord( OperationContext* txn,
+ const DiskLoc &startExtent = DiskLoc() ) const;
+ // Start from lastExtent by default.
+ DiskLoc lastRecord( OperationContext* txn,
+ const DiskLoc &startExtent = DiskLoc() ) const;
+
+ protected:
+
+ virtual bool isCapped() const { return true; }
+
+ virtual void setCappedDeleteCallback( CappedDocumentDeleteCallback* cb ) {
+ _deleteCallback = cb;
+ }
+
+ virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota );
+
+ virtual void addDeletedRec(OperationContext* txn, const DiskLoc& dloc);
+
+ private:
+ // -- start copy from cap.cpp --
+ void compact(OperationContext* txn);
+ const DiskLoc& cappedFirstDeletedInCurExtent() const;
+ void setFirstDeletedInCurExtent( OperationContext* txn, const DiskLoc& loc );
+ void cappedCheckMigrate(OperationContext* txn);
+ DiskLoc __capAlloc( OperationContext* txn, int len );
+ bool inCapExtent( const DiskLoc &dl ) const;
+ const DiskLoc& cappedListOfAllDeletedRecords() const;
+ const DiskLoc& cappedLastDelRecLastExtent() const;
+ void setListOfAllDeletedRecords( OperationContext* txn, const DiskLoc& loc );
+ void setLastDelRecLastExtent( OperationContext* txn, const DiskLoc& loc );
+ Extent *theCapExtent() const;
+ bool nextIsInCapExtent( const DiskLoc &dl ) const;
+ void advanceCapExtent( OperationContext* txn, const StringData& ns );
+ void cappedTruncateLastDelUpdate(OperationContext* txn);
+
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ */
+ void cappedTruncateAfter(OperationContext* txn,
+ const char* ns,
+ DiskLoc end,
+ bool inclusive);
+
+ void _maybeComplain( OperationContext* txn, int len ) const;
+
+ // -- end copy from cap.cpp --
+
+ CappedDocumentDeleteCallback* _deleteCallback;
+
+ OwnedPointerVector<ExtentManager::CacheHint> _extentAdvice;
+
+ friend class CappedRecordStoreV1Iterator;
+ };
+
+
+}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp
new file mode 100644
index 00000000000..11f7894fe77
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp
@@ -0,0 +1,237 @@
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h"
+
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h"
+
+namespace mongo {
+
+
+ //
+ // Capped collection traversal
+ //
+ CappedRecordStoreV1Iterator::CappedRecordStoreV1Iterator( OperationContext* txn,
+ const CappedRecordStoreV1* collection,
+ const DiskLoc& start, bool tailable,
+ const CollectionScanParams::Direction& dir)
+ : _txn(txn), _recordStore(collection), _curr(start), _tailable(tailable),
+ _direction(dir), _killedByInvalidate(false) {
+
+ if (_curr.isNull()) {
+
+ const RecordStoreV1MetaData* nsd = _recordStore->details();
+
+ // If a start position isn't specified, we choose one based on the scan
+ // direction and the collection's layout.
+ if (CollectionScanParams::FORWARD == _direction) {
+ // Going forwards.
+ if (!nsd->capLooped()) {
+ // If our capped collection doesn't loop around, the first record is easy.
+ _curr = collection->firstRecord(_txn);
+ }
+ else {
+ // Our capped collection has "looped" around.
+ // Copied verbatim from ForwardCappedCursor::init.
+ // TODO ELABORATE
+ _curr = _getExtent( nsd->capExtent() )->firstRecord;
+ if (!_curr.isNull() && _curr == nsd->capFirstNewRecord()) {
+ _curr = _getExtent( nsd->capExtent() )->lastRecord;
+ _curr = nextLoop(_curr);
+ }
+ }
+ }
+ else {
+ // Going backwards
+ if (!nsd->capLooped()) {
+ // Start at the end.
+ _curr = collection->lastRecord(_txn);
+ }
+ else {
+ _curr = _getExtent( nsd->capExtent() )->lastRecord;
+ }
+ }
+ }
+ }
+
+ bool CappedRecordStoreV1Iterator::isEOF() { return _curr.isNull(); }
+
+ DiskLoc CappedRecordStoreV1Iterator::curr() { return _curr; }
+
+ DiskLoc CappedRecordStoreV1Iterator::getNext() {
+ DiskLoc ret = _curr;
+
+ // Move to the next thing.
+ if (!isEOF()) {
+ _prev = _curr;
+ _curr = getNextCapped(_curr);
+ }
+ else if (_tailable && !_prev.isNull()) {
+ // If we're tailable, there COULD have been something inserted even though we were
+ // previously EOF. Look at the next thing from 'prev' and see.
+ DiskLoc newCurr = getNextCapped(_prev);
+
+ if (!newCurr.isNull()) {
+ // There's something new to return. _curr always points to the next thing to
+ // return. Update it, and move _prev to the thing we just returned.
+ _prev = ret = newCurr;
+ _curr = getNextCapped(_prev);
+ }
+ }
+
+ return ret;
+ }
+
+ void CappedRecordStoreV1Iterator::invalidate(const DiskLoc& dl) {
+ if ((_tailable && _curr.isNull() && dl == _prev) || (dl == _curr)) {
+ // In the _tailable case, we're about to kill the DiskLoc that we're tailing. Nothing
+ // that we can possibly do to survive that.
+ //
+ // In the _curr case, we *could* move to the next thing, since there is actually a next
+ // thing, but according to clientcursor.cpp:
+ // "note we cannot advance here. if this condition occurs, writes to the oplog
+ //    have "caught" the reader. skipping ahead, the reader would miss potentially
+ // important data."
+ _curr = _prev = DiskLoc();
+ _killedByInvalidate = true;
+ }
+ }
+
+ void CappedRecordStoreV1Iterator::prepareToYield() {
+ }
+
+ bool CappedRecordStoreV1Iterator::recoverFromYield() {
+ // If invalidate invalidated the DiskLoc we relied on, give up now.
+ if (_killedByInvalidate) {
+ _recordStore = NULL;
+ return false;
+ }
+
+ return true;
+ }
+
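+ // Advances in logical (insertion) order rather than physical order. For a
+ // looped collection a forward scan visits the "old" records of capExtent
+ // first, then the other extents in a wrapped pass, and finally the "new"
+ // records from capFirstNewRecord onward; EOF is hit at capExtent's
+ // lastRecord. The backward case runs the same route in reverse.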
+ DiskLoc CappedRecordStoreV1Iterator::getNextCapped(const DiskLoc& dl) {
+ invariant(!dl.isNull());
+ const RecordStoreV1MetaData* details = _recordStore->details();
+
+ if (CollectionScanParams::FORWARD == _direction) {
+ // If it's not looped, it's easy.
+ if (!_recordStore->details()->capLooped()) {
+ return _getNextRecord( dl );
+ }
+
+ // TODO ELABORATE
+ // EOF.
+ if (dl == _getExtent( details->capExtent() )->lastRecord) {
+ return DiskLoc();
+ }
+
+ DiskLoc ret = nextLoop(dl);
+
+ // If we reached capFirstNewRecord from within the same extent, advance to the next extent.
+ if (ret == details->capFirstNewRecord() && ret != _getExtent( details->capExtent() )->firstRecord) {
+ ret = nextLoop(_getExtent( details->capExtent() )->lastRecord);
+ }
+
+ // If we have just reached the beginning of capExtent, skip to capFirstNewRecord.
+ if (ret == _getExtent( details->capExtent() )->firstRecord) { ret = details->capFirstNewRecord(); }
+
+ return ret;
+ }
+ else {
+ if (!details->capLooped()) { return _getPrevRecord( dl ); }
+
+ // TODO ELABORATE
+ // Last record
+ if (details->capFirstNewRecord() == _getExtent( details->capExtent() )->firstRecord) {
+ if (dl == nextLoop(_getExtent( details->capExtent() )->lastRecord)) {
+ return DiskLoc();
+ }
+ }
+ else {
+ if (dl == _getExtent( details->capExtent() )->firstRecord) { return DiskLoc(); }
+ }
+
+ DiskLoc ret;
+ // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev.
+ if (dl == details->capFirstNewRecord()) {
+ ret = prevLoop(_getExtent( details->capExtent() )->firstRecord);
+ }
+ else {
+ ret = prevLoop(dl);
+ }
+
+ // If we just became last in cap extent, advance past capFirstNewRecord
+ // (We know ext(capExtent)->firstRecord != capFirstNewRecord, since we would
+ // have returned DiskLoc() earlier otherwise.)
+ if (ret == _getExtent( details->capExtent() )->lastRecord) {
+ ret = _getPrevRecord( details->capFirstNewRecord() );
+ }
+
+ return ret;
+ }
+ }
+
+ DiskLoc CappedRecordStoreV1Iterator::nextLoop(const DiskLoc& prev) {
+ // TODO ELABORATE
+ DiskLoc next = _getNextRecord( prev );
+ if (!next.isNull()) {
+ return next;
+ }
+ return _recordStore->firstRecord(_txn);
+ }
+
+ DiskLoc CappedRecordStoreV1Iterator::prevLoop(const DiskLoc& curr) {
+ // TODO ELABORATE
+ DiskLoc prev = _getPrevRecord( curr );
+ if (!prev.isNull()) {
+ return prev;
+ }
+ return _recordStore->lastRecord(_txn);
+ }
+
+ RecordData CappedRecordStoreV1Iterator::dataFor( const DiskLoc& loc ) const {
+ return _recordStore->dataFor( loc );
+ }
+
+ Extent* CappedRecordStoreV1Iterator::_getExtent( const DiskLoc& loc ) {
+ return _recordStore->_extentManager->getExtent( loc );
+ }
+
+ DiskLoc CappedRecordStoreV1Iterator::_getNextRecord( const DiskLoc& loc ) {
+ return _recordStore->getNextRecord( _txn, loc );
+ }
+
+ DiskLoc CappedRecordStoreV1Iterator::_getPrevRecord( const DiskLoc& loc ) {
+ return _recordStore->getPrevRecord( _txn, loc );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h
new file mode 100644
index 00000000000..501986d98fa
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h
@@ -0,0 +1,100 @@
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/storage/record_store.h"
+
+namespace mongo {
+
+ class CappedRecordStoreV1;
+
+ struct Extent;
+
+ /**
+ * This class iterates over a capped collection.
+ * The collection must exist when the constructor is called.
+ *
+ * If start is not DiskLoc(), the iteration begins at that DiskLoc.
+ *
+ * If tailable is true, getNext() can be called after isEOF. It will use the last valid
+ * returned DiskLoc and try to find the next record from that.
+ */
+ class CappedRecordStoreV1Iterator : public RecordIterator {
+ public:
+ CappedRecordStoreV1Iterator( OperationContext* txn,
+ const CappedRecordStoreV1* collection,
+ const DiskLoc& start,
+ bool tailable,
+ const CollectionScanParams::Direction& dir );
+ virtual ~CappedRecordStoreV1Iterator() { }
+
+ // If this is a tailable cursor, isEOF could change its mind after a call to getNext().
+ virtual bool isEOF();
+ virtual DiskLoc getNext();
+ virtual DiskLoc curr();
+
+ virtual void invalidate(const DiskLoc& dl);
+ virtual void prepareToYield();
+ virtual bool recoverFromYield();
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+ private:
+ /**
+ * Internal collection navigation helper methods.
+ */
+ DiskLoc getNextCapped(const DiskLoc& dl);
+ DiskLoc prevLoop(const DiskLoc& curr);
+ DiskLoc nextLoop(const DiskLoc& prev);
+
+ // Some helpers; these will probably move to RecordStore.
+ Extent* _getExtent( const DiskLoc& loc );
+ DiskLoc _getNextRecord( const DiskLoc& loc );
+ DiskLoc _getPrevRecord( const DiskLoc& loc );
+
+ // transactional context for read locks. Not owned by us
+ OperationContext* _txn;
+
+ // The collection we're iterating over.
+ const CappedRecordStoreV1* _recordStore;
+
+ // The result returned on the next call to getNext().
+ DiskLoc _curr;
+
+ // If we're tailable, we try to progress from the last valid result when we hit the end.
+ DiskLoc _prev;
+ bool _tailable;
+
+ CollectionScanParams::Direction _direction;
+
+ // If invalidate kills the DiskLoc we need to move forward, we kill the iterator. See the
+ // comment in the body of invalidate(...).
+ bool _killedByInvalidate;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp
new file mode 100644
index 00000000000..6e423b9e073
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp
@@ -0,0 +1,558 @@
+// record_store_v1_capped_test.cpp
+
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h"
+
+#include "mongo/db/operation_context_noop.h"
+#include "mongo/db/storage/mmap_v1/record.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h"
+#include "mongo/unittest/unittest.h"
+
+using namespace mongo;
+
+namespace {
+
+ // Provides data to be inserted. Must be large enough for largest possible record.
+ // Lives in BSS, so unused portions are effectively free.
+ char zeros[20*1024*1024] = {};
+
+ class DummyCappedDocumentDeleteCallback : public CappedDocumentDeleteCallback {
+ public:
+ Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) {
+ deleted.push_back( loc );
+ return Status::OK();
+ }
+ vector<DiskLoc> deleted;
+ };
+
+ void simpleInsertTest( const char* buf, int size ) {
+
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+
+ string myns = "test.simple1";
+ CappedRecordStoreV1 rs( &txn, &cb, myns, md, &em, false );
+
+ rs.increaseStorageSize( &txn, 1024, -1 );
+
+ ASSERT_NOT_OK( rs.insertRecord( &txn, buf, 3, 1000 ).getStatus() );
+
+ rs.insertRecord( &txn, buf, size, 10000 );
+
+ {
+ BSONObjBuilder b;
+ int64_t storageSize = rs.storageSize( &txn, &b );
+ BSONObj obj = b.obj();
+ ASSERT_EQUALS( 1, obj["numExtents"].numberInt() );
+ ASSERT_EQUALS( storageSize, em.quantizeExtentSize( 1024 ) );
+ }
+
+ for ( int i = 0; i < 1000; i++ ) {
+ ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() );
+ }
+
+ long long start = md->numRecords();
+ for ( int i = 0; i < 1000; i++ ) {
+ ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() );
+ }
+ ASSERT_EQUALS( start, md->numRecords() );
+ ASSERT_GREATER_THAN( start, 100 );
+ ASSERT_LESS_THAN( start, 1000 );
+ }
+
+ TEST(CappedRecordStoreV1, SimpleInsertSize4) {
+ simpleInsertTest("abcd", 4);
+ }
+ TEST(CappedRecordStoreV1, SimpleInsertSize8) {
+ simpleInsertTest("abcdefgh", 8);
+ }
+
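+ // The tests below lay out an exact on-disk state with initializeV1RS(),
+ // perform one or more inserts, and then verify the resulting record and
+ // deleted-record layout with assertStateV1RS(). Each LocAndSize list is
+ // terminated by an empty {} entry.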
+ TEST(CappedRecordStoreV1, EmptySingleExtent) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 1000},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1100), 900},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped
+ }
+ }
+
+ TEST(CappedRecordStoreV1, FirstLoopWithSingleExtentExactSize) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1500), 50},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1200), 100}, // first old record
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100}, // last old record
+ {DiskLoc(0, 1000), 100}, // first new record
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug
+ {DiskLoc(0, 1500), 50}, // gap at end of extent
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+ }
+
+ TEST(CappedRecordStoreV1, NonFirstLoopWithSingleExtentExactSize) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1500), 50},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1200), 100}, // first old record
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100}, // last old record
+ {DiskLoc(0, 1000), 100}, // first new record
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug
+ {DiskLoc(0, 1500), 50}, // gap at end of extent
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+ }
+
+ /**
+ * Current code always tries to leave 24 bytes to create a DeletedRecord.
+ */
+ TEST(CappedRecordStoreV1, WillLoopWithout24SpareBytes) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1500), 123},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1200), 100}, // first old record
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100}, // last old record
+ {DiskLoc(0, 1000), 100}, // first new record
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1100), 100}, // gap after newest record
+ {DiskLoc(0, 1500), 123}, // gap at end of extent
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+ }
+
+ TEST(CappedRecordStoreV1, WontLoopWith24SpareBytes) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ LocAndSize records[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1500), 124},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1200), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {DiskLoc(0, 1500), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1600), 24}, // gap at end of extent
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+ }
+
+ TEST(CappedRecordStoreV1, MoveToSecondExtentUnLooped) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Two extents, each with 1000 bytes.
+ LocAndSize records[] = {
+ {DiskLoc(0, 1000), 500},
+ {DiskLoc(0, 1500), 300},
+ {DiskLoc(0, 1800), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1900), 100},
+ {DiskLoc(1, 1000), 1000},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid());
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 500},
+ {DiskLoc(0, 1500), 300},
+ {DiskLoc(0, 1800), 100},
+
+ {DiskLoc(1, 1000), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1900), 100},
+ {DiskLoc(1, 1100), 900},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped
+ }
+ }
+
+ TEST(CappedRecordStoreV1, MoveToSecondExtentLooped) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Two extents, each with 1000 bytes.
+ LocAndSize records[] = {
+ {DiskLoc(0, 1800), 100}, // old
+ {DiskLoc(0, 1000), 500}, // first new
+ {DiskLoc(0, 1500), 400},
+
+ {DiskLoc(1, 1000), 300},
+ {DiskLoc(1, 1300), 600},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1900), 100},
+ {DiskLoc(1, 1900), 100},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000));
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 500},
+ {DiskLoc(0, 1500), 400},
+
+ {DiskLoc(1, 1300), 600}, // old
+ {DiskLoc(1, 1000), 200}, // first new
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1800), 200},
+ {DiskLoc(1, 1200), 100},
+ {DiskLoc(1, 1900), 100},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(1, 1000));
+ }
+ }
+
+ //
+ // XXX The CappedRecordStoreV1Scrambler suite of tests describe existing behavior that is less
+ // than ideal. Any improved implementation will need to be able to handle a collection that has
+ // been scrambled like this.
+ //
+
+ /**
+ * This is a minimal example that shows the current allocator laying out records out-of-order.
+ */
+ TEST(CappedRecordStoreV1Scrambler, Minimal) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Starting with a single empty 1000 byte extent.
+ LocAndSize records[] = {
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 1000},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 500 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 300 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 400 - Record::HeaderSize, false); // won't fit at end so wraps
+ rs.insertRecord(&txn, zeros, 120 - Record::HeaderSize, false); // fits at end
+ rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); // fits in earlier hole
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1500), 300}, // 2nd insert
+ {DiskLoc(0, 1000), 400}, // 3rd (1st new)
+ {DiskLoc(0, 1800), 120}, // 4th
+ {DiskLoc(0, 1400), 60}, // 5th
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1460), 40},
+ {DiskLoc(0, 1920), 80},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+ }
+
+ /**
+ * This tests a specially crafted set of inserts that scrambles a capped collection in a way
+ * that leaves 4 deleted records in a single extent.
+ */
+ TEST(CappedRecordStoreV1Scrambler, FourDeletedRecordsInSingleExtent) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 );
+ DummyCappedDocumentDeleteCallback cb;
+ CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false);
+
+ {
+ // Starting with a single empty 1000 byte extent.
+ LocAndSize records[] = {
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 1000},
+ {}
+ };
+ md->setCapExtent(&txn, DiskLoc(0, 0));
+ md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped
+ initializeV1RS(&txn, records, drecs, &em, md);
+ }
+
+ // This list of sizes was empirically generated to achieve this outcome. Don't think too
+ // much about them.
+ rs.insertRecord(&txn, zeros, 500 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 300 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 304 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 76 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 76 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 56 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 104 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 146 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 146 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 40 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 40 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 36 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 64 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1148), 148},
+ {DiskLoc(0, 1936), 40},
+ {DiskLoc(0, 1712), 40},
+ {DiskLoc(0, 1296), 36},
+ {DiskLoc(0, 1752), 100},
+ {DiskLoc(0, 1332), 96},
+ {DiskLoc(0, 1428), 200},
+ {DiskLoc(0, 1852), 60},
+ {DiskLoc(0, 1000), 64}, // (1st new)
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1064), 84},
+ {DiskLoc(0, 1976), 24},
+ {DiskLoc(0, 1912), 24},
+ {DiskLoc(0, 1628), 84},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0));
+ ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000));
+ }
+ }
+}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp
new file mode 100644
index 00000000000..a210c0dc0f3
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp
@@ -0,0 +1,192 @@
+/**
+ * Copyright (C) 2014 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h"
+
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
+
+namespace mongo {
+
+ RecordStoreV1RepairIterator::RecordStoreV1RepairIterator(OperationContext* txn,
+ const RecordStoreV1Base* recordStore)
+ : _txn(txn), _recordStore(recordStore), _stage(FORWARD_SCAN) {
+
+ // Position the iterator at the first record
+ //
+ getNext();
+ }
+
+ bool RecordStoreV1RepairIterator::isEOF() {
+ return _currRecord.isNull();
+ }
+
+ DiskLoc RecordStoreV1RepairIterator::curr() { return _currRecord; }
+
+ DiskLoc RecordStoreV1RepairIterator::getNext() {
+ DiskLoc retVal = _currRecord;
+
+ const ExtentManager* em = _recordStore->_extentManager;
+
+ while (true) {
+ if (_currRecord.isNull()) {
+
+ if (!_advanceToNextValidExtent()) {
+ return retVal;
+ }
+
+ _seenInCurrentExtent.clear();
+
+ // Otherwise _advanceToNextValidExtent would have returned false
+ //
+ invariant(!_currExtent.isNull());
+
+ const Extent* e = em->getExtent(_currExtent, false);
+ _currRecord = (FORWARD_SCAN == _stage ? e->firstRecord : e->lastRecord);
+ }
+ else {
+ switch (_stage) {
+ case FORWARD_SCAN:
+ _currRecord = _recordStore->getNextRecordInExtent(_txn, _currRecord);
+ break;
+ case BACKWARD_SCAN:
+ _currRecord = _recordStore->getPrevRecordInExtent(_txn, _currRecord);
+ break;
+ default:
+ invariant(!"This should never be reached.");
+ break;
+ }
+ }
+
+ if (_currRecord.isNull()) {
+ continue;
+ }
+
+ // Validate the contents of the record's disk location and deduplicate
+ //
+ if (!_seenInCurrentExtent.insert(_currRecord).second) {
+ error() << "infinite loop in extent, seen: " << _currRecord << " before" << endl;
+ _currRecord = DiskLoc();
+ continue;
+ }
+
+ if (_currRecord.getOfs() <= 0) {
+ error() << "record offset is non-positive, which should be impossible" << endl;
+ _currRecord = DiskLoc();
+ continue;
+ }
+
+ return retVal;
+ }
+ }
+
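+ // Drives the two-pass scan: extents are followed via xnext during
+ // FORWARD_SCAN and via xprev during BACKWARD_SCAN, and a corrupt or exhausted
+ // chain flips the stage (FORWARD_SCAN -> BACKWARD_SCAN -> DONE) instead of
+ // aborting, so salvageable records on either side of the damage are still
+ // visited.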
+ bool RecordStoreV1RepairIterator::_advanceToNextValidExtent() {
+ const ExtentManager* em = _recordStore->_extentManager;
+
+ while (true) {
+ if (_currExtent.isNull()) {
+ switch (_stage) {
+ case FORWARD_SCAN:
+ _currExtent = _recordStore->details()->firstExtent(_txn);
+ break;
+ case BACKWARD_SCAN:
+ _currExtent = _recordStore->details()->lastExtent(_txn);
+ break;
+ default:
+ invariant(DONE == _stage);
+ return false;
+ }
+ }
+ else {
+ // If _currExtent is not NULL, then it must point to a valid extent, so no extra
+ // checks here.
+ //
+ const Extent* e = em->getExtent(_currExtent, false);
+ _currExtent = (FORWARD_SCAN == _stage ? e->xnext : e->xprev);
+ }
+
+ bool hasNextExtent = !_currExtent.isNull();
+
+ // Sanity checks for the extent's disk location
+ //
+ if (hasNextExtent && (!_currExtent.isValid() || (_currExtent.getOfs() <= 0))) {
+ error() << "Invalid extent location: " << _currExtent << endl;
+
+ // Switch the direction of scan
+ //
+ hasNextExtent = false;
+ }
+
+ if (hasNextExtent) {
+ break;
+ }
+
+ // Swap the direction of scan and loop again
+ //
+ switch (_stage) {
+ case FORWARD_SCAN:
+ _stage = BACKWARD_SCAN;
+ break;
+ case BACKWARD_SCAN:
+ _stage = DONE;
+ break;
+ default:
+ invariant(!"This should never be reached.");
+ break;
+ }
+
+ _currExtent = DiskLoc();
+ }
+
+
+ // Check _currExtent's contents for validity, but do not count it as a failure
+ // if they don't check out.
+ //
+ const Extent* e = em->getExtent(_currExtent, false);
+ if (!e->isOk()){
+ warning() << "Extent not ok magic: " << e->magic << " going to try to continue"
+ << endl;
+ }
+
+ log() << (FORWARD_SCAN == _stage ? "FORWARD" : "BACKWARD") << " Extent loc: "
+ << _currExtent << ", length: " << e->length << endl;
+
+ return true;
+ }
+
+ void RecordStoreV1RepairIterator::invalidate(const DiskLoc& dl) {
+ verify(!"Invalidate is not supported for RecordStoreV1RepairIterator.");
+ }
+
+ RecordData RecordStoreV1RepairIterator::dataFor(const DiskLoc& loc) const {
+ return _recordStore->dataFor( loc );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h
new file mode 100644
index 00000000000..c75c1c790c1
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h
@@ -0,0 +1,96 @@
+/**
+ * Copyright (C) 2014 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <set>
+
+#include "mongo/db/storage/record_store.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_base.h"
+
+namespace mongo {
+
+ /**
+ * This iterator will go over the collection twice - once going forward (first extent -> last
+ * extent) and once backwards in an attempt to salvage potentially corrupted or unreachable
+ * records. It is used by the mongodump --repair option.
+ */
+ class RecordStoreV1RepairIterator : public RecordIterator {
+ public:
+ RecordStoreV1RepairIterator(OperationContext* txn,
+ const RecordStoreV1Base* recordStore);
+ virtual ~RecordStoreV1RepairIterator() { }
+
+ virtual bool isEOF();
+ virtual DiskLoc getNext();
+ virtual DiskLoc curr();
+
+ virtual void invalidate(const DiskLoc& dl);
+ virtual void prepareToYield() { }
+ virtual bool recoverFromYield() {
+ return true;
+ }
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+
+ private:
+
+ /**
+ * Based on the direction of scan, finds the next valid (un-corrupted) extent in the chain
+ * and sets _currExtent to point to that.
+ *
+ * @return true if valid extent was found (_currExtent will not be null)
+ * false otherwise and _currExtent will be null
+ */
+ bool _advanceToNextValidExtent();
+
+ // transactional context for read locks. Not owned by us
+ OperationContext* _txn;
+
+ // Reference to the owning RecordStore. The store must not be deleted while there are
+ // active iterators on it.
+ //
+ const RecordStoreV1Base* _recordStore;
+
+ DiskLoc _currExtent;
+ DiskLoc _currRecord;
+
+ enum Stage {
+ FORWARD_SCAN = 0,
+ BACKWARD_SCAN = 1,
+ DONE = 2
+ };
+
+ Stage _stage;
+
+ // Used to find cycles within an extent. Cleared after each extent has been processed.
+ //
+ std::set<DiskLoc> _seenInCurrentExtent;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp
new file mode 100644
index 00000000000..7a9d17974eb
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp
@@ -0,0 +1,505 @@
+// record_store_v1_simple.cpp
+
+/**
+ * Copyright (C) 2013-2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
+
+#include "mongo/base/counter.h"
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/curop.h"
+#include "mongo/db/commands/server_status_metric.h"
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h"
+#include "mongo/util/log.h"
+#include "mongo/util/progress_meter.h"
+#include "mongo/util/timer.h"
+#include "mongo/util/touch_pages.h"
+
+namespace mongo {
+
+ MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kStorage);
+
+ static Counter64 freelistAllocs;
+ static Counter64 freelistBucketExhausted;
+ static Counter64 freelistIterations;
+
+ static ServerStatusMetricField<Counter64> dFreelist1( "storage.freelist.search.requests",
+ &freelistAllocs );
+
+ static ServerStatusMetricField<Counter64> dFreelist2( "storage.freelist.search.bucketExhausted",
+ &freelistBucketExhausted );
+
+ static ServerStatusMetricField<Counter64> dFreelist3( "storage.freelist.search.scanned",
+ &freelistIterations );
+
+ SimpleRecordStoreV1::SimpleRecordStoreV1( OperationContext* txn,
+ const StringData& ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes )
+ : RecordStoreV1Base( ns, details, em, isSystemIndexes ) {
+
+ invariant( !details->isCapped() );
+ _normalCollection = NamespaceString::normal( ns );
+ if ( _details->paddingFactor() == 0 ) {
+            warning() << "implicit upgrade of paddingFactor of very old collection" << endl;
+ _details->setPaddingFactor(txn, 1.0);
+ }
+
+ }
+
+ SimpleRecordStoreV1::~SimpleRecordStoreV1() {
+ }
+
+ DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn,
+ int lenToAlloc ) {
+ // align size up to a multiple of 4
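+        // e.g. 33 -> 36; values already a multiple of 4 are unchanged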
+ lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1);
+
+ freelistAllocs.increment();
+ DiskLoc loc;
+ {
+ DiskLoc *prev = 0;
+ DiskLoc *bestprev = 0;
+ DiskLoc bestmatch;
+ int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough
+ int b = bucket(lenToAlloc);
+ DiskLoc cur = _details->deletedListEntry(b);
+
+ int extra = 5; // look for a better fit, a little.
+ int chain = 0;
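+            // Best-fit search: start in lenToAlloc's size bucket and walk its free
+            // list (entries there may still be too small). Once a big-enough record
+            // is found, look at a few more links (the 'extra' counter) for a tighter
+            // fit; an exact match stops the search immediately. After ~30 links in a
+            // single bucket the search skips ahead to the next bucket. Running past
+            // MaxBucket returns a null DiskLoc, telling the caller to allocate a new
+            // extent.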
+ while ( 1 ) {
+ { // defensive check
+ int fileNumber = cur.a();
+ int fileOffset = cur.getOfs();
+ if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
+ StringBuilder sb;
+ sb << "Deleted record list corrupted in collection " << _ns
+ << ", bucket " << b
+ << ", link number " << chain
+ << ", invalid link is " << cur.toString()
+ << ", throwing Fatal Assertion";
+ log() << sb.str() << endl;
+ fassertFailed(16469);
+ }
+ }
+ if ( cur.isNull() ) {
+ // move to next bucket. if we were doing "extra", just break
+ if ( bestmatchlen < INT_MAX )
+ break;
+
+ if ( chain > 0 ) {
+ // if we looked at things in the right bucket, but they were not suitable
+ freelistBucketExhausted.increment();
+ }
+
+ b++;
+ if ( b > MaxBucket ) {
+ // out of space. alloc a new extent.
+ freelistIterations.increment( 1 + chain );
+ return DiskLoc();
+ }
+ cur = _details->deletedListEntry(b);
+ prev = 0;
+ continue;
+ }
+ DeletedRecord *r = drec(cur);
+ if ( r->lengthWithHeaders() >= lenToAlloc &&
+ r->lengthWithHeaders() < bestmatchlen ) {
+ bestmatchlen = r->lengthWithHeaders();
+ bestmatch = cur;
+ bestprev = prev;
+ if (r->lengthWithHeaders() == lenToAlloc)
+ // exact match, stop searching
+ break;
+ }
+ if ( bestmatchlen < INT_MAX && --extra <= 0 )
+ break;
+ if ( ++chain > 30 && b <= MaxBucket ) {
+                    // too slow, force a move to the next bucket to grab a big chunk:
+                    // nulling cur makes the next iteration advance to the next bucket
+ freelistIterations.increment( chain );
+ chain = 0;
+ cur.Null();
+ }
+ else {
+ cur = r->nextDeleted();
+ prev = &r->nextDeleted();
+ }
+ }
+
+ // unlink ourself from the deleted list
+ DeletedRecord *bmr = drec(bestmatch);
+ if ( bestprev ) {
+ *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
+ }
+ else {
+ // should be the front of a free-list
+ int myBucket = bucket(bmr->lengthWithHeaders());
+ invariant( _details->deletedListEntry(myBucket) == bestmatch );
+ _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
+ }
+ *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
+ invariant(bmr->extentOfs() < bestmatch.getOfs());
+
+ freelistIterations.increment( 1 + chain );
+ loc = bestmatch;
+ }
+
+ if ( loc.isNull() )
+ return loc;
+
+ // determine if we should chop up
+
+ DeletedRecord *r = drec(loc);
+
+ /* note we want to grab from the front so our next pointers on disk tend
+ to go in a forward direction which is important for performance. */
+ int regionlen = r->lengthWithHeaders();
+ invariant( r->extentOfs() < loc.getOfs() );
+
+ int left = regionlen - lenToAlloc;
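+        // Split the remainder off as a new deleted record only if it is worth
+        // tracking: at least 24 bytes and at least 1/8th of the allocation being
+        // handed out; otherwise the caller gets the whole region.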
+ if ( left < 24 || left < (lenToAlloc / 8) ) {
+ // you get the whole thing.
+ return loc;
+ }
+
+ // don't quantize:
+ // - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
+ if ( _normalCollection ) {
+ // we quantize here so that it only impacts newly sized records
+ // this prevents oddities with older records and space re-use SERVER-8435
+ lenToAlloc = std::min( r->lengthWithHeaders(),
+ quantizeAllocationSpace( lenToAlloc ) );
+ left = regionlen - lenToAlloc;
+
+ if ( left < 24 ) {
+ // you get the whole thing.
+ return loc;
+ }
+ }
+
+ /* split off some for further use. */
+ txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
+ DiskLoc newDelLoc = loc;
+ newDelLoc.inc(lenToAlloc);
+ DeletedRecord* newDel = drec(newDelLoc);
+ DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
+ newDelW->extentOfs() = r->extentOfs();
+ newDelW->lengthWithHeaders() = left;
+ newDelW->nextDeleted().Null();
+
+ addDeletedRec( txn, newDelLoc );
+ return loc;
+ }
+
+ StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord( OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota ) {
+ DiskLoc loc = _allocFromExistingExtents( txn, lengthWithHeaders );
+ if ( !loc.isNull() )
+ return StatusWith<DiskLoc>( loc );
+
+ LOG(1) << "allocating new extent";
+
+ increaseStorageSize( txn,
+ _extentManager->followupSize( lengthWithHeaders,
+ _details->lastExtentSize(txn)),
+ enforceQuota );
+
+ loc = _allocFromExistingExtents( txn, lengthWithHeaders );
+ if ( !loc.isNull() ) {
+ // got on first try
+ return StatusWith<DiskLoc>( loc );
+ }
+
+ log() << "warning: alloc() failed after allocating new extent. "
+ << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:"
+ << _details->lastExtentSize(txn) << "; trying again";
+
+ for ( int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++ ) {
+ log() << "try #" << z << endl;
+
+ increaseStorageSize( txn,
+ _extentManager->followupSize( lengthWithHeaders,
+ _details->lastExtentSize(txn)),
+ enforceQuota );
+
+ loc = _allocFromExistingExtents( txn, lengthWithHeaders );
+ if ( ! loc.isNull() )
+ return StatusWith<DiskLoc>( loc );
+ }
+
+ return StatusWith<DiskLoc>( ErrorCodes::InternalError, "cannot allocate space" );
+ }
+
+ Status SimpleRecordStoreV1::truncate(OperationContext* txn) {
+ return Status( ErrorCodes::InternalError,
+ "SimpleRecordStoreV1::truncate not implemented" );
+ }
+
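+    // Pushes a freed record onto the front of the free list for its size bucket.
+    // Allocation also searches from the front, so recently freed space tends to be
+    // reused first.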
+ void SimpleRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) {
+ DeletedRecord* d = drec( dloc );
+
+ DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl;
+
+ int b = bucket(d->lengthWithHeaders());
+ *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b);
+ _details->setDeletedListEntry(txn, b, dloc);
+ }
+
+ RecordIterator* SimpleRecordStoreV1::getIterator( OperationContext* txn,
+ const DiskLoc& start,
+ bool tailable,
+ const CollectionScanParams::Direction& dir) const {
+ return new SimpleRecordStoreV1Iterator( txn, this, start, dir );
+ }
+
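+    // Returns one iterator per non-empty extent. Each IntraExtentIterator visits
+    // only the records physically inside its extent, so the returned iterators cover
+    // the collection in disjoint pieces and can be consumed independently.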
+ vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const {
+ OwnedPointerVector<RecordIterator> iterators;
+ const Extent* ext;
+ for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) {
+ ext = _getExtent(txn, extLoc);
+ if (ext->firstRecord.isNull())
+ continue;
+ iterators.push_back(
+ new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this));
+ }
+
+ return iterators.release();
+ }
+
+ class CompactDocWriter : public DocWriter {
+ public:
+        /**
+         * Copies an existing record's data into a fresh allocation during compact.
+         * @param rec            record to copy from
+         * @param dataSize       number of data bytes to copy
+         * @param allocationSize allocation size WITH header; documentSize() reports
+         *                       this minus the header so the new record keeps the
+         *                       requested (possibly padded) size.
+         */
+ CompactDocWriter( const Record* rec, unsigned dataSize, size_t allocationSize )
+ : _rec( rec ),
+ _dataSize( dataSize ),
+ _allocationSize( allocationSize ) {
+ }
+
+ virtual ~CompactDocWriter() {}
+
+ virtual void writeDocument( char* buf ) const {
+ memcpy( buf, _rec->data(), _dataSize );
+ }
+
+ virtual size_t documentSize() const {
+ return _allocationSize - Record::HeaderSize;
+ }
+
+ virtual bool addPadding() const {
+ return false;
+ }
+
+ private:
+ const Record* _rec;
+ size_t _dataSize;
+ size_t _allocationSize;
+ };
+
+ void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
+ const DiskLoc diskloc,
+ int extentNumber,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* compactOptions,
+ CompactStats* stats ) {
+
+ log() << "compact begin extent #" << extentNumber
+ << " for namespace " << _ns << " " << diskloc;
+
+ unsigned oldObjSize = 0; // we'll report what the old padding was
+ unsigned oldObjSizeWithPadding = 0;
+
+ Extent *e = _extentManager->getExtent( diskloc );
+ e->assertOk();
+ fassert( 17437, e->validates(diskloc) );
+
+ {
+ // the next/prev pointers within the extent might not be in order so we first
+ // page the whole thing in sequentially
+ log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
+ Timer t;
+ size_t length = e->length;
+
+ touch_pages( reinterpret_cast<const char*>(e), length );
+ int ms = t.millis();
+ if( ms > 1000 )
+ log() << "compact end paging in " << ms << "ms "
+ << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
+ }
+
+ {
+ log() << "compact copying records" << endl;
+ long long datasize = 0;
+ long long nrecords = 0;
+ DiskLoc L = e->firstRecord;
+ if( !L.isNull() ) {
+ while( 1 ) {
+ Record *recOld = recordFor(L);
+ RecordData oldData = recOld->toRecordData();
+ L = getNextRecordInExtent(txn, L);
+
+ if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) {
+ // object is corrupt!
+ log() << "compact skipping corrupt document!";
+ stats->corruptDocuments++;
+ }
+ else {
+ unsigned dataSize = adaptor->dataSize( oldData );
+ unsigned docSize = dataSize;
+
+ nrecords++;
+ oldObjSize += docSize;
+ oldObjSizeWithPadding += recOld->netLength();
+
+ unsigned lenWHdr = docSize + Record::HeaderSize;
+ unsigned lenWPadding = lenWHdr;
+
+ switch( compactOptions->paddingMode ) {
+ case CompactOptions::NONE:
+ if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) )
+ lenWPadding = quantizePowerOf2AllocationSpace(lenWPadding);
+ break;
+ case CompactOptions::PRESERVE:
+ // if we are preserving the padding, the record should not change size
+ lenWPadding = recOld->lengthWithHeaders();
+ break;
+ case CompactOptions::MANUAL:
+ lenWPadding = compactOptions->computeRecordSize(lenWPadding);
+ if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
+ lenWPadding = lenWHdr;
+ }
+ break;
+ }
+
+ CompactDocWriter writer( recOld, dataSize, lenWPadding );
+ StatusWith<DiskLoc> status = insertRecord( txn, &writer, false );
+ uassertStatusOK( status.getStatus() );
+ datasize += recordFor( status.getValue() )->netLength();
+
+ adaptor->inserted( dataFor( status.getValue() ), status.getValue() );
+ }
+
+ if( L.isNull() ) {
+                        // we just did the very last record from the old extent. it's still pointed
+                        // to by the old extent, but that will be fixed below after this loop
+ break;
+ }
+
+ // remove the old records (orphan them) periodically so our commit block doesn't get too large
+ bool stopping = false;
+ RARELY stopping = !txn->checkForInterruptNoAssert().isOK();
+ if( stopping || txn->recoveryUnit()->isCommitNeeded() ) {
+ *txn->recoveryUnit()->writing(&e->firstRecord) = L;
+ Record *r = recordFor(L);
+ txn->recoveryUnit()->writingInt(r->prevOfs()) = DiskLoc::NullOfs;
+ txn->recoveryUnit()->commitIfNeeded();
+ txn->checkForInterrupt();
+ }
+ }
+ } // if !L.isNull()
+
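+            // compact() walks the extents front-to-back and _compactExtent() frees
+            // each extent once its records are copied out, so the extent being
+            // compacted is always the current first extent.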
+ invariant( _details->firstExtent(txn) == diskloc );
+ invariant( _details->lastExtent(txn) != diskloc );
+ DiskLoc newFirst = e->xnext;
+ _details->setFirstExtent( txn, newFirst );
+ *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc();
+ _extentManager->freeExtent( txn, diskloc );
+
+ txn->recoveryUnit()->commitIfNeeded();
+
+ {
+ double op = 1.0;
+ if( oldObjSize )
+ op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
+ log() << "compact finished extent #" << extentNumber << " containing " << nrecords
+ << " documents (" << datasize/1000000.0 << "MB)"
+ << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100;
+ }
+ }
+
+ }
+
+ Status SimpleRecordStoreV1::compact( OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats ) {
+
+ // this is a big job, so might as well make things tidy before we start just to be nice.
+ txn->recoveryUnit()->commitIfNeeded();
+
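+        // Outline: snapshot the extent list, orphan all deleted records, reset the
+        // extent sizing and stats, allocate one fresh extent to receive copied
+        // records, then compact the old extents front-to-back. Each _compactExtent()
+        // call copies out the live records and frees its extent, reclaiming space as
+        // we go.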
+ list<DiskLoc> extents;
+ for( DiskLoc extLocation = _details->firstExtent(txn);
+ !extLocation.isNull();
+ extLocation = _extentManager->getExtent( extLocation )->xnext ) {
+ extents.push_back( extLocation );
+ }
+ log() << "compact " << extents.size() << " extents";
+
+ log() << "compact orphan deleted lists" << endl;
+ _details->orphanDeletedList(txn);
+
+ // Start over from scratch with our extent sizing and growth
+ _details->setLastExtentSize( txn, 0 );
+
+ // create a new extent so new records go there
+ increaseStorageSize( txn, _details->lastExtentSize(txn), true );
+
+ // reset data size and record counts to 0 for this namespace
+ // as we're about to tally them up again for each new extent
+ _details->setStats( txn, 0, 0 );
+
+ ProgressMeterHolder pm(*txn->setMessage("compact extent",
+ "Extent Compacting Progress",
+ extents.size()));
+
+ int extentNumber = 0;
+ for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
+ _compactExtent(txn, *i, extentNumber++, adaptor, options, stats );
+ pm.hit();
+ }
+
+ invariant( _extentManager->getExtent( _details->firstExtent(txn) )->xprev.isNull() );
+ invariant( _extentManager->getExtent( _details->lastExtent(txn) )->xnext.isNull() );
+
+ // indexes will do their own progress meter
+ pm.finished();
+
+ return Status::OK();
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h
new file mode 100644
index 00000000000..abc6b11b928
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h
@@ -0,0 +1,95 @@
+// record_store_v1_simple.h
+
+/**
+* Copyright (C) 2013-2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/db/diskloc.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_base.h"
+
+namespace mongo {
+
+ class SimpleRecordStoreV1Iterator;
+
+ // used by index and original collections
+ class SimpleRecordStoreV1 : public RecordStoreV1Base {
+ public:
+ SimpleRecordStoreV1( OperationContext* txn,
+ const StringData& ns,
+ RecordStoreV1MetaData* details,
+ ExtentManager* em,
+ bool isSystemIndexes );
+
+ virtual ~SimpleRecordStoreV1();
+
+ const char* name() const { return "SimpleRecordStoreV1"; }
+
+ virtual RecordIterator* getIterator( OperationContext* txn, const DiskLoc& start, bool tailable,
+ const CollectionScanParams::Direction& dir) const;
+
+ virtual std::vector<RecordIterator*> getManyIterators(OperationContext* txn) const;
+
+ virtual Status truncate(OperationContext* txn);
+
+ virtual void temp_cappedTruncateAfter(OperationContext* txn, DiskLoc end, bool inclusive) {
+ invariant(!"cappedTruncateAfter not supported");
+ }
+
+ virtual bool compactSupported() const { return true; }
+ virtual Status compact( OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats );
+
+ protected:
+ virtual bool isCapped() const { return false; }
+
+ virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn,
+ int lengthWithHeaders,
+ bool enforceQuota );
+
+ virtual void addDeletedRec(OperationContext* txn,
+ const DiskLoc& dloc);
+ private:
+ DiskLoc _allocFromExistingExtents( OperationContext* txn,
+ int lengthWithHeaders );
+
+ void _compactExtent(OperationContext* txn,
+ const DiskLoc diskloc,
+ int extentNumber,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* compactOptions,
+ CompactStats* stats );
+
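+        // True for ordinary collections, false for index ($-suffixed) namespaces,
+        // which skip allocation quantization (see _allocFromExistingExtents).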
+ bool _normalCollection;
+
+ friend class SimpleRecordStoreV1Iterator;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp
new file mode 100644
index 00000000000..803b1494920
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp
@@ -0,0 +1,130 @@
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h"
+
+#include "mongo/db/catalog/collection.h"
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
+
+namespace mongo {
+
+ //
+ // Regular / non-capped collection traversal
+ //
+
+ SimpleRecordStoreV1Iterator::SimpleRecordStoreV1Iterator(OperationContext* txn,
+ const SimpleRecordStoreV1* collection,
+ const DiskLoc& start,
+ const CollectionScanParams::Direction& dir)
+ : _txn(txn), _curr(start), _recordStore(collection), _direction(dir) {
+
+ if (_curr.isNull()) {
+
+ const ExtentManager* em = _recordStore->_extentManager;
+
+ if ( _recordStore->details()->firstExtent(txn).isNull() ) {
+ // nothing in the collection
+ verify( _recordStore->details()->lastExtent(txn).isNull() );
+ }
+ else if (CollectionScanParams::FORWARD == _direction) {
+
+ // Find a non-empty extent and start with the first record in it.
+ Extent* e = em->getExtent( _recordStore->details()->firstExtent(txn) );
+
+ while (e->firstRecord.isNull() && !e->xnext.isNull()) {
+ e = em->getExtent( e->xnext );
+ }
+
+                // _curr may be set to DiskLoc() here if e->firstRecord is null but there is
+                // no valid e->xnext
+ _curr = e->firstRecord;
+ }
+ else {
+ // Walk backwards, skipping empty extents, and use the last record in the first
+ // non-empty extent we see.
+ Extent* e = em->getExtent( _recordStore->details()->lastExtent(txn) );
+
+                // TODO ELABORATE
+                // Does one of e->lastRecord.isNull(), e->firstRecord.isNull() imply the other?
+ while (e->lastRecord.isNull() && !e->xprev.isNull()) {
+ e = em->getExtent( e->xprev );
+ }
+
+ // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no
+ // valid e->xprev
+ _curr = e->lastRecord;
+ }
+ }
+ }
+
+ bool SimpleRecordStoreV1Iterator::isEOF() {
+ return _curr.isNull();
+ }
+
+ DiskLoc SimpleRecordStoreV1Iterator::curr() { return _curr; }
+
+ DiskLoc SimpleRecordStoreV1Iterator::getNext() {
+ DiskLoc ret = _curr;
+
+ // Move to the next thing.
+ if (!isEOF()) {
+ if (CollectionScanParams::FORWARD == _direction) {
+ _curr = _recordStore->getNextRecord( _txn, _curr );
+ }
+ else {
+ _curr = _recordStore->getPrevRecord( _txn, _curr );
+ }
+ }
+
+ return ret;
+ }
+
+ void SimpleRecordStoreV1Iterator::invalidate(const DiskLoc& dl) {
+ // Just move past the thing being deleted.
+ if (dl == _curr) {
+ // We don't care about the return of getNext so much as the side effect of moving _curr
+ // to the 'next' thing.
+ getNext();
+ }
+ }
+
+ void SimpleRecordStoreV1Iterator::prepareToYield() {
+ }
+
+ bool SimpleRecordStoreV1Iterator::recoverFromYield() {
+ // if the collection is dropped, then the cursor should be destroyed
+ return true;
+ }
+
+ RecordData SimpleRecordStoreV1Iterator::dataFor( const DiskLoc& loc ) const {
+ return _recordStore->dataFor( loc );
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h
new file mode 100644
index 00000000000..ded30a3ee1d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h
@@ -0,0 +1,73 @@
+/**
+ * Copyright (C) 2013 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/storage/record_store.h"
+
+namespace mongo {
+
+ class SimpleRecordStoreV1;
+
+ /**
+ * This class iterates over a non-capped collection identified by 'ns'.
+ * The collection must exist when the constructor is called.
+ *
+ * If start is not DiskLoc(), the iteration begins at that DiskLoc.
+ */
+ class SimpleRecordStoreV1Iterator : public RecordIterator {
+ public:
+ SimpleRecordStoreV1Iterator( OperationContext* txn,
+ const SimpleRecordStoreV1* records,
+ const DiskLoc& start,
+ const CollectionScanParams::Direction& dir );
+ virtual ~SimpleRecordStoreV1Iterator() { }
+
+ virtual bool isEOF();
+ virtual DiskLoc getNext();
+ virtual DiskLoc curr();
+
+ virtual void invalidate(const DiskLoc& dl);
+ virtual void prepareToYield();
+ virtual bool recoverFromYield();
+
+ virtual RecordData dataFor( const DiskLoc& loc ) const;
+
+ private:
+ // for getNext, not owned
+ OperationContext* _txn;
+
+ // The result returned on the next call to getNext().
+ DiskLoc _curr;
+
+ const SimpleRecordStoreV1* _recordStore;
+
+ CollectionScanParams::Direction _direction;
+ };
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
new file mode 100644
index 00000000000..31f17f42b28
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
@@ -0,0 +1,775 @@
+// record_store_v1_simple_test.cpp
+
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
+
+#include "mongo/db/operation_context_noop.h"
+#include "mongo/db/storage/mmap_v1/record.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h"
+#include "mongo/unittest/unittest.h"
+
+using namespace mongo;
+
+namespace {
+
+    // Provides data to be inserted. Must be large enough for the largest record
+    // these tests allocate. Because it is zero-initialized it lives in BSS, so
+    // untouched portions cost no physical memory.
+ char zeros[20*1024*1024] = {};
+
+ TEST( SimpleRecordStoreV1, quantizeAllocationSpaceSimple ) {
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(33), 36);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000), 1024);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10001), 10240);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(100000), 106496);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000001), 1048576);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10000000), 10223616);
+ }
+
+ TEST( SimpleRecordStoreV1, quantizeAllocationMinMaxBound ) {
+ const int maxSize = 16 * 1024 * 1024;
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1), 2);
+ ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(maxSize), maxSize);
+ }
+
+ /**
+ * Test Quantize record allocation on every boundary, as well as boundary-1
+ * @see NamespaceDetails::quantizeAllocationSpace()
+ */
+ TEST( SimpleRecordStoreV1, quantizeAllocationBoundary ) {
+ for (int iBucket = 0; iBucket <= RecordStoreV1Base::MaxBucket; ++iBucket) {
+ // for each bucket in range [min, max)
+ const int bucketSize = RecordStoreV1Base::bucketSizes[iBucket];
+ const int prevBucketSize =
+ (iBucket - 1 >= 0) ? RecordStoreV1Base::bucketSizes[iBucket - 1] : 0;
+ const int intervalSize = bucketSize / 16;
+ for (int iBoundary = prevBucketSize;
+ iBoundary < bucketSize;
+ iBoundary += intervalSize) {
+ // for each quantization boundary within the bucket
+ for (int iSize = iBoundary - 1; iSize <= iBoundary; ++iSize) {
+ // test the quantization boundary - 1, and the boundary itself
+ const int quantized =
+ RecordStoreV1Base::quantizeAllocationSpace(iSize);
+ // assert quantized size is greater than or equal to requested size
+ ASSERT(quantized >= iSize);
+ // assert quantized size is within one quantization interval of
+ // the requested size
+ ASSERT(quantized - iSize <= intervalSize);
+ // assert quantization is an idempotent operation
+ ASSERT(quantized ==
+ RecordStoreV1Base::quantizeAllocationSpace(quantized));
+ }
+ }
+ }
+ }
+
+ /**
+     * For buckets up to 4MB, powerOf2 allocation rounds up to the next power of 2 and
+     * returns the input unmodified if it is already a power of 2.
+ */
+ TEST( SimpleRecordStoreV1, quantizePowerOf2Small ) {
+        // only tests buckets <= 4MB. Higher buckets quantize to 1MB boundaries even with powerOf2
+ for (int bucket = 0; bucket < RecordStoreV1Base::MaxBucket; bucket++) {
+ const int size = RecordStoreV1Base::bucketSizes[bucket];
+ const int nextSize = RecordStoreV1Base::bucketSizes[bucket + 1];
+
+ // size - 1 is quantized to size.
+ ASSERT_EQUALS( size,
+ RecordStoreV1Base::quantizePowerOf2AllocationSpace( size - 1 ) );
+
+ // size is quantized to size.
+ ASSERT_EQUALS( size,
+ RecordStoreV1Base::quantizePowerOf2AllocationSpace( size ) );
+
+ // size + 1 is quantized to nextSize (unless > 4MB which is covered by next test)
+ if (size < 4*1024*1024) {
+ ASSERT_EQUALS( nextSize,
+ RecordStoreV1Base::quantizePowerOf2AllocationSpace( size + 1 ) );
+ }
+ }
+ }
+
+ /**
+ * Within the largest bucket, quantizePowerOf2AllocationSpace quantizes to the nearest
+ * megabyte boundary.
+ */
+ TEST( SimpleRecordStoreV1, SimpleRecordLargePowerOf2ToMegabyteBoundary ) {
+ // Iterate iSize over all 1mb boundaries from the size of the next to largest bucket
+ // to the size of the largest bucket + 1mb.
+ for( int iSize = RecordStoreV1Base::bucketSizes[ RecordStoreV1Base::MaxBucket - 1 ];
+ iSize <= RecordStoreV1Base::bucketSizes[ RecordStoreV1Base::MaxBucket ] + 0x100000;
+ iSize += 0x100000 ) {
+
+ // iSize - 1 is quantized to iSize.
+ ASSERT_EQUALS( iSize,
+ RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize - 1 ) );
+
+ // iSize is quantized to iSize.
+ ASSERT_EQUALS( iSize,
+ RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize ) );
+
+ // iSize + 1 is quantized to iSize + 1mb.
+ ASSERT_EQUALS( iSize + 0x100000,
+ RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize + 1 ) );
+ }
+ }
+
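+    // Builds { _id: 5, x: "xx...x" } whose on-disk record (Record header + BSON) is
+    // exactly 'size' bytes. The constant 22 is the fixed BSON overhead: 4 (object
+    // length) + 9 ("_id" int element) + 8 ("x" string element header) + 1 (EOO byte).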
+ BSONObj docForRecordSize( int size ) {
+ BSONObjBuilder b;
+ b.append( "_id", 5 );
+ b.append( "x", string( size - Record::HeaderSize - 22, 'x' ) );
+ BSONObj x = b.obj();
+ ASSERT_EQUALS( Record::HeaderSize + x.objsize(), size );
+ return x;
+ }
+
+ /** alloc() quantizes the requested size using quantizeAllocationSpace() rules. */
+ TEST(SimpleRecordStoreV1, AllocQuantized) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+
+ string myns = "test.AllocQuantized";
+ SimpleRecordStoreV1 rs( &txn, myns, md, &em, false );
+
+ BSONObj obj = docForRecordSize( 300 );
+ StatusWith<DiskLoc> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false);
+ ASSERT( result.isOK() );
+
+ // The length of the allocated record is quantized.
+ ASSERT_EQUALS( 320, rs.dataFor( result.getValue() ).size() + Record::HeaderSize );
+ }
+
+ /**
+ * alloc() does not quantize records in index collections using quantizeAllocationSpace()
+ * rules.
+ */
+ TEST(SimpleRecordStoreV1, AllocIndexNamespaceNotQuantized) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+
+ string myns = "test.AllocIndexNamespaceNotQuantized";
+ SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false );
+
+ BSONObj obj = docForRecordSize( 300 );
+ StatusWith<DiskLoc> result = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT( result.isOK() );
+
+ // The length of the allocated record is not quantized.
+ ASSERT_EQUALS( 300, rs.dataFor( result.getValue() ).size() + Record::HeaderSize );
+
+ }
+
+ /** alloc() quantizes records in index collections to the nearest multiple of 4. */
+ TEST(SimpleRecordStoreV1, AllocIndexNamespaceSlightlyQuantized) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+
+ string myns = "test.AllocIndexNamespaceNotQuantized";
+ SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false );
+
+ BSONObj obj = docForRecordSize( 298 );
+ StatusWith<DiskLoc> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false);
+ ASSERT( result.isOK() );
+
+ ASSERT_EQUALS( 300, rs.dataFor( result.getValue() ).size() + Record::HeaderSize );
+ }
+
+ /** alloc() returns a non quantized record larger than the requested size. */
+ TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecord) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 310},
+ {}
+ };
+ initializeV1RS(&txn, NULL, drecs, &em, md);
+ }
+
+ BSONObj obj = docForRecordSize( 300 );
+ StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT_OK( actualLocation.getStatus() );
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 310},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /** alloc() returns a non quantized record equal to the requested size. */
+ TEST(SimpleRecordStoreV1, AllocExactSizeNonQuantizedDeletedRecord) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 300},
+ {}
+ };
+ initializeV1RS(&txn, NULL, drecs, &em, md);
+ }
+
+ BSONObj obj = docForRecordSize( 300 );
+ StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT_OK( actualLocation.getStatus() );
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 300},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /**
+ * alloc() returns a non quantized record equal to the quantized size plus some extra space
+ * too small to make a DeletedRecord.
+ */
+ TEST(SimpleRecordStoreV1, AllocQuantizedWithExtra) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 343},
+ {}
+ };
+ initializeV1RS(&txn, NULL, drecs, &em, md);
+ }
+
+ BSONObj obj = docForRecordSize( 300 );
+ StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT_OK( actualLocation.getStatus() );
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 343},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /**
+ * alloc() returns a quantized record when the extra space in the reclaimed deleted record
+ * is large enough to form a new deleted record.
+ */
+ TEST(SimpleRecordStoreV1, AllocQuantizedWithoutExtra) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 344},
+ {}
+ };
+ initializeV1RS(&txn, NULL, drecs, &em, md);
+ }
+
+
+ BSONObj obj = docForRecordSize( 300 );
+ StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT_OK( actualLocation.getStatus() );
+
+ {
+ LocAndSize recs[] = {
+ // The returned record is quantized from 300 to 320.
+ {DiskLoc(0, 1000), 320},
+ {}
+ };
+ LocAndSize drecs[] = {
+ // A new 24 byte deleted record is split off.
+ {DiskLoc(0, 1320), 24},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /**
+ * A non quantized deleted record within 1/8 of the requested size is returned as is, even
+ * if a quantized portion of the deleted record could be used instead.
+ */
+ TEST(SimpleRecordStoreV1, AllocNotQuantizedNearDeletedSize) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 344},
+ {}
+ };
+ initializeV1RS(&txn, NULL, drecs, &em, md);
+ }
+
+ BSONObj obj = docForRecordSize( 319 );
+ StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false);
+ ASSERT_OK( actualLocation.getStatus() );
+
+ // Even though 319 would be quantized to 320 and 344 - 320 == 24 could become a new
+ // deleted record, the entire deleted record is returned because
+ // ( 344 - 320 ) < ( 320 / 8 ).
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 344},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /** getRecordAllocationSize() returns its argument when the padding factor is 1.0. */
+ TEST(SimpleRecordStoreV1, GetRecordAllocationSizeNoPadding) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+ ASSERT_EQUALS( 1.0, md->paddingFactor() );
+ ASSERT_EQUALS( 300, rs.getRecordAllocationSize( 300 ) );
+ }
+
+ /** getRecordAllocationSize() multiplies by a padding factor > 1.0. */
+ TEST(SimpleRecordStoreV1, GetRecordAllocationSizeWithPadding) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+ double paddingFactor = 1.2;
+ md->setPaddingFactor( &txn, paddingFactor );
+ ASSERT_EQUALS( paddingFactor, md->paddingFactor() );
+ ASSERT_EQUALS( int(300 * paddingFactor), rs.getRecordAllocationSize( 300 ) );
+ }
+
+ /**
+ * getRecordAllocationSize() quantizes to the nearest power of 2 when Flag_UsePowerOf2Sizes
+ * is set.
+ */
+ TEST(SimpleRecordStoreV1, GetRecordAllocationSizePowerOf2) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(
+ false,
+ RecordStoreV1Base::Flag_UsePowerOf2Sizes );
+
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+ ASSERT_EQUALS( 512, rs.getRecordAllocationSize( 300 ) );
+ }
+
+ /**
+ * getRecordAllocationSize() quantizes to the nearest power of 2 when Flag_UsePowerOf2Sizes
+ * is set, ignoring the padding factor.
+ */
+ TEST(SimpleRecordStoreV1, GetRecordAllocationSizePowerOf2PaddingIgnored) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(
+ false,
+ RecordStoreV1Base::Flag_UsePowerOf2Sizes );
+
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+ md->setPaddingFactor( &txn, 2.0 );
+ ASSERT_EQUALS( 2.0, md->paddingFactor() );
+ ASSERT_EQUALS( 512, rs.getRecordAllocationSize( 300 ) );
+ }
+
+
+ // -----------------
+
+ TEST( SimpleRecordStoreV1, FullSimple1 ) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn,
+ "test.foo",
+ md,
+ &em,
+ false );
+
+
+ ASSERT_EQUALS( 0, md->numRecords() );
+ StatusWith<DiskLoc> result = rs.insertRecord( &txn, "abc", 4, 1000 );
+ ASSERT_TRUE( result.isOK() );
+ ASSERT_EQUALS( 1, md->numRecords() );
+ RecordData recordData = rs.dataFor( result.getValue() );
+ ASSERT_EQUALS( string("abc"), string(recordData.data()) );
+ }
+
+ // ----------------
+
+ /**
+ * Inserts take the first deleted record with the correct size.
+ */
+ TEST( SimpleRecordStoreV1, InsertTakesFirstDeletedWithExactSize ) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(2, 1100), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1200), 100}, // this one will be used
+ {DiskLoc(2, 1000), 100},
+ {DiskLoc(1, 1000), 1000},
+ {}
+ };
+
+ initializeV1RS(&txn, recs, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1000), 100},
+ {DiskLoc(0, 1100), 100},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1200), 100}, // this is the new record
+ {DiskLoc(2, 1100), 100},
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(2, 1000), 100},
+ {DiskLoc(1, 1000), 1000},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /**
+ * Test that we keep looking for better matches for 5 links once we find a non-exact match.
+ * This "extra" scanning does not proceed into bigger buckets.
+ * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents.
+ */
+ TEST( SimpleRecordStoreV1, InsertLooksForBetterMatchUpTo5Links ) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize recs[] = {
+ {}
+ };
+ LocAndSize drecs[] = {
+ // This intentionally leaves gaps to keep locs readable.
+ {DiskLoc(0, 1000), 75}, // too small
+ {DiskLoc(0, 1100), 100}, // 1st big enough: will be first record
+ {DiskLoc(0, 1200), 100}, // 2nd: will be third record
+ {DiskLoc(0, 1300), 100}, // 3rd
+ {DiskLoc(0, 1400), 100}, // 4th
+ {DiskLoc(0, 1500), 100}, // 5th: first and third will stop once they look here
+ {DiskLoc(0, 1600), 80}, // 6th: second will make it here and use this
+ {DiskLoc(0, 1700), 999}, // bigger bucket. Should never look here
+ {}
+ };
+ initializeV1RS(&txn, recs, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 1100), 100}, // 1st insert
+ {DiskLoc(0, 1600), 80}, // 2nd insert
+ {DiskLoc(0, 1200), 100}, // 3rd insert
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 75},
+ {DiskLoc(0, 1300), 100},
+ {DiskLoc(0, 1400), 100},
+ {DiskLoc(0, 1500), 100},
+ {DiskLoc(0, 1700), 999},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /**
+ * Test that we stop looking in a bucket once we see 31 too small drecs.
+ * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents.
+ */
+ TEST( SimpleRecordStoreV1, InsertLooksForMatchUpTo31Links ) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize recs[] = {
+ {}
+ };
+ LocAndSize drecs[] = {
+ // This intentionally leaves gaps to keep locs readable.
+ {DiskLoc(0, 1000), 50}, // different bucket
+
+ {DiskLoc(0, 1100), 75}, // 1st too small in correct bucket
+ {DiskLoc(0, 1200), 75},
+ {DiskLoc(0, 1300), 75},
+ {DiskLoc(0, 1400), 75},
+ {DiskLoc(0, 1500), 75},
+ {DiskLoc(0, 1600), 75},
+ {DiskLoc(0, 1700), 75},
+ {DiskLoc(0, 1800), 75},
+ {DiskLoc(0, 1900), 75},
+ {DiskLoc(0, 2000), 75}, // 10th too small
+ {DiskLoc(0, 2100), 75},
+ {DiskLoc(0, 2200), 75},
+ {DiskLoc(0, 2300), 75},
+ {DiskLoc(0, 2400), 75},
+ {DiskLoc(0, 2500), 75},
+ {DiskLoc(0, 2600), 75},
+ {DiskLoc(0, 2700), 75},
+ {DiskLoc(0, 2800), 75},
+ {DiskLoc(0, 2900), 75},
+ {DiskLoc(0, 3000), 75}, // 20th too small
+ {DiskLoc(0, 3100), 75},
+ {DiskLoc(0, 3200), 75},
+ {DiskLoc(0, 3300), 75},
+ {DiskLoc(0, 3400), 75},
+ {DiskLoc(0, 3500), 75},
+ {DiskLoc(0, 3600), 75},
+ {DiskLoc(0, 3700), 75},
+ {DiskLoc(0, 3800), 75},
+ {DiskLoc(0, 3900), 75},
+ {DiskLoc(0, 4000), 75}, // 30th too small
+ {DiskLoc(0, 4100), 75}, // 31st too small
+
+                {DiskLoc(0, 8000), 80}, // big enough but won't be seen until we take an earlier one
+ {DiskLoc(0, 9000), 140}, // bigger bucket. jumps here after seeing 31 drecs
+ {}
+ };
+ initializeV1RS(&txn, recs, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); // takes from bigger bucket
+ rs.insertRecord(&txn, zeros, 70 - Record::HeaderSize, false); // removes a 75-sized drec
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); // now sees big-enough drec
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 9000), 80}, // 1st insert went here
+ {DiskLoc(0, 1100), 75}, // 2nd here
+ {DiskLoc(0, 8000), 80}, // 3rd here
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 9000 + 80), 140 - 80}, // split off during first insert
+ {DiskLoc(0, 1000), 50},
+ {DiskLoc(0, 1200), 75},
+ {DiskLoc(0, 1300), 75},
+ {DiskLoc(0, 1400), 75},
+ {DiskLoc(0, 1500), 75},
+ {DiskLoc(0, 1600), 75},
+ {DiskLoc(0, 1700), 75},
+ {DiskLoc(0, 1800), 75},
+ {DiskLoc(0, 1900), 75},
+ {DiskLoc(0, 2000), 75},
+ {DiskLoc(0, 2100), 75},
+ {DiskLoc(0, 2200), 75},
+ {DiskLoc(0, 2300), 75},
+ {DiskLoc(0, 2400), 75},
+ {DiskLoc(0, 2500), 75},
+ {DiskLoc(0, 2600), 75},
+ {DiskLoc(0, 2700), 75},
+ {DiskLoc(0, 2800), 75},
+ {DiskLoc(0, 2900), 75},
+ {DiskLoc(0, 3000), 75},
+ {DiskLoc(0, 3100), 75},
+ {DiskLoc(0, 3200), 75},
+ {DiskLoc(0, 3300), 75},
+ {DiskLoc(0, 3400), 75},
+ {DiskLoc(0, 3500), 75},
+ {DiskLoc(0, 3600), 75},
+ {DiskLoc(0, 3700), 75},
+ {DiskLoc(0, 3800), 75},
+ {DiskLoc(0, 3900), 75},
+ {DiskLoc(0, 4000), 75},
+ {DiskLoc(0, 4100), 75},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+
+ /**
+ * Test that we stop looking in a bucket once we see 31 drecs, or look 4-past the first
+ * too-large match, whichever comes first. This is a combination of
+ * InsertLooksForBetterMatchUpTo5Links and InsertLooksForMatchUpTo31Links.
+ *
+ * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents.
+ */
+ TEST( SimpleRecordStoreV1, InsertLooksForMatchUpTo31LinksEvenIfFoundOversizedFit ) {
+ OperationContextNoop txn;
+ DummyExtentManager em;
+ DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 );
+ SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false );
+
+ {
+ LocAndSize recs[] = {
+ {}
+ };
+ LocAndSize drecs[] = {
+ // This intentionally leaves gaps to keep locs readable.
+ {DiskLoc(0, 1000), 50}, // different bucket
+
+ {DiskLoc(0, 1100), 75}, // 1st too small in correct bucket
+ {DiskLoc(0, 1200), 75},
+ {DiskLoc(0, 1300), 75},
+ {DiskLoc(0, 1400), 75},
+ {DiskLoc(0, 1500), 75},
+ {DiskLoc(0, 1600), 75},
+ {DiskLoc(0, 1700), 75},
+ {DiskLoc(0, 1800), 75},
+ {DiskLoc(0, 1900), 75},
+ {DiskLoc(0, 2000), 75}, // 10th too small
+ {DiskLoc(0, 2100), 75},
+ {DiskLoc(0, 2200), 75},
+ {DiskLoc(0, 2300), 75},
+ {DiskLoc(0, 2400), 75},
+ {DiskLoc(0, 2500), 75},
+ {DiskLoc(0, 2600), 75},
+ {DiskLoc(0, 2700), 75},
+ {DiskLoc(0, 2800), 75},
+ {DiskLoc(0, 2900), 75},
+ {DiskLoc(0, 3000), 75}, // 20th too small
+ {DiskLoc(0, 3100), 75},
+ {DiskLoc(0, 3200), 75},
+ {DiskLoc(0, 3300), 75},
+ {DiskLoc(0, 3400), 75},
+ {DiskLoc(0, 3500), 75},
+ {DiskLoc(0, 3600), 75},
+ {DiskLoc(0, 3700), 75}, // 27th too small
+
+ {DiskLoc(0, 7000), 95}, // 1st insert takes this
+ {DiskLoc(0, 7100), 95}, // 3rd insert takes this
+
+ {DiskLoc(0, 3800), 75},
+ {DiskLoc(0, 3900), 75}, // 29th too small (31st overall)
+
+ {DiskLoc(0, 8000), 80}, // exact match. taken by 2nd insert
+
+ {DiskLoc(0, 9000), 140}, // bigger bucket. Should never get here
+ {}
+ };
+ initializeV1RS(&txn, recs, drecs, &em, md);
+ }
+
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false);
+ rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false);
+
+ {
+ LocAndSize recs[] = {
+ {DiskLoc(0, 7000), 95}, // 1st insert went here
+ {DiskLoc(0, 8000), 80}, // 2nd here
+ {DiskLoc(0, 7100), 95}, // 3rd here
+ {}
+ };
+ LocAndSize drecs[] = {
+ {DiskLoc(0, 1000), 50},
+ {DiskLoc(0, 1100), 75},
+ {DiskLoc(0, 1200), 75},
+ {DiskLoc(0, 1300), 75},
+ {DiskLoc(0, 1400), 75},
+ {DiskLoc(0, 1500), 75},
+ {DiskLoc(0, 1600), 75},
+ {DiskLoc(0, 1700), 75},
+ {DiskLoc(0, 1800), 75},
+ {DiskLoc(0, 1900), 75},
+ {DiskLoc(0, 2000), 75},
+ {DiskLoc(0, 2100), 75},
+ {DiskLoc(0, 2200), 75},
+ {DiskLoc(0, 2300), 75},
+ {DiskLoc(0, 2400), 75},
+ {DiskLoc(0, 2500), 75},
+ {DiskLoc(0, 2600), 75},
+ {DiskLoc(0, 2700), 75},
+ {DiskLoc(0, 2800), 75},
+ {DiskLoc(0, 2900), 75},
+ {DiskLoc(0, 3000), 75},
+ {DiskLoc(0, 3100), 75},
+ {DiskLoc(0, 3200), 75},
+ {DiskLoc(0, 3300), 75},
+ {DiskLoc(0, 3400), 75},
+ {DiskLoc(0, 3500), 75},
+ {DiskLoc(0, 3600), 75},
+ {DiskLoc(0, 3700), 75},
+ {DiskLoc(0, 3800), 75},
+ {DiskLoc(0, 3900), 75},
+ {DiskLoc(0, 9000), 140},
+ {}
+ };
+ assertStateV1RS(&txn, recs, drecs, &em, md);
+ }
+ }
+} // namespace
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp
new file mode 100644
index 00000000000..3ea4298332f
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp
@@ -0,0 +1,608 @@
+// record_store_v1_test_help.cpp
+
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h"
+
+#include <algorithm>
+#include <map>
+#include <set>
+#include <vector>
+
+#include "mongo/db/storage/mmap_v1/extent.h"
+#include "mongo/db/storage/mmap_v1/record.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+
+ DummyRecordStoreV1MetaData::DummyRecordStoreV1MetaData( bool capped, int userFlags ) {
+ _dataSize = 0;
+ _numRecords = 0;
+ _capped = capped;
+ _userFlags = userFlags;
+ _lastExtentSize = 0;
+ _paddingFactor = 1;
+ _maxCappedDocs = numeric_limits<long long>::max();
+ _capFirstNewRecord.setInvalid();
+ if ( _capped ) {
+ // copied from NamespaceDetails::NamespaceDetails()
+ setDeletedListEntry( NULL, 1, DiskLoc().setInvalid() );
+ }
+ }
+
+ const DiskLoc& DummyRecordStoreV1MetaData::capExtent() const {
+ return _capExtent;
+ }
+
+ void DummyRecordStoreV1MetaData::setCapExtent( OperationContext* txn,
+ const DiskLoc& loc ) {
+ _capExtent = loc;
+ }
+
+ const DiskLoc& DummyRecordStoreV1MetaData::capFirstNewRecord() const {
+ return _capFirstNewRecord;
+ }
+
+ void DummyRecordStoreV1MetaData::setCapFirstNewRecord( OperationContext* txn,
+ const DiskLoc& loc ) {
+ _capFirstNewRecord = loc;
+ }
+
+ long long DummyRecordStoreV1MetaData::dataSize() const {
+ return _dataSize;
+ }
+
+ long long DummyRecordStoreV1MetaData::numRecords() const {
+ return _numRecords;
+ }
+
+ void DummyRecordStoreV1MetaData::incrementStats( OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement ) {
+ _dataSize += dataSizeIncrement;
+ _numRecords += numRecordsIncrement;
+ }
+
+ void DummyRecordStoreV1MetaData::setStats( OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement ) {
+ _dataSize = dataSizeIncrement;
+ _numRecords = numRecordsIncrement;
+ }
+
+ namespace {
+ DiskLoc myNull;
+ }
+
+ const DiskLoc& DummyRecordStoreV1MetaData::deletedListEntry( int bucket ) const {
+ invariant( bucket >= 0 );
+ if ( static_cast<size_t>( bucket ) >= _deletedLists.size() )
+ return myNull;
+ return _deletedLists[bucket];
+ }
+
+ void DummyRecordStoreV1MetaData::setDeletedListEntry( OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc ) {
+ invariant( bucket >= 0 );
+ invariant( bucket < 1000 );
+ while ( static_cast<size_t>( bucket ) >= _deletedLists.size() )
+ _deletedLists.push_back( DiskLoc() );
+ _deletedLists[bucket] = loc;
+ }
+
+ void DummyRecordStoreV1MetaData::orphanDeletedList(OperationContext* txn) {
+ invariant( false );
+ }
+
+ const DiskLoc& DummyRecordStoreV1MetaData::firstExtent(OperationContext* txn) const {
+ return _firstExtent;
+ }
+
+ void DummyRecordStoreV1MetaData::setFirstExtent( OperationContext* txn,
+ const DiskLoc& loc ) {
+ _firstExtent = loc;
+ }
+
+ const DiskLoc& DummyRecordStoreV1MetaData::lastExtent(OperationContext* txn) const {
+ return _lastExtent;
+ }
+
+ void DummyRecordStoreV1MetaData::setLastExtent( OperationContext* txn,
+ const DiskLoc& loc ) {
+ _lastExtent = loc;
+ }
+
+ bool DummyRecordStoreV1MetaData::isCapped() const {
+ return _capped;
+ }
+
+ bool DummyRecordStoreV1MetaData::isUserFlagSet( int flag ) const {
+ return _userFlags & flag;
+ }
+
+ bool DummyRecordStoreV1MetaData::setUserFlag( OperationContext* txn, int flag ) {
+ if ( ( _userFlags & flag ) == flag )
+ return false;
+
+ _userFlags |= flag;
+ return true;
+
+ }
+ bool DummyRecordStoreV1MetaData::clearUserFlag( OperationContext* txn, int flag ) {
+ if ( ( _userFlags & flag ) == 0 )
+ return false;
+
+ _userFlags &= ~flag;
+ return true;
+
+ }
+ bool DummyRecordStoreV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) {
+ if ( _userFlags == flags )
+ return false;
+ _userFlags = flags;
+ return true;
+ }
+
+
+ int DummyRecordStoreV1MetaData::lastExtentSize(OperationContext* txn) const {
+ return _lastExtentSize;
+ }
+
+ void DummyRecordStoreV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) {
+ _lastExtentSize = newMax;
+ }
+
+ long long DummyRecordStoreV1MetaData::maxCappedDocs() const {
+ return _maxCappedDocs;
+ }
+
+ double DummyRecordStoreV1MetaData::paddingFactor() const {
+ return _paddingFactor;
+ }
+
+ void DummyRecordStoreV1MetaData::setPaddingFactor( OperationContext* txn,
+ double paddingFactor ) {
+ _paddingFactor = paddingFactor;
+ }
+
+ // -----------------------------------------
+
+ DummyExtentManager::~DummyExtentManager() {
+ for ( size_t i = 0; i < _extents.size(); i++ ) {
+ if ( _extents[i].data )
+ free( _extents[i].data );
+ }
+ }
+
+ Status DummyExtentManager::init(OperationContext* txn) {
+ return Status::OK();
+ }
+
+ int DummyExtentManager::numFiles() const {
+ return static_cast<int>( _extents.size() );
+ }
+
+ long long DummyExtentManager::fileSize() const {
+ invariant( false );
+ return -1;
+ }
+
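+    // The dummy manager hands out one malloc'd block per extent: a DiskLoc's file
+    // number indexes _extents and its offset addresses bytes within that block.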
+ DiskLoc DummyExtentManager::allocateExtent( OperationContext* txn,
+ bool capped,
+ int size,
+ bool enforceQuota ) {
+ size = quantizeExtentSize( size );
+
+ ExtentInfo info;
+ info.data = static_cast<char*>( malloc( size ) );
+ info.length = size;
+
+ DiskLoc loc( _extents.size(), 0 );
+ _extents.push_back( info );
+
+ Extent* e = getExtent( loc, false );
+ e->magic = Extent::extentSignature;
+ e->myLoc = loc;
+ e->xnext.Null();
+ e->xprev.Null();
+ e->length = size;
+ e->firstRecord.Null();
+ e->lastRecord.Null();
+
+ return loc;
+
+ }
+
+ void DummyExtentManager::freeExtents( OperationContext* txn,
+ DiskLoc firstExt, DiskLoc lastExt ) {
+ // XXX
+ }
+
+ void DummyExtentManager::freeExtent( OperationContext* txn, DiskLoc extent ) {
+ // XXX
+ }
+ void DummyExtentManager::freeListStats( int* numExtents, int64_t* totalFreeSize ) const {
+ invariant( false );
+ }
+
+ Record* DummyExtentManager::recordForV1( const DiskLoc& loc ) const {
+ invariant( static_cast<size_t>( loc.a() ) < _extents.size() );
+ invariant( static_cast<size_t>( loc.getOfs() ) < _extents[loc.a()].length );
+ char* root = _extents[loc.a()].data;
+ return reinterpret_cast<Record*>( root + loc.getOfs() );
+ }
+
+ Extent* DummyExtentManager::extentForV1( const DiskLoc& loc ) const {
+ invariant( false );
+ }
+
+ DiskLoc DummyExtentManager::extentLocForV1( const DiskLoc& loc ) const {
+ return DiskLoc( loc.a(), 0 );
+ }
+
+ Extent* DummyExtentManager::getExtent( const DiskLoc& loc, bool doSanityCheck ) const {
+ invariant( !loc.isNull() );
+ invariant( static_cast<size_t>( loc.a() ) < _extents.size() );
+ invariant( loc.getOfs() == 0 );
+ Extent* ext = reinterpret_cast<Extent*>( _extents[loc.a()].data );
+ if (doSanityCheck)
+ ext->assertOk();
+ return ext;
+ }
+
+ int DummyExtentManager::maxSize() const {
+ return 1024 * 1024 * 64;
+ }
+
+ DummyExtentManager::CacheHint* DummyExtentManager::cacheHint( const DiskLoc& extentLoc, const HintType& hint ) {
+ return new CacheHint();
+ }
+
+namespace {
+ void accumulateExtentSizeRequirements(const LocAndSize* las, std::map<int, size_t>* sizes) {
+ if (!las)
+ return;
+
+ while (!las->loc.isNull()) {
+            // We require passed-in offsets to be > 1000 to leave room for Extent headers.
+ invariant(Extent::HeaderSize() < 1000);
+ invariant(las->loc.getOfs() >= 1000);
+
+ const size_t end = las->loc.getOfs() + las->size;
+ size_t& sizeNeeded = (*sizes)[las->loc.a()];
+ sizeNeeded = std::max(sizeNeeded, end);
+ las++;
+ }
+ }
+
+ void printRecList(OperationContext* txn,
+ const ExtentManager* em,
+ const RecordStoreV1MetaData* md) {
+ log() << " *** BEGIN ACTUAL RECORD LIST *** ";
+ DiskLoc extLoc = md->firstExtent(txn);
+ std::set<DiskLoc> seenLocs;
+ while (!extLoc.isNull()) {
+ Extent* ext = em->getExtent(extLoc, true);
+ DiskLoc actualLoc = ext->firstRecord;
+ while (!actualLoc.isNull()) {
+ const Record* actualRec = em->recordForV1(actualLoc);
+ const int actualSize = actualRec->lengthWithHeaders();
+
+ log() << "loc: " << actualLoc // <--hex
+ << " (" << actualLoc.getOfs() << ")"
+ << " size: " << actualSize
+ << " prev: " << actualRec->prevOfs()
+ << " next: " << actualRec->nextOfs()
+ << (actualLoc == md->capFirstNewRecord() ? " (CAP_FIRST_NEW)" : "")
+ ;
+
+ const bool foundCycle = !seenLocs.insert(actualLoc).second;
+ invariant(!foundCycle);
+
+ const int nextOfs = actualRec->nextOfs();
+ actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc()
+ : DiskLoc(actualLoc.a(), nextOfs));
+ }
+ extLoc = ext->xnext;
+ }
+ log() << " *** END ACTUAL RECORD LIST *** ";
+ }
+
+ void printDRecList(const ExtentManager* em, const RecordStoreV1MetaData* md) {
+ log() << " *** BEGIN ACTUAL DELETED RECORD LIST *** ";
+ std::set<DiskLoc> seenLocs;
+ for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
+ DiskLoc actualLoc = md->deletedListEntry(bucketIdx);
+ while (!actualLoc.isNull()) {
+ const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
+ const int actualSize = actualDrec->lengthWithHeaders();
+
+ log() << "loc: " << actualLoc // <--hex
+ << " (" << actualLoc.getOfs() << ")"
+ << " size: " << actualSize
+ << " bucket: " << bucketIdx
+ << " next: " << actualDrec->nextDeleted();
+
+ const bool foundCycle = !seenLocs.insert(actualLoc).second;
+ invariant(!foundCycle);
+
+ actualLoc = actualDrec->nextDeleted();
+ }
+
+ // Only print bucket 0 in capped collections since it contains all deleted records
+ if (md->isCapped())
+ break;
+ }
+ log() << " *** END ACTUAL DELETED RECORD LIST *** ";
+ }
+}
+
+ void initializeV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ DummyExtentManager* em,
+ DummyRecordStoreV1MetaData* md) {
+ invariant(records || drecs); // if both are NULL nothing is being created...
+
+ // Need to start with a blank slate
+ invariant(em->numFiles() == 0);
+ invariant(md->firstExtent(txn).isNull());
+
+ // pre-allocate extents (even extents that aren't part of this RS)
+ {
+ typedef std::map<int, size_t> ExtentSizes;
+ ExtentSizes extentSizes;
+ accumulateExtentSizeRequirements(records, &extentSizes);
+ accumulateExtentSizeRequirements(drecs, &extentSizes);
+ invariant(!extentSizes.empty());
+
+ const int maxExtent = extentSizes.rbegin()->first;
+ for (int i = 0; i <= maxExtent; i++) {
+ const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
+            const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, false);
+
+ // This function and assertState depend on these details of DummyExtentManager
+ invariant(loc.a() == i);
+ invariant(loc.getOfs() == 0);
+ }
+
+ // link together extents that should be part of this RS
+ md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0));
+ md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0));
+ for (ExtentSizes::iterator it = extentSizes.begin();
+ boost::next(it) != extentSizes.end(); /* ++it */ ) {
+ const int a = it->first;
+ ++it;
+ const int b = it->first;
+ em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
+ em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
+ }
+
+ // This signals "done allocating new extents".
+ if (md->isCapped())
+ md->setDeletedListEntry(txn, 1, DiskLoc());
+ }
+
+ if (records && !records[0].loc.isNull()) {
+ int recIdx = 0;
+ DiskLoc extLoc = md->firstExtent(txn);
+ while (!extLoc.isNull()) {
+ Extent* ext = em->getExtent(extLoc);
+ int prevOfs = DiskLoc::NullOfs;
+ while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
+ const DiskLoc loc = records[recIdx].loc;
+                    const int size = records[recIdx].size;
+ invariant(size >= Record::HeaderSize);
+
+ md->incrementStats(txn, size - Record::HeaderSize, 1);
+
+ if (ext->firstRecord.isNull())
+ ext->firstRecord = loc;
+
+ Record* rec = em->recordForV1(loc);
+ rec->lengthWithHeaders() = size;
+ rec->extentOfs() = 0;
+
+ rec->prevOfs() = prevOfs;
+ prevOfs = loc.getOfs();
+
+ const DiskLoc nextLoc = records[recIdx + 1].loc;
+ if (nextLoc.a() == loc.a()) { // if next is in same extent
+ rec->nextOfs() = nextLoc.getOfs();
+ }
+ else {
+ rec->nextOfs() = DiskLoc::NullOfs;
+ ext->lastRecord = loc;
+ }
+
+ recIdx++;
+ }
+ extLoc = ext->xnext;
+ }
+ invariant(records[recIdx].loc.isNull());
+ }
+
+ if (drecs && !drecs[0].loc.isNull()) {
+ int drecIdx = 0;
+ DiskLoc* prevNextPtr = NULL;
+ int lastBucket = -1;
+ while (!drecs[drecIdx].loc.isNull()) {
+ const DiskLoc loc = drecs[drecIdx].loc;
+ const int size = drecs[drecIdx].size;
+ invariant(size >= Record::HeaderSize);
+ const int bucket = RecordStoreV1Base::bucket(size);
+
+ if (md->isCapped()) {
+ // All drecs form a single list in bucket 0
+ if (prevNextPtr == NULL) {
+ md->setDeletedListEntry(txn, 0, loc);
+ }
+ else {
+ *prevNextPtr = loc;
+ }
+
+ if (loc.a() < md->capExtent().a()
+ && drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
+ // Bucket 1 is known as cappedLastDelRecLastExtent
+ md->setDeletedListEntry(txn, 1, loc);
+ }
+ }
+ else if (bucket != lastBucket) {
+ invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
+ md->setDeletedListEntry(txn, bucket, loc);
+ lastBucket = bucket;
+ }
+ else {
+ *prevNextPtr = loc;
+ }
+
+ DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
+ drec->lengthWithHeaders() = size;
+ drec->extentOfs() = 0;
+ drec->nextDeleted() = DiskLoc();
+ prevNextPtr = &drec->nextDeleted();
+
+ drecIdx++;
+ }
+ }
+
+ // Make sure we set everything up as requested.
+ assertStateV1RS(txn, records, drecs, em, md);
+ }
+
+ void assertStateV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ const ExtentManager* em,
+ const DummyRecordStoreV1MetaData* md) {
+ invariant(records || drecs); // if both are NULL nothing is being asserted...
+
+ try {
+ if (records) {
+ long long dataSize = 0;
+ long long numRecs = 0;
+
+ int recIdx = 0;
+
+ DiskLoc extLoc = md->firstExtent(txn);
+ while (!extLoc.isNull()) { // for each Extent
+ Extent* ext = em->getExtent(extLoc, true);
+ int expectedPrevOfs = DiskLoc::NullOfs;
+ DiskLoc actualLoc = ext->firstRecord;
+ while (!actualLoc.isNull()) { // for each Record in this Extent
+ const Record* actualRec = em->recordForV1(actualLoc);
+ const int actualSize = actualRec->lengthWithHeaders();
+
+ dataSize += actualSize - Record::HeaderSize;
+ numRecs += 1;
+
+ ASSERT_EQUALS(actualLoc, records[recIdx].loc);
+ ASSERT_EQUALS(actualSize, records[recIdx].size);
+
+ ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs());
+ ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs);
+ expectedPrevOfs = actualLoc.getOfs();
+
+ recIdx++;
+ const int nextOfs = actualRec->nextOfs();
+ actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc()
+ : DiskLoc(actualLoc.a(), nextOfs));
+ }
+
+ if (ext->xnext.isNull()) {
+ ASSERT_EQUALS(md->lastExtent(txn), extLoc);
+ }
+
+ extLoc = ext->xnext;
+ }
+
+ // both the expected and actual record lists must be done at this point
+ ASSERT_EQUALS(records[recIdx].loc, DiskLoc());
+
+ ASSERT_EQUALS(dataSize, md->dataSize());
+ ASSERT_EQUALS(numRecs, md->numRecords());
+ }
+
+ if (drecs) {
+ int drecIdx = 0;
+ for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
+ DiskLoc actualLoc = md->deletedListEntry(bucketIdx);
+
+ if (md->isCapped() && bucketIdx == 1) {
+ // In capped collections, the 2nd bucket (index 1) points to the drec before
+ // the first drec in the capExtent. If the capExtent is the first Extent,
+ // it should be Null.
+
+ if (md->capExtent() == md->firstExtent(txn)) {
+ ASSERT_EQUALS(actualLoc, DiskLoc());
+ }
+ else {
+ ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a());
+ const DeletedRecord* actualDrec =
+ &em->recordForV1(actualLoc)->asDeleted();
+ ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a());
+ }
+
+                        // Skip the normal checking of bucket 1 in capped collections, but keep
+                        // checking the other buckets to verify that they are Null.
+                        continue;
+ }
+
+ while (!actualLoc.isNull()) {
+ const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
+ const int actualSize = actualDrec->lengthWithHeaders();
+
+ ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc);
+ ASSERT_EQUALS(actualSize, drecs[drecIdx].size);
+
+ // Make sure the drec is correct
+ ASSERT_EQUALS(actualDrec->extentOfs(), 0);
+
+ // in capped collections all drecs are linked into a single list in bucket 0
+ ASSERT_EQUALS(bucketIdx, md->isCapped()
+ ? 0
+ : RecordStoreV1Base::bucket(actualSize));
+
+ drecIdx++;
+ actualLoc = actualDrec->nextDeleted();
+ }
+ }
+ // both the expected and actual deleted lists must be done at this point
+ ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc());
+ }
+ }
+ catch (...) {
+ // If a test fails, provide extra info to make debugging easier
+ printRecList(txn, em, md);
+ printDRecList(em, md);
+ throw;
+ }
+ }
+}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h
new file mode 100644
index 00000000000..87ddc078b6d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h
@@ -0,0 +1,198 @@
+// record_store_v1_test_help.h
+
+/**
+* Copyright (C) 2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include <vector>
+
+#include "mongo/db/storage/mmap_v1/extent_manager.h"
+#include "mongo/db/storage/mmap_v1/record_store_v1_base.h"
+
+namespace mongo {
+
+ class DummyRecordStoreV1MetaData : public RecordStoreV1MetaData {
+ public:
+ DummyRecordStoreV1MetaData( bool capped, int userFlags );
+ virtual ~DummyRecordStoreV1MetaData(){}
+
+ virtual const DiskLoc& capExtent() const;
+ virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc );
+
+ virtual const DiskLoc& capFirstNewRecord() const;
+ virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc );
+
+ virtual long long dataSize() const;
+ virtual long long numRecords() const;
+
+ virtual void incrementStats( OperationContext* txn,
+ long long dataSizeIncrement,
+ long long numRecordsIncrement );
+
+        virtual void setStats( OperationContext* txn,
+                               long long dataSize,
+                               long long numRecords );
+
+ virtual const DiskLoc& deletedListEntry( int bucket ) const;
+ virtual void setDeletedListEntry( OperationContext* txn,
+ int bucket,
+ const DiskLoc& loc );
+ virtual void orphanDeletedList(OperationContext* txn);
+
+ virtual const DiskLoc& firstExtent( OperationContext* txn ) const;
+ virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc );
+
+ virtual const DiskLoc& lastExtent( OperationContext* txn ) const;
+ virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc );
+
+ virtual bool isCapped() const;
+
+ virtual bool isUserFlagSet( int flag ) const;
+ virtual int userFlags() const { return _userFlags; }
+ virtual bool setUserFlag( OperationContext* txn, int flag );
+ virtual bool clearUserFlag( OperationContext* txn, int flag );
+ virtual bool replaceUserFlags( OperationContext* txn, int flags );
+
+ virtual int lastExtentSize( OperationContext* txn ) const;
+ virtual void setLastExtentSize( OperationContext* txn, int newMax );
+
+ virtual long long maxCappedDocs() const;
+
+ virtual double paddingFactor() const;
+
+ virtual void setPaddingFactor( OperationContext* txn, double paddingFactor );
+
+ protected:
+
+ DiskLoc _capExtent;
+ DiskLoc _capFirstNewRecord;
+
+ long long _dataSize;
+ long long _numRecords;
+
+ DiskLoc _firstExtent;
+ DiskLoc _lastExtent;
+
+ bool _capped;
+ int _userFlags;
+ long long _maxCappedDocs;
+
+ int _lastExtentSize;
+ double _paddingFactor;
+
+ std::vector<DiskLoc> _deletedLists;
+ };
+
+ class DummyExtentManager : public ExtentManager {
+ public:
+ virtual ~DummyExtentManager();
+
+ virtual Status init(OperationContext* txn);
+
+ virtual int numFiles() const;
+ virtual long long fileSize() const;
+
+ virtual DiskLoc allocateExtent( OperationContext* txn,
+ bool capped,
+ int size,
+ bool enforceQuota );
+
+ virtual void freeExtents( OperationContext* txn,
+ DiskLoc firstExt, DiskLoc lastExt );
+
+ virtual void freeExtent( OperationContext* txn, DiskLoc extent );
+
+ virtual void freeListStats( int* numExtents, int64_t* totalFreeSize ) const;
+
+ virtual Record* recordForV1( const DiskLoc& loc ) const;
+
+ virtual Extent* extentForV1( const DiskLoc& loc ) const;
+
+ virtual DiskLoc extentLocForV1( const DiskLoc& loc ) const;
+
+ virtual Extent* getExtent( const DiskLoc& loc, bool doSanityCheck = true ) const;
+
+ virtual int maxSize() const;
+
+ virtual CacheHint* cacheHint( const DiskLoc& extentLoc, const HintType& hint );
+
+ protected:
+ struct ExtentInfo {
+ char* data;
+ size_t length;
+ };
+
+ std::vector<ExtentInfo> _extents;
+ };
+
+ struct LocAndSize {
+ DiskLoc loc;
+ int size; // with headers
+ };
+
+ /**
+     * Creates a V1 RecordStore layout with the passed-in records and DeletedRecords (drecs).
+     *
+     * Lists of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer is shorthand
+     * for an empty list. Each extent gets its own DiskLoc file number. DiskLoc offsets must be
+     * > 1000.
+ *
+     * records must be sorted by extent/file; offsets within an extent can be in any order.
+ *
+ * In a simple RS, drecs must be grouped into size-buckets, but the ordering within the size
+ * buckets is up to you.
+ *
+ * In a capped collection, all drecs form a single list and must be grouped by extent, with each
+ * extent having at least one drec. capFirstNewRecord() and capExtent() *must* be correctly set
+ * on md before calling.
+ *
+ * You are responsible for ensuring the records and drecs don't overlap.
+ *
+ * ExtentManager and MetaData must both be empty.
+ */
+ void initializeV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ DummyExtentManager* em,
+ DummyRecordStoreV1MetaData* md);
+
+ /**
+     * Asserts that the V1 RecordStore defined by md has the passed-in records and drecs in the
+     * correct order.
+     *
+     * Lists of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer means that
+     * list is not checked.
+ */
+ void assertStateV1RS(OperationContext* txn,
+ const LocAndSize* records,
+ const LocAndSize* drecs,
+ const ExtentManager* em,
+ const DummyRecordStoreV1MetaData* md);
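+
+    /**
+     * Illustrative usage sketch (not part of the API): 'txn', 'em' and 'md' are assumed to be
+     * a live OperationContext, DummyExtentManager and DummyRecordStoreV1MetaData set up by the
+     * caller. Sizes include headers and offsets are > 1000, as required above.
+     *
+     *     LocAndSize records[] = {
+     *         {DiskLoc(0, 1000), 100},
+     *         {DiskLoc(0, 1100), 100},
+     *         {}  // terminator
+     *     };
+     *     LocAndSize drecs[] = {
+     *         {DiskLoc(0, 1200), 100},
+     *         {}  // terminator
+     *     };
+     *     initializeV1RS(txn, records, drecs, em, md);
+     *     // ... exercise the record store under test ...
+     *     assertStateV1RS(txn, records, drecs, em, md);
+     */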
+
+} // namespace mongo
diff --git a/src/mongo/db/storage/record_store.h b/src/mongo/db/storage/record_store.h
new file mode 100644
index 00000000000..8437046e5d6
--- /dev/null
+++ b/src/mongo/db/storage/record_store.h
@@ -0,0 +1,291 @@
+// record_store.h
+
+/**
+* Copyright (C) 2013 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*
+* As a special exception, the copyright holders give permission to link the
+* code of portions of this program with the OpenSSL library under certain
+* conditions as described in each individual source file and distribute
+* linked combinations including the program with the OpenSSL library. You
+* must comply with the GNU Affero General Public License in all respects for
+* all of the code used other than as permitted herein. If you modify file(s)
+* with this exception, you may extend this exception to your version of the
+* file(s), but you are not obligated to do so. If you do not wish to do so,
+* delete this exception statement from your version. If you delete this
+* exception statement from all source files in the program, then also delete
+* it in the license file.
+*/
+
+#pragma once
+
+#include "mongo/base/owned_pointer_vector.h"
+#include "mongo/bson/mutable/damage_vector.h"
+#include "mongo/db/diskloc.h"
+#include "mongo/db/exec/collection_scan_common.h"
+#include "mongo/db/storage/record_data.h"
+
+namespace mongo {
+
+ class CappedDocumentDeleteCallback;
+ class Collection;
+ struct CompactOptions;
+ struct CompactStats;
+ class DocWriter;
+ class MAdvise;
+ class NamespaceDetails;
+ class OperationContext;
+ class Record;
+
+ class RecordStoreCompactAdaptor;
+ class RecordStore;
+
+ struct ValidateResults;
+ class ValidateAdaptor;
+
+ /**
+ * Allows inserting a Record "in-place" without creating a copy ahead of time.
+ */
+ class DocWriter {
+ public:
+ virtual ~DocWriter() {}
+ virtual void writeDocument( char* buf ) const = 0;
+ virtual size_t documentSize() const = 0;
+ virtual bool addPadding() const { return true; }
+ };
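+
+    /**
+     * A minimal sketch of a DocWriter (hypothetical helper, not part of this interface) that
+     * copies a pre-built buffer:
+     *
+     *     class BufDocWriter : public DocWriter {
+     *     public:
+     *         BufDocWriter( const char* buf, size_t size ) : _buf( buf ), _size( size ) {}
+     *         virtual void writeDocument( char* buf ) const { memcpy( buf, _buf, _size ); }
+     *         virtual size_t documentSize() const { return _size; }
+     *     private:
+     *         const char* _buf;
+     *         size_t _size;
+     *     };
+     */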
+
+ /**
+ * @see RecordStore::updateRecord
+ */
+ class UpdateMoveNotifier {
+ public:
+ virtual ~UpdateMoveNotifier(){}
+ virtual Status recordStoreGoingToMove( OperationContext* txn,
+ const DiskLoc& oldLocation,
+ const char* oldBuffer,
+ size_t oldSize ) = 0;
+ };
+
+ /**
+ * A RecordIterator provides an interface for walking over a RecordStore.
+ * The details of navigating the collection's structure are below this interface.
+ */
+ class RecordIterator {
+ public:
+ virtual ~RecordIterator() { }
+
+ // True if getNext will produce no more data, false otherwise.
+ virtual bool isEOF() = 0;
+
+ // Return the DiskLoc that the iterator points at. Returns DiskLoc() if isEOF.
+ virtual DiskLoc curr() = 0;
+
+ // Return the DiskLoc that the iterator points at and move the iterator to the next item
+ // from the collection. Returns DiskLoc() if isEOF.
+ virtual DiskLoc getNext() = 0;
+
+ // Can only be called after prepareToYield and before recoverFromYield.
+ virtual void invalidate(const DiskLoc& dl) = 0;
+
+ // Save any state required to resume operation (without crashing) after DiskLoc deletion or
+ // a collection drop.
+ virtual void prepareToYield() = 0;
+
+ // Returns true if collection still exists, false otherwise.
+ virtual bool recoverFromYield() = 0;
+
+        // Normally this will just go back to the RecordStore and convert,
+        // but this gives the iterator an opportunity to optimize.
+ virtual RecordData dataFor( const DiskLoc& loc ) const = 0;
+ };
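+
+    /**
+     * Canonical iteration over a RecordStore 'rs' (illustrative sketch):
+     *
+     *     boost::scoped_ptr<RecordIterator> it( rs->getIterator( txn ) );
+     *     while ( !it->isEOF() ) {
+     *         DiskLoc loc = it->getNext();
+     *         RecordData data = it->dataFor( loc );
+     *         // ... use data ...
+     *     }
+     */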
+
+
+ class RecordStore {
+ MONGO_DISALLOW_COPYING(RecordStore);
+ public:
+ RecordStore( const StringData& ns ) : _ns(ns.toString()) { }
+
+ virtual ~RecordStore() { }
+
+ // META
+
+ // name of the RecordStore implementation
+ virtual const char* name() const = 0;
+
+ virtual long long dataSize() const = 0;
+
+ virtual long long numRecords() const = 0;
+
+ virtual bool isCapped() const = 0;
+
+ virtual void setCappedDeleteCallback(CappedDocumentDeleteCallback*) {invariant( false );}
+
+ /**
+         * @param extraInfo - optional, more debug info
+         * @param infoLevel - optional, level of debug info to put in (higher is more)
+ */
+ virtual int64_t storageSize( OperationContext* txn,
+ BSONObjBuilder* extraInfo = NULL,
+ int infoLevel = 0 ) const = 0;
+
+ // CRUD related
+
+ virtual RecordData dataFor( const DiskLoc& loc) const = 0;
+
+ virtual void deleteRecord( OperationContext* txn, const DiskLoc& dl ) = 0;
+
+ virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn,
+ const char* data,
+ int len,
+ bool enforceQuota ) = 0;
+
+ virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn,
+ const DocWriter* doc,
+ bool enforceQuota ) = 0;
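+
+        /**
+         * Illustrative sketch: inserting a caller-owned BSONObj 'obj':
+         *
+         *     StatusWith<DiskLoc> res = rs->insertRecord( txn, obj.objdata(), obj.objsize(),
+         *                                                 false );
+         *     if ( !res.isOK() )
+         *         return res.getStatus();
+         *     DiskLoc loc = res.getValue();
+         */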
+
+ /**
+         * @param notifier - called if the document is moved. It is invoked after the document
+         *                   has been written to the new location, and before it is deleted
+         *                   from the old one.
+         * @return Status or DiskLoc; the returned DiskLoc may differ from oldLocation.
+ */
+ virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn,
+ const DiskLoc& oldLocation,
+ const char* data,
+ int len,
+ bool enforceQuota,
+ UpdateMoveNotifier* notifier ) = 0;
+
+ virtual Status updateWithDamages( OperationContext* txn,
+ const DiskLoc& loc,
+                                          const char* damageSource,
+ const mutablebson::DamageVector& damages ) = 0;
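+
+        /**
+         * Sketch of an in-place update, assuming mutablebson::DamageEvent exposes
+         * sourceOffset, targetOffset and size fields:
+         *
+         *     mutablebson::DamageVector damages;
+         *     mutablebson::DamageEvent event;
+         *     event.sourceOffset = 0;   // offset into damageSource
+         *     event.targetOffset = 16;  // offset into the stored record
+         *     event.size = 4;           // bytes to copy
+         *     damages.push_back( event );
+         *     rs->updateWithDamages( txn, loc, damageSource, damages );
+         */
+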
+ /**
+         * The returned iterator is owned by the caller. The canonical way to iterate
+         * everything is getIterator( txn, DiskLoc(), false, CollectionScanParams::FORWARD ).
+ */
+ virtual RecordIterator* getIterator( OperationContext* txn,
+ const DiskLoc& start = DiskLoc(),
+ bool tailable = false,
+ const CollectionScanParams::Direction& dir =
+ CollectionScanParams::FORWARD
+ ) const = 0;
+
+ /**
+ * Constructs an iterator over a potentially corrupted store, which can be used to salvage
+ * damaged records. The iterator might return every record in the store if all of them
+ * are reachable and not corrupted.
+ */
+ virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const = 0;
+
+ /**
+ * Returns many iterators that partition the RecordStore into many disjoint sets. Iterating
+         * all returned iterators is equivalent to iterating the full store.
+ */
+ virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const = 0;
+
+ // higher level
+
+
+ /**
+ * removes all Records
+ */
+ virtual Status truncate( OperationContext* txn ) = 0;
+
+ /**
+ * Truncate documents newer than the document at 'end' from the capped
+ * collection. The collection cannot be completely emptied using this
+ * function. An assertion will be thrown if that is attempted.
+ * @param inclusive - Truncate 'end' as well iff true
+ * XXX: this will go away soon, just needed to move for now
+ */
+ virtual void temp_cappedTruncateAfter(OperationContext* txn,
+ DiskLoc end,
+ bool inclusive) = 0;
+
+ // does this RecordStore support the compact operation
+ virtual bool compactSupported() const = 0;
+ virtual Status compact( OperationContext* txn,
+ RecordStoreCompactAdaptor* adaptor,
+ const CompactOptions* options,
+ CompactStats* stats ) = 0;
+
+ /**
+ * @param full - does more checks
+ * @param scanData - scans each document
+         * @return OK if the validate ran successfully.
+         *         OK will be returned even if corruption is found;
+         *         details will be in 'results'.
+ */
+ virtual Status validate( OperationContext* txn,
+ bool full, bool scanData,
+ ValidateAdaptor* adaptor,
+ ValidateResults* results, BSONObjBuilder* output ) const = 0;
+
+ /**
+         * Appends any custom stats from the RecordStore, as well as any other unique stats.
+         * @param scale - amount by which to scale size metrics
+ */
+ virtual void appendCustomStats( OperationContext* txn,
+ BSONObjBuilder* result,
+ double scale ) const = 0;
+
+ /**
+ * Load all data into cache.
+         * Which cache is used depends on the implementation.
+ * @param output (optional) - where to put detailed stats
+ */
+ virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const = 0;
+
+ /**
+         * @return Status::OK() if the option was handled.
+         *         InvalidOptions if the option is not supported.
+         *         Other errors indicate the option is supported, but there was an error
+         *         setting it.
+ */
+ virtual Status setCustomOption( OperationContext* txn,
+ const BSONElement& option,
+ BSONObjBuilder* info = NULL ) = 0;
+ protected:
+ std::string _ns;
+ };
+
+ class RecordStoreCompactAdaptor {
+ public:
+ virtual ~RecordStoreCompactAdaptor(){}
+ virtual bool isDataValid( const RecordData& recData ) = 0;
+ virtual size_t dataSize( const RecordData& recData ) = 0;
+ virtual void inserted( const RecordData& recData, const DiskLoc& newLocation ) = 0;
+ };
+
+ struct ValidateResults {
+ ValidateResults() {
+ valid = true;
+ }
+ bool valid;
+ std::vector<std::string> errors;
+ };
+
+ /**
+ * This is so when a RecordStore is validating all records
+ * it can call back to someone to check if a record is valid.
+ * The actual data contained in a Record is totally opaque to the implementation.
+ */
+ class ValidateAdaptor {
+ public:
+ virtual ~ValidateAdaptor(){}
+
+ virtual Status validate( const RecordData& recordData, size_t* dataSize ) = 0;
+ };
+}
diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl.cpp b/src/mongo/db/storage/rocks/rocks_btree_impl.cpp
index 00cbbf1c580..8bd3f2734cf 100644
--- a/src/mongo/db/storage/rocks/rocks_btree_impl.cpp
+++ b/src/mongo/db/storage/rocks/rocks_btree_impl.cpp
@@ -87,7 +87,7 @@ namespace mongo {
rocksdb::Slice sliced[2];
};
- class RocksCursor : public BtreeInterface::Cursor {
+ class RocksCursor : public SortedDataInterface::Cursor {
public:
RocksCursor( rocksdb::Iterator* iterator, bool direction )
: _iterator( iterator ), _direction( direction ), _cached( false ) {
@@ -285,8 +285,8 @@ namespace mongo {
return Status::OK();
}
- BtreeInterface::Cursor* RocksBtreeImpl::newCursor(OperationContext* txn,
- int direction) const {
+ SortedDataInterface::Cursor* RocksBtreeImpl::newCursor(OperationContext* txn,
+ int direction) const {
return new RocksCursor( _db->NewIterator( rocksdb::ReadOptions(),
_columnFamily ),
txn,
diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl.h b/src/mongo/db/storage/rocks/rocks_btree_impl.h
index 2a15e46aad5..4e75ad50e11 100644
--- a/src/mongo/db/storage/rocks/rocks_btree_impl.h
+++ b/src/mongo/db/storage/rocks/rocks_btree_impl.h
@@ -28,7 +28,7 @@
* it in the license file.
*/
-#include "mongo/db/structure/btree/btree_interface.h"
+#include "mongo/db/storage/sorted_data_interface.h"
#pragma once
@@ -47,7 +47,7 @@ namespace mongo {
virtual unsigned long long commit(bool mayInterrupt) = 0;
};
- class RocksBtreeImpl : public BtreeInterface {
+ class RocksBtreeImpl : public SortedDataInterface {
public:
RocksBtreeImpl( rocksdb::DB* db,
rocksdb::ColumnFamilyHandle* cf );
diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp b/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp
index f7102163352..e080fb08faf 100644
--- a/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp
+++ b/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp
@@ -126,7 +126,7 @@ namespace mongo {
DiskLoc loc( 5, 16 );
{
- scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) );
+ scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) );
ASSERT( !cursor->locate( key, loc ) );
}
@@ -140,7 +140,7 @@ namespace mongo {
}
{
- scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) );
+ scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) );
ASSERT( cursor->locate( key, loc ) );
ASSERT_EQUALS( key, cursor->getKey() );
ASSERT_EQUALS( loc, cursor->getDiskLoc() );
@@ -166,7 +166,7 @@ namespace mongo {
}
{
- scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) );
+ scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) );
ASSERT( cursor->locate( BSON( "a" << 2 ), DiskLoc(0,0) ) );
ASSERT( !cursor->isEOF() );
ASSERT_EQUALS( BSON( "" << 2 ), cursor->getKey() );
diff --git a/src/mongo/db/storage/sorted_data_interface.h b/src/mongo/db/storage/sorted_data_interface.h
new file mode 100644
index 00000000000..52f20a6288d
--- /dev/null
+++ b/src/mongo/db/storage/sorted_data_interface.h
@@ -0,0 +1,200 @@
+/**
+ * Copyright (C) 2014 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/bson/ordering.h"
+#include "mongo/db/catalog/head_manager.h"
+#include "mongo/db/diskloc.h"
+#include "mongo/db/jsobj.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/storage/record_store.h"
+
+namespace mongo {
+
+ class BucketDeletionNotification;
+ class SortedDataBuilderInterface;
+
+ /**
+ * This interface is a work in progress. Notes below:
+ *
+     * This interface began as the BtreeInterface, a way to hide the fact that there were two
+ * on-disk formats for the btree. With the introduction of other storage engines, this
+ * interface was generalized to provide access to sorted data. Specifically:
+ *
+ * 1. Many other storage engines provide different Btree(-ish) implementations. This interface
+     * could allow those implementations to avoid storing btree buckets in an already sorted structure.
+ *
+ * TODO: See if there is actually a performance gain.
+ *
+ * 2. The existing btree implementation is written to assume that if it modifies a record it is
+ * modifying the underlying record. This interface is an attempt to work around that.
+ *
+ * TODO: See if this actually works.
+ */
+ class SortedDataInterface {
+ public:
+ virtual ~SortedDataInterface() { }
+
+ //
+ // Data changes
+ //
+
+ /**
+ * Caller owns returned pointer.
+ * 'this' must outlive the returned pointer.
+ */
+ virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn,
+ bool dupsAllowed) = 0;
+
+ virtual Status insert(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc,
+ bool dupsAllowed) = 0;
+
+ virtual bool unindex(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) = 0;
+
+ // TODO: Hide this by exposing an update method?
+ virtual Status dupKeyCheck(OperationContext* txn,
+ const BSONObj& key,
+ const DiskLoc& loc) = 0;
+
+ //
+ // Information about the tree
+ //
+
+ // TODO: expose full set of args for testing?
+ virtual void fullValidate(OperationContext* txn, long long* numKeysOut) = 0;
+
+ virtual bool isEmpty() = 0;
+
+ /**
+ * Attempt to bring whole index into memory. No-op is ok if not supported.
+ */
+ virtual Status touch(OperationContext* txn) const = 0;
+
+ //
+ // Navigation
+ //
+
+ class Cursor {
+ public:
+ virtual ~Cursor() {}
+
+ virtual int getDirection() const = 0;
+
+ virtual bool isEOF() const = 0;
+
+ /**
+             * Will only be called with 'other' from the same index as this.
+             * All EOF locations should be considered equal.
+ */
+ virtual bool pointsToSamePlaceAs(const Cursor& other) const = 0;
+
+ /**
+             * If the SortedDataInterface impl calls the BucketDeletionNotification, the argument
+             * must be forwarded to all Cursors over that SortedData.
+ * TODO something better.
+ */
+ virtual void aboutToDeleteBucket(const DiskLoc& bucket) = 0;
+
+ virtual bool locate(const BSONObj& key, const DiskLoc& loc) = 0;
+
+ virtual void advanceTo(const BSONObj &keyBegin,
+ int keyBeginLen,
+ bool afterKey,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive) = 0;
+
+ /**
+ * Locate a key with fields comprised of a combination of keyBegin fields and keyEnd
+ * fields.
+ */
+ virtual void customLocate(const BSONObj& keyBegin,
+ int keyBeginLen,
+ bool afterVersion,
+ const vector<const BSONElement*>& keyEnd,
+ const vector<bool>& keyEndInclusive) = 0;
+
+ /**
+             * Return the key at the current position of the cursor.
+ */
+ virtual BSONObj getKey() const = 0;
+
+ virtual DiskLoc getDiskLoc() const = 0;
+
+ virtual void advance() = 0;
+
+ //
+ // Saving and restoring state
+ //
+ virtual void savePosition() = 0;
+
+ virtual void restorePosition() = 0;
+ };
+
+ /**
+ * Caller takes ownership. SortedDataInterface must outlive all Cursors it produces.
+ */
+ virtual Cursor* newCursor(OperationContext* txn, int direction) const = 0;
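+
+        /**
+         * Typical forward scan from a key (illustrative sketch):
+         *
+         *     boost::scoped_ptr<SortedDataInterface::Cursor> cursor( sorted->newCursor( txn, 1 ) );
+         *     cursor->locate( key, DiskLoc(0, 0) );
+         *     while ( !cursor->isEOF() ) {
+         *         BSONObj k = cursor->getKey();
+         *         DiskLoc loc = cursor->getDiskLoc();
+         *         cursor->advance();
+         *     }
+         */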
+
+ //
+ // Index creation
+ //
+
+ virtual Status initAsEmpty(OperationContext* txn) = 0;
+ };
+
+ /**
+ * A version-hiding wrapper around the bulk builder for the Btree.
+ */
+ class SortedDataBuilderInterface {
+ public:
+ virtual ~SortedDataBuilderInterface() { }
+
+ /**
+ * Adds 'key' to intermediate storage.
+ *
+ * 'key' must be > or >= the last key passed to this function (depends on _dupsAllowed). If
+ * this is violated an error Status (ErrorCodes::InternalError) will be returned.
+ */
+ virtual Status addKey(const BSONObj& key, const DiskLoc& loc) = 0;
+
+ /**
+         * Commits the work. If not called, the destructor will clean up partially completed
+         * work (in case an exception has happened).
+         *
+         * Returns the number of keys added.
+ */
+ virtual unsigned long long commit(bool mayInterrupt) = 0;
+ };
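+
+    /**
+     * Typical bulk-build sequence (illustrative sketch; keys must be added in order):
+     *
+     *     boost::scoped_ptr<SortedDataBuilderInterface> builder(
+     *         sorted->getBulkBuilder( txn, true ) );
+     *     builder->addKey( BSON( "" << 1 ), DiskLoc( 5, 1000 ) );
+     *     builder->addKey( BSON( "" << 2 ), DiskLoc( 5, 2000 ) );
+     *     builder->commit( false );
+     */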
+
+} // namespace mongo