diff options
Diffstat (limited to 'src/mongo/db/storage')
60 files changed, 16723 insertions, 67 deletions
diff --git a/src/mongo/db/storage/capped_callback.h b/src/mongo/db/storage/capped_callback.h new file mode 100644 index 00000000000..59c23f9dab9 --- /dev/null +++ b/src/mongo/db/storage/capped_callback.h @@ -0,0 +1,54 @@ +// record_store_v1_capped.h + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/db/diskloc.h" + +namespace mongo { + + class OperationContext; + + /** + * When a capped collection has to delete a document, it needs a way to tell the caller + * what its deleting so it can unindex or do any other cleanup. + * This is that way. 
+ */ + class CappedDocumentDeleteCallback { + public: + virtual ~CappedDocumentDeleteCallback(){} + + /** + * This will be called right before loc is deleted when wrapping. + */ + virtual Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) = 0; + }; + +} diff --git a/src/mongo/db/storage/heap1/SConscript b/src/mongo/db/storage/heap1/SConscript index 0b1a6db0383..feb9fcbb2d1 100644 --- a/src/mongo/db/storage/heap1/SConscript +++ b/src/mongo/db/storage/heap1/SConscript @@ -8,9 +8,20 @@ env.Library( 'heap1_engine.cpp', ], LIBDEPS= [ + 'heap_record_store', '$BUILD_DIR/mongo/bson', '$BUILD_DIR/mongo/db/catalog/collection_options', - '$BUILD_DIR/mongo/db/structure/record_store', + '$BUILD_DIR/mongo/foundation', + ] + ) + +env.Library( + target= 'heap_record_store', + source= [ + 'record_store_heap.cpp' + ], + LIBDEPS= [ + '$BUILD_DIR/mongo/bson', '$BUILD_DIR/mongo/foundation', ] ) diff --git a/src/mongo/db/storage/heap1/heap1_btree_impl.cpp b/src/mongo/db/storage/heap1/heap1_btree_impl.cpp index 2d5ae2fc63b..9a2ec04417a 100644 --- a/src/mongo/db/storage/heap1/heap1_btree_impl.cpp +++ b/src/mongo/db/storage/heap1/heap1_btree_impl.cpp @@ -200,7 +200,7 @@ namespace { return it->loc != loc; } - class Heap1BtreeBuilderImpl : public BtreeBuilderInterface { + class Heap1BtreeBuilderImpl : public SortedDataBuilderInterface { public: Heap1BtreeBuilderImpl(IndexSet* data, bool dupsAllowed) : _data(data), @@ -241,14 +241,14 @@ namespace { bool _committed; }; - class Heap1BtreeImpl : public BtreeInterface { + class Heap1BtreeImpl : public SortedDataInterface { public: Heap1BtreeImpl(const IndexCatalogEntry& info, IndexSet* data) : _info(info), _data(data) {} - virtual BtreeBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) { + virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) { return new Heap1BtreeBuilderImpl(_data, dupsAllowed); } @@ -300,7 +300,7 @@ namespace { return Status::OK(); } - class 
ForwardCursor : public BtreeInterface::Cursor { + class ForwardCursor : public SortedDataInterface::Cursor { public: ForwardCursor(const IndexSet& data, OperationContext* txn) : _txn(txn), @@ -314,7 +314,7 @@ namespace { return _it == _data.end(); } - virtual bool pointsToSamePlaceAs(const BtreeInterface::Cursor& otherBase) const { + virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const { const ForwardCursor& other = static_cast<const ForwardCursor&>(otherBase); invariant(&_data == &other._data); // iterators over same index return _it == other._it; @@ -399,7 +399,7 @@ namespace { }; // TODO see if this can share any code with ForwardIterator - class ReverseCursor : public BtreeInterface::Cursor { + class ReverseCursor : public SortedDataInterface::Cursor { public: ReverseCursor(const IndexSet& data, OperationContext* txn) : _txn(txn), @@ -413,7 +413,7 @@ namespace { return _it == _data.rend(); } - virtual bool pointsToSamePlaceAs(const BtreeInterface::Cursor& otherBase) const { + virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const { const ReverseCursor& other = static_cast<const ReverseCursor&>(otherBase); invariant(&_data == &other._data); // iterators over same index return _it == other._it; @@ -512,7 +512,7 @@ namespace { DiskLoc _savedLoc; }; - virtual BtreeInterface::Cursor* newCursor(OperationContext* txn, int direction) const { + virtual SortedDataInterface::Cursor* newCursor(OperationContext* txn, int direction) const { if (direction == 1) return new ForwardCursor(*_data, txn); @@ -533,7 +533,7 @@ namespace { // IndexCatalogEntry argument taken by non-const pointer for consistency with other Btree // factories. We don't actually modify it. 
- BtreeInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut) { + SortedDataInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut) { invariant(info); invariant(dataInOut); if (!*dataInOut) { diff --git a/src/mongo/db/storage/heap1/heap1_btree_impl.h b/src/mongo/db/storage/heap1/heap1_btree_impl.h index 72b38ce3696..7187dc589dc 100644 --- a/src/mongo/db/storage/heap1/heap1_btree_impl.h +++ b/src/mongo/db/storage/heap1/heap1_btree_impl.h @@ -28,17 +28,18 @@ #include <boost/shared_ptr.hpp> -#include "mongo/db/structure/btree/btree_interface.h" +#include "mongo/db/storage/sorted_data_interface.h" #pragma once namespace mongo { + class IndexCatalogEntry; /** * Caller takes ownership. * All permanent data will be stored and fetch from dataInOut. */ - BtreeInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut); + SortedDataInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut); } // namespace mongo diff --git a/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp b/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp index 53dea7f10c7..58e069d9863 100644 --- a/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp +++ b/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp @@ -1,32 +1,30 @@ -// heap1_database_catalog_entry.cpp - /** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. 
-* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ #include "mongo/db/storage/heap1/heap1_database_catalog_entry.h" @@ -43,7 +41,7 @@ #include "mongo/db/operation_context.h" #include "mongo/db/storage/heap1/heap1_btree_impl.h" #include "mongo/db/storage/heap1/heap1_recovery_unit.h" -#include "mongo/db/structure/record_store_heap.h" +#include "mongo/db/storage/heap1/record_store_heap.h" namespace mongo { @@ -159,14 +157,14 @@ namespace mongo { index->headManager()->setHead(txn, DiskLoc(0xDEAD, 0xBEAF)); // When is a btree not a Btree? When it is a Heap1BtreeImpl! - std::auto_ptr<BtreeInterface> btree(getHeap1BtreeImpl(index, &i->second->data)); + std::auto_ptr<SortedDataInterface> btree(getHeap1BtreeImpl(index, &i->second->data)); #else if (!i->second->rs) i->second->rs.reset(new HeapRecordStore( index->descriptor()->indexName() )); - std::auto_ptr<BtreeInterface> btree( - BtreeInterface::getInterface(index->headManager(), + std::auto_ptr<SortedDataInterface> btree( + SortedDataInterface::getInterface(index->headManager(), i->second->rs, index->ordering(), index->descriptor()->indexNamespace(), diff --git a/src/mongo/db/storage/heap1/record_store_heap.cpp b/src/mongo/db/storage/heap1/record_store_heap.cpp new file mode 100644 index 00000000000..e0578dc5c71 --- /dev/null +++ b/src/mongo/db/storage/heap1/record_store_heap.cpp @@ -0,0 +1,494 @@ +/** + * Copyright (C) 2014 MongoDB Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/heap1/record_store_heap.h" + +namespace mongo { + + // + // RecordStore + // + + HeapRecordStore::HeapRecordStore(const StringData& ns, + bool isCapped, + int64_t cappedMaxSize, + int64_t cappedMaxDocs, + CappedDocumentDeleteCallback* cappedDeleteCallback) + : RecordStore(ns), + _isCapped(isCapped), + _cappedMaxSize(cappedMaxSize), + _cappedMaxDocs(cappedMaxDocs), + _cappedDeleteCallback(cappedDeleteCallback), + _dataSize(0), + _nextId(1) { // DiskLoc(0,0) isn't valid for records. 
+ + if (_isCapped) { + invariant(_cappedMaxSize > 0); + invariant(_cappedMaxDocs == -1 || _cappedMaxDocs > 0); + } + else { + invariant(_cappedMaxSize == -1); + invariant(_cappedMaxDocs == -1); + } + } + + const char* HeapRecordStore::name() const { return "heap"; } + + RecordData HeapRecordStore::dataFor( const DiskLoc& loc ) const { + return recordFor(loc)->toRecordData(); + } + + HeapRecordStore::HeapRecord* HeapRecordStore::recordFor(const DiskLoc& loc) const { + Records::const_iterator it = _records.find(loc); + invariant(it != _records.end()); + return reinterpret_cast<HeapRecord*>(it->second.get()); + } + + void HeapRecordStore::deleteRecord(OperationContext* txn, const DiskLoc& loc) { + HeapRecord* rec = recordFor(loc); + _dataSize -= rec->netLength(); + invariant(_records.erase(loc) == 1); + } + + bool HeapRecordStore::cappedAndNeedDelete() const { + if (!_isCapped) + return false; + + if (_dataSize > _cappedMaxSize) + return true; + + if ((_cappedMaxDocs != -1) && (numRecords() > _cappedMaxDocs)) + return true; + + return false; + } + + void HeapRecordStore::cappedDeleteAsNeeded(OperationContext* txn) { + while (cappedAndNeedDelete()) { + invariant(!_records.empty()); + + DiskLoc oldest = _records.begin()->first; + + if (_cappedDeleteCallback) + uassertStatusOK(_cappedDeleteCallback->aboutToDeleteCapped(txn, oldest)); + + deleteRecord(txn, oldest); + } + } + + StatusWith<DiskLoc> HeapRecordStore::insertRecord(OperationContext* txn, + const char* data, + int len, + bool enforceQuota) { + if (_isCapped && len > _cappedMaxSize) { + // We use dataSize for capped rollover and we don't want to delete everything if we know + // this won't fit. + return StatusWith<DiskLoc>(ErrorCodes::BadValue, + "object to insert exceeds cappedMaxSize"); + } + + // TODO padding? 
+ const int lengthWithHeaders = len + HeapRecord::HeaderSize; + boost::shared_array<char> buf(new char[lengthWithHeaders]); + HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get()); + rec->lengthWithHeaders() = lengthWithHeaders; + memcpy(rec->data(), data, len); + + const DiskLoc loc = allocateLoc(); + _records[loc] = buf; + _dataSize += len; + + cappedDeleteAsNeeded(txn); + + return StatusWith<DiskLoc>(loc); + } + + StatusWith<DiskLoc> HeapRecordStore::insertRecord(OperationContext* txn, + const DocWriter* doc, + bool enforceQuota) { + const int len = doc->documentSize(); + if (_isCapped && len > _cappedMaxSize) { + // We use dataSize for capped rollover and we don't want to delete everything if we know + // this won't fit. + return StatusWith<DiskLoc>(ErrorCodes::BadValue, + "object to insert exceeds cappedMaxSize"); + } + + // TODO padding? + const int lengthWithHeaders = len + HeapRecord::HeaderSize; + boost::shared_array<char> buf(new char[lengthWithHeaders]); + HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get()); + rec->lengthWithHeaders() = lengthWithHeaders; + doc->writeDocument(rec->data()); + + const DiskLoc loc = allocateLoc(); + _records[loc] = buf; + _dataSize += len; + + cappedDeleteAsNeeded(txn); + + return StatusWith<DiskLoc>(loc); + } + + StatusWith<DiskLoc> HeapRecordStore::updateRecord(OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ) { + HeapRecord* oldRecord = recordFor( oldLocation ); + int oldLen = oldRecord->netLength(); + + // If the length of the new data is <= the length of the old data then just + // memcopy into the old space + if ( len <= oldLen) { + memcpy(oldRecord->data(), data, len); + _dataSize += len - oldLen; + return StatusWith<DiskLoc>(oldLocation); + } + + if ( _isCapped ) { + return StatusWith<DiskLoc>( ErrorCodes::InternalError, + "failing update: objects in a capped ns cannot grow", + 10003 ); + } + + // If the length of the 
new data exceeds the size of the old Record, we need to allocate + // a new Record, and delete the old one + + const int lengthWithHeaders = len + HeapRecord::HeaderSize; + boost::shared_array<char> buf(new char[lengthWithHeaders]); + HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get()); + rec->lengthWithHeaders() = lengthWithHeaders; + memcpy(rec->data(), data, len); + + _records[oldLocation] = buf; + _dataSize += len - oldLen; + + cappedDeleteAsNeeded(txn); + + return StatusWith<DiskLoc>(oldLocation); + } + + Status HeapRecordStore::updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ) { + HeapRecord* rec = recordFor( loc ); + char* root = rec->data(); + + // All updates were in place. Apply them via durability and writing pointer. + mutablebson::DamageVector::const_iterator where = damages.begin(); + const mutablebson::DamageVector::const_iterator end = damages.end(); + for( ; where != end; ++where ) { + const char* sourcePtr = damangeSource + where->sourceOffset; + char* targetPtr = root + where->targetOffset; + std::memcpy(targetPtr, sourcePtr, where->size); + } + + return Status::OK(); + } + + RecordIterator* HeapRecordStore::getIterator(OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const { + if (tailable) + invariant(_isCapped && dir == CollectionScanParams::FORWARD); + + if (dir == CollectionScanParams::FORWARD) { + return new HeapRecordIterator(txn, _records, *this, start, tailable); + } + else { + return new HeapRecordIterator(txn, _records, *this, start); + } + } + + RecordIterator* HeapRecordStore::getIteratorForRepair(OperationContext* txn) const { + // TODO maybe make different from HeapRecordIterator + return new HeapRecordIterator(txn, _records, *this); + } + + std::vector<RecordIterator*> HeapRecordStore::getManyIterators(OperationContext* txn) const { + std::vector<RecordIterator*> out; + // TODO 
maybe find a way to return multiple iterators. + out.push_back(new HeapRecordIterator(txn, _records, *this)); + return out; + } + + Status HeapRecordStore::truncate(OperationContext* txn) { + _records.clear(); + _dataSize = 0; + return Status::OK(); + } + + void HeapRecordStore::temp_cappedTruncateAfter(OperationContext* txn, + DiskLoc end, + bool inclusive) { + Records::iterator it = inclusive ? _records.lower_bound(end) + : _records.upper_bound(end); + while(it != _records.end()) { + _dataSize -= reinterpret_cast<HeapRecord*>(it->second.get())->netLength(); + _records.erase(it++); + } + } + + bool HeapRecordStore::compactSupported() const { + return false; + } + Status HeapRecordStore::compact(OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats) { + // TODO might be possible to do something here + invariant(!"compact not yet implemented"); + } + + Status HeapRecordStore::validate(OperationContext* txn, + bool full, + bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, + BSONObjBuilder* output) const { + results->valid = true; + if (scanData && full) { + for (Records::const_iterator it = _records.begin(); it != _records.end(); ++it) { + HeapRecord* rec = reinterpret_cast<HeapRecord*>(it->second.get()); + size_t dataSize; + const Status status = adaptor->validate(rec->toRecordData(), &dataSize); + if (!status.isOK()) { + results->valid = false; + results->errors.push_back("invalid object detected (see logs)"); + log() << "Invalid object detected in " << _ns << ": " << status.reason(); + } + } + } + + output->appendNumber( "nrecords", _records.size() ); + + return Status::OK(); + + } + + void HeapRecordStore::appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const { + result->append( "note", "HeapRecordStore has no cusom stats yet" ); + } + + Status HeapRecordStore::touch(OperationContext* txn, BSONObjBuilder* output) const { + if (output) { + 
output->append("numRanges", 1); + output->append("millis", 0); + } + return Status::OK(); + } + + Status HeapRecordStore::setCustomOption( + OperationContext* txn, const BSONElement& option, BSONObjBuilder* info) { + invariant(!"setCustomOption not yet implemented"); + } + + void HeapRecordStore::increaseStorageSize(OperationContext* txn, int size, bool enforceQuota) { + // unclear what this would mean for this class. For now, just error if called. + invariant(!"increaseStorageSize not yet implemented"); + } + + int64_t HeapRecordStore::storageSize(OperationContext* txn, + BSONObjBuilder* extraInfo, + int infoLevel) const { + // Note: not making use of extraInfo or infoLevel since we don't have extents + const int64_t recordOverhead = numRecords() * HeapRecord::HeaderSize; + return _dataSize + recordOverhead; + } + + DiskLoc HeapRecordStore::allocateLoc() { + const int64_t id = _nextId++; + // This is a hack, but both the high and low order bits of DiskLoc offset must be 0, and the + // file must fit in 23 bits. This gives us a total of 30 + 23 == 53 bits. 
+ invariant(id < (1LL << 53)); + return DiskLoc(int(id >> 30), int((id << 1) & ~(1<<31))); + } + + // + // Forward Iterator + // + + HeapRecordIterator::HeapRecordIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start, + bool tailable) + : _txn(txn), + _tailable(tailable), + _lastLoc(minDiskLoc), + _killedByInvalidate(false), + _records(records), + _rs(rs) { + if (start.isNull()) { + _it = _records.begin(); + } + else { + _it = _records.find(start); + invariant(_it != _records.end()); + } + } + + bool HeapRecordIterator::isEOF() { + return _it == _records.end(); + } + + DiskLoc HeapRecordIterator::curr() { + if (isEOF()) + return DiskLoc(); + return _it->first; + } + + DiskLoc HeapRecordIterator::getNext() { + if (isEOF()) { + if (!_tailable) + return DiskLoc(); + + if (_records.empty()) + return DiskLoc(); + + invariant(!_killedByInvalidate); + + // recover to last returned record + invariant(!_lastLoc.isNull()); + _it = _records.find(_lastLoc); + invariant(_it != _records.end()); + + if (++_it == _records.end()) + return DiskLoc(); + } + + const DiskLoc out = _it->first; + ++_it; + if (_tailable && _it == _records.end()) + _lastLoc = out; + return out; + } + + void HeapRecordIterator::invalidate(const DiskLoc& loc) { + if (_rs.isCapped()) { + // Capped iterators die on invalidation rather than advancing. 
+ if (isEOF()) { + if (_lastLoc == loc) { + _killedByInvalidate = true; + } + } + else if (_it->first == loc) { + _killedByInvalidate = true; + } + + return; + } + + if (_it != _records.end() && _it->first == loc) + ++_it; + } + + void HeapRecordIterator::prepareToYield() { + } + + bool HeapRecordIterator::recoverFromYield() { + return !_killedByInvalidate; + } + + RecordData HeapRecordIterator::dataFor(const DiskLoc& loc) const { + return _rs.dataFor(loc); + } + + // + // Reverse Iterator + // + + HeapRecordReverseIterator::HeapRecordReverseIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start) + : _txn(txn), + _killedByInvalidate(false), + _records(records), + _rs(rs) { + if (start.isNull()) { + _it = _records.rbegin(); + } + else { + _it = HeapRecordStore::Records::const_reverse_iterator(_records.find(start)); + invariant(_it != _records.rend()); + } + } + + bool HeapRecordReverseIterator::isEOF() { + return _it == _records.rend(); + } + + DiskLoc HeapRecordReverseIterator::curr() { + if (isEOF()) + return DiskLoc(); + return _it->first; + } + + DiskLoc HeapRecordReverseIterator::getNext() { + if (isEOF()) + return DiskLoc(); + + const DiskLoc out = _it->first; + ++_it; + return out; + } + + void HeapRecordReverseIterator::invalidate(const DiskLoc& loc) { + if (isEOF()) + return; + + if (_it->first == loc) { + if (_rs.isCapped()) { + // Capped iterators die on invalidation rather than advancing. 
+ _killedByInvalidate = true; + return; + } + ++_it; + } + } + + void HeapRecordReverseIterator::prepareToYield() { + } + + bool HeapRecordReverseIterator::recoverFromYield() { + return !_killedByInvalidate; + } + + RecordData HeapRecordReverseIterator::dataFor(const DiskLoc& loc) const { + return _rs.dataFor(loc); + } + +} // namespace mongo diff --git a/src/mongo/db/storage/heap1/record_store_heap.h b/src/mongo/db/storage/heap1/record_store_heap.h new file mode 100644 index 00000000000..f4810b04972 --- /dev/null +++ b/src/mongo/db/storage/heap1/record_store_heap.h @@ -0,0 +1,241 @@ +// record_store_heap.h + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. 
If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include <boost/shared_array.hpp> +#include <map> + +#include "mongo/db/storage/capped_callback.h" +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class HeapRecordIterator; + + /** + * A RecordStore that stores all data on the heap. + * + * @param cappedMaxSize - required if isCapped. limit uses dataSize() in this impl. + */ + class HeapRecordStore : public RecordStore { + public: + explicit HeapRecordStore(const StringData& ns, + bool isCapped = false, + int64_t cappedMaxSize = -1, + int64_t cappedMaxDocs = -1, + CappedDocumentDeleteCallback* cappedDeleteCallback = NULL); + + virtual const char* name() const; + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + virtual void deleteRecord( OperationContext* txn, const DiskLoc& dl ); + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ); + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ); + + virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ); + + virtual Status updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ); + + virtual RecordIterator* getIterator( OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const; + + virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const; + + virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const; + + virtual Status truncate( OperationContext* txn ); + + virtual void temp_cappedTruncateAfter( OperationContext* txn, DiskLoc end, bool inclusive ); + + virtual bool 
compactSupported() const; + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ); + + virtual Status validate( OperationContext* txn, + bool full, + bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const; + + virtual void appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const; + + virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const; + + virtual Status setCustomOption( OperationContext* txn, + const BSONElement& option, + BSONObjBuilder* info = NULL ); + + virtual void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota ); + + virtual int64_t storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo = NULL, + int infoLevel = 0) const; + + virtual long long dataSize() const { return _dataSize; } + + virtual long long numRecords() const { return _records.size(); } + + protected: + class HeapRecord { + public: + enum HeaderSizeValue { HeaderSize = 16 }; + + int lengthWithHeaders() const { return _lengthWithHeaders; } + int& lengthWithHeaders() { return _lengthWithHeaders; } + + const char* data() const { return _data; } + char* data() { return _data; } + + int netLength() const { return _lengthWithHeaders - HeaderSize; } + + RecordData toRecordData() const { return RecordData(_data, netLength()); } + + private: + int _lengthWithHeaders; + char _data[4]; + }; + + virtual HeapRecord* recordFor( const DiskLoc& loc ) const; + + public: + // + // Not in RecordStore interface + // + + typedef std::map<DiskLoc, boost::shared_array<char> > Records; + + bool isCapped() const { return _isCapped; } + void setCappedDeleteCallback(CappedDocumentDeleteCallback* cb) { _cappedDeleteCallback = cb; } + bool cappedMaxDocs() const { invariant(_isCapped); return _cappedMaxDocs; } + bool cappedMaxSize() const { invariant(_isCapped); return _cappedMaxSize; } + + private: + 
DiskLoc allocateLoc(); + bool cappedAndNeedDelete() const; + void cappedDeleteAsNeeded(OperationContext* txn); + + // TODO figure out a proper solution to metadata + const bool _isCapped; + const int64_t _cappedMaxSize; + const int64_t _cappedMaxDocs; + CappedDocumentDeleteCallback* _cappedDeleteCallback; + int64_t _dataSize; + + Records _records; + int64_t _nextId; + }; + + class HeapRecordIterator : public RecordIterator { + public: + HeapRecordIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start = DiskLoc(), + bool tailable = false); + + virtual bool isEOF(); + + virtual DiskLoc curr(); + + virtual DiskLoc getNext(); + + virtual void invalidate(const DiskLoc& dl); + + virtual void prepareToYield(); + + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + OperationContext* _txn; // not owned + HeapRecordStore::Records::const_iterator _it; + bool _tailable; + DiskLoc _lastLoc; // only for restarting tailable + bool _killedByInvalidate; + + const HeapRecordStore::Records& _records; + const HeapRecordStore& _rs; + }; + + class HeapRecordReverseIterator : public RecordIterator { + public: + HeapRecordReverseIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start = DiskLoc()); + + virtual bool isEOF(); + + virtual DiskLoc curr(); + + virtual DiskLoc getNext(); + + virtual void invalidate(const DiskLoc& dl); + + virtual void prepareToYield(); + + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + OperationContext* _txn; // not owned + HeapRecordStore::Records::const_reverse_iterator _it; + bool _killedByInvalidate; + + const HeapRecordStore::Records& _records; + const HeapRecordStore& _rs; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/SConscript b/src/mongo/db/storage/mmap_v1/SConscript index 
5f7ac5eabd2..11b6b06b3e7 100644 --- a/src/mongo/db/storage/mmap_v1/SConscript +++ b/src/mongo/db/storage/mmap_v1/SConscript @@ -1,6 +1,34 @@ Import("env") env.Library( + target = 'storage_mmapv1', + source = [ "catalog/index_details.cpp", + "catalog/namespace.cpp", + "catalog/namespace_details.cpp", + "catalog/namespace_details_collection_entry.cpp", + "catalog/namespace_details_rsv1_metadata.cpp", + "catalog/namespace_index.cpp", + "data_file.cpp", + "durable_mapped_file.cpp", + "dur.cpp", + "durop.cpp", + "dur_writetodatafiles.cpp", + "dur_preplogbuffer.cpp", + "dur_commitjob.cpp", + "dur_recover.cpp", + "dur_journal.cpp", + "dur_recovery_unit.cpp", + "mmap_v1_database_catalog_entry.cpp", + "mmap_v1_engine.cpp", + "mmap_v1_extent_manager.cpp", + "repair_database.cpp", + ], + LIBDEPS = [ + 'record_store_v1', + 'btree'] + ) + +env.Library( target= 'extent', source= [ 'extent.cpp', @@ -11,3 +39,94 @@ env.Library( '$BUILD_DIR/mongo/foundation', ] ) + +env.Library( + target= 'record_store_v1', + source= [ + 'record_store_v1_base.cpp', + 'record_store_v1_capped.cpp', + 'record_store_v1_capped_iterator.cpp', + 'record_store_v1_repair_iterator.cpp', + 'record_store_v1_simple.cpp', + 'record_store_v1_simple_iterator.cpp', + ], + LIBDEPS= [ + 'extent', + '$BUILD_DIR/mongo/mongocommon', # for ProgressMeter + '$BUILD_DIR/mongo/db/commands/server_status_core', + ] + ) + +env.Library( + target='record_store_v1_test_help', + source=['record_store_v1_test_help.cpp', + ], + LIBDEPS=[ + 'record_store_v1' + ] + ) + +env.CppUnitTest(target = 'namespace_test', + source = ['catalog/namespace_test.cpp'], + LIBDEPS = ['$BUILD_DIR/mongo/foundation']) + +env.CppUnitTest( + target='record_store_v1_simple_test', + source=['record_store_v1_simple_test.cpp', + ], + LIBDEPS=[ + 'record_store_v1_test_help' + ] + ) + +env.CppUnitTest( + target='record_store_v1_capped_test', + source=['record_store_v1_capped_test.cpp', + ], + LIBDEPS=[ + 'record_store_v1_test_help' + ] + ) + +env.Library( + 
target= 'btree', + source= [ + 'btree/btree_logic.cpp', + 'btree/btree_interface.cpp', + 'btree/key.cpp' + ], + LIBDEPS= [ + '$BUILD_DIR/mongo/bson' + ] + ) + +env.Library( + target= 'btree_test_help', + source= [ + 'btree/btree_test_help.cpp' + ], + LIBDEPS= [ + 'btree', + '$BUILD_DIR/mongo/mongocommon', # for ProgressMeter + '$BUILD_DIR/mongo/db/storage/mmap_v1/record_store_v1_test_help', + '$BUILD_DIR/mongo/db/storage/heap1/heap_record_store' # XXX? + ] + ) + +env.CppUnitTest( + target='btree_logic_test', + source=['btree/btree_logic_test.cpp' + ], + LIBDEPS=[ + 'btree_test_help' + ] + ) + +env.CppUnitTest( + target='btree_builder_test', + source=['btree/btree_builder_test.cpp' + ], + LIBDEPS=[ + 'btree_test_help' + ] + ) diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp new file mode 100644 index 00000000000..89d2ffc4d98 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp @@ -0,0 +1,133 @@ +// btree_builder_test.cpp : Btree builder unit test + +/** + * Copyright (C) 2014 MongoDB + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +// This file contains simple tests to check the Btree builder logic, +// including handling of interruptions. + +#include "mongo/db/instance.h" +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + class MockOperationContextKillable : public OperationContextNoop { + public: + MockOperationContextKillable() + : _killPending(false) { + } + + virtual void checkForInterrupt(bool heedMutex = true) const { + if (_killPending) { + throw UserException(ErrorCodes::Interrupted, "interrupted"); + } + } + + virtual void kill() { + _killPending = true; + } + + private: + bool _killPending; + }; + + /** + * Builder::commit() is interrupted if there is a request to kill the current operation. + */ + template<class OnDiskFormat> + class InterruptCommit { + public: + typedef typename BtreeLogic<OnDiskFormat>::Builder Builder; + + InterruptCommit( bool mayInterrupt ) : + _mayInterrupt( mayInterrupt ), + _helper(BSON( "a" << 1 )) { + } + + void run() { + // Create a btree builder. 
+ MockOperationContextKillable txn; + Builder* builder = _helper.btree.newBuilder(&txn, false); + + // Add some keys to the builder, in order. We need enough keys to build an internal + // node in order to check for an interrupt. + int32_t nKeys = 1000; + for( int32_t i = 0; i < nKeys; ++i ) { + BSONObj key = BSON( "a" << i ); + builder->addKey( key, /* dummy location */ DiskLoc() ); + } + + // The root of the index has not yet been set. + ASSERT( _helper.headManager.getHead().isNull() ); + // Register a request to kill the current operation. + txn.kill(); + if ( _mayInterrupt ) { + // Call commit on the builder, which will be aborted due to the kill request. + ASSERT_THROWS( builder->commit( _mayInterrupt ), UserException ); + // The root of the index is not set because commit() did not complete. + ASSERT( _helper.headManager.getHead().isNull() ); + } + else { + // Call commit on the builder, which will not be aborted because mayInterrupt is + // false. + builder->commit( _mayInterrupt ); + // The root of the index is set because commit() completed. 
+ ASSERT( !_helper.headManager.getHead().isNull() ); + } + } + + private: + bool _mayInterrupt; + BtreeLogicTestHelper<OnDiskFormat> _helper; + }; + + + // + // TEST SUITE DEFINITION + // + + template<class OnDiskFormat> + class BtreeBuilderTestSuite : public unittest::Suite { + public: + BtreeBuilderTestSuite(const std::string& name) : Suite(name) { + + } + + void setupTests() { + + add< InterruptCommit<OnDiskFormat> >( false ); + add< InterruptCommit<OnDiskFormat> >( true ); + } + }; + + // Test suite for both V0 and V1 + static BtreeBuilderTestSuite<BtreeLayoutV0> SUITE_V0("BtreeBuilderTests V0"); + static BtreeBuilderTestSuite<BtreeLayoutV1> SUITE_V1("BtreeBuilderTests V1"); +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp new file mode 100644 index 00000000000..6d2fae7bffa --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp @@ -0,0 +1,266 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/sorted_data_interface.h" + +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" + + +namespace mongo { + + template <class OnDiskFormat> + class BtreeBuilderInterfaceImpl : public SortedDataBuilderInterface { + public: + BtreeBuilderInterfaceImpl(OperationContext* trans, + typename BtreeLogic<OnDiskFormat>::Builder* builder) + : _builder(builder), _trans(trans) { } + + virtual ~BtreeBuilderInterfaceImpl() { } + + Status addKey(const BSONObj& key, const DiskLoc& loc) { + return _builder->addKey(key, loc); + } + + unsigned long long commit(bool mayInterrupt) { + return _builder->commit(mayInterrupt); + } + + private: + typename BtreeLogic<OnDiskFormat>::Builder* _builder; + + // Not owned here. 
+ OperationContext* _trans; + }; + + template <class OnDiskFormat> + class BtreeInterfaceImpl : public SortedDataInterface { + public: + BtreeInterfaceImpl(HeadManager* headManager, + RecordStore* recordStore, + const Ordering& ordering, + const string& indexName, + BucketDeletionNotification* bucketDeletionNotification) { + + _btree.reset(new BtreeLogic<OnDiskFormat>(headManager, + recordStore, + ordering, + indexName, + bucketDeletionNotification)); + } + + virtual ~BtreeInterfaceImpl() { } + + virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, + bool dupsAllowed) { + + return new BtreeBuilderInterfaceImpl<OnDiskFormat>( + txn, _btree->newBuilder(txn, dupsAllowed)); + } + + virtual Status insert(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc, + bool dupsAllowed) { + + return _btree->insert(txn, key, loc, dupsAllowed); + } + + virtual bool unindex(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) { + + return _btree->unindex(txn, key, loc); + } + + virtual void fullValidate(OperationContext* txn, long long *numKeysOut) { + *numKeysOut = _btree->fullValidate(txn, NULL, false, false, 0); + } + + virtual Status dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) { + return _btree->dupKeyCheck(txn, key, loc); + } + + virtual bool isEmpty() { + return _btree->isEmpty(); + } + + virtual Status touch(OperationContext* txn) const{ + return _btree->touch(txn); + } + + class Cursor : public SortedDataInterface::Cursor { + public: + Cursor(OperationContext* txn, + const BtreeLogic<OnDiskFormat>* btree, + int direction) + : _txn(txn), + _btree(btree), + _direction(direction), + _bucket(btree->getHead()), // XXX this shouldn't be nessisary, but is. 
+ _ofs(0) { + } + + virtual int getDirection() const { return _direction; } + + virtual bool isEOF() const { return _bucket.isNull(); } + + virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const { + const Cursor& other = static_cast<const Cursor&>(otherBase); + if (isEOF()) + return other.isEOF(); + + return _bucket == other._bucket && _ofs == other._ofs; + + } + + virtual void aboutToDeleteBucket(const DiskLoc& bucket) { + if (_bucket == bucket) + _ofs = -1; + } + + virtual bool locate(const BSONObj& key, const DiskLoc& loc) { + return _btree->locate(_txn, key, loc, _direction, &_ofs, &_bucket); + } + + virtual void customLocate(const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) { + + _btree->customLocate(_txn, + &_bucket, + &_ofs, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _direction); + } + + void advanceTo(const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) { + + _btree->advanceTo(_txn, + &_bucket, + &_ofs, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _direction); + } + + virtual BSONObj getKey() const { + return _btree->getKey(_bucket, _ofs); + } + + virtual DiskLoc getDiskLoc() const { + return _btree->getDiskLoc(_bucket, _ofs); + } + + virtual void advance() { + _btree->advance(_txn, &_bucket, &_ofs, _direction); + } + + virtual void savePosition() { + if (!_bucket.isNull()) { + _savedKey = getKey().getOwned(); + _savedLoc = getDiskLoc(); + } + } + + virtual void restorePosition() { + if (!_bucket.isNull()) { + _btree->restorePosition(_txn, + _savedKey, + _savedLoc, + _direction, + &_bucket, + &_ofs); + } + } + + private: + OperationContext* _txn; // not owned + const BtreeLogic<OnDiskFormat>* const _btree; + const int _direction; + + DiskLoc _bucket; + int _ofs; + + // Only used by 
save/restorePosition() if _bucket is non-Null. + BSONObj _savedKey; + DiskLoc _savedLoc; + }; + + virtual Cursor* newCursor(OperationContext* txn, int direction) const { + return new Cursor(txn, _btree.get(), direction); + } + + virtual Status initAsEmpty(OperationContext* txn) { + return _btree->initAsEmpty(txn); + } + + private: + scoped_ptr<BtreeLogic<OnDiskFormat> > _btree; + }; + + SortedDataInterface* getMMAPV1Interface(HeadManager* headManager, + RecordStore* recordStore, + const Ordering& ordering, + const string& indexName, + int version, + BucketDeletionNotification* bucketDeletion) { + + if (0 == version) { + return new BtreeInterfaceImpl<BtreeLayoutV0>(headManager, + recordStore, + ordering, + indexName, + bucketDeletion); + } + else { + invariant(1 == version); + return new BtreeInterfaceImpl<BtreeLayoutV1>(headManager, + recordStore, + ordering, + indexName, + bucketDeletion); + } + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h new file mode 100644 index 00000000000..ad0d07b7ece --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h @@ -0,0 +1,50 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/bson/ordering.h" +#include "mongo/db/catalog/head_manager.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/record_store.h" +#include "mongo/db/storage/sorted_data_interface.h" + +#pragma once + +namespace mongo { + + class BucketDeletionNotification; + + SortedDataInterface* getMMAPV1Interface(HeadManager* headManager, + RecordStore* recordStore, + const Ordering& ordering, + const string& indexName, + int version, + BucketDeletionNotification* bucketDeletion); + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp new file mode 100644 index 00000000000..93f802bc4a5 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp @@ -0,0 +1,2519 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" +#include "mongo/db/storage/mmap_v1/btree/key.h" +#include "mongo/db/storage/record_store.h" +#include "mongo/util/log.h" +#include "mongo/util/mongoutils/str.h" + +namespace mongo { + + MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kIndexing); + + // + // Public Builder logic + // + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::Builder* + BtreeLogic<BtreeLayout>::newBuilder(OperationContext* txn, bool dupsAllowed) { + return new Builder(this, txn, dupsAllowed); + } + + template <class BtreeLayout> + BtreeLogic<BtreeLayout>::Builder::Builder(BtreeLogic* logic, + OperationContext* txn, + bool dupsAllowed) + : _logic(logic), + _dupsAllowed(dupsAllowed), + _numAdded(0), + _txn(txn) { + + // XXX: Due to the way bulk building works, we may already have an empty root bucket that we + // must now dispose of. This isn't the case in some unit tests that use the Builder directly + // rather than going through an IndexAccessMethod. + DiskLoc oldHead = _logic->_headManager->getHead(); + if (!oldHead.isNull()) { + _logic->_headManager->setHead(_txn, DiskLoc()); + _logic->_recordStore->deleteRecord(_txn, oldHead); + } + + _first = _cur = _logic->_addBucket(txn); + _b = _getModifiableBucket(_cur); + _committed = false; + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::Builder::addKey(const BSONObj& keyObj, const DiskLoc& loc) { + auto_ptr<KeyDataOwnedType> key(new KeyDataOwnedType(keyObj)); + + if (key->dataSize() > BtreeLayout::KeyMax) { + string msg = str::stream() << "Btree::insert: key too large to index, failing " + << _logic->_indexName + << ' ' << key->dataSize() << ' ' << key->toString(); + log() << msg << endl; + return Status(ErrorCodes::KeyTooLong, msg); + } + + // If we have a previous key to compare to... 
+ if (_numAdded > 0) { + int cmp = _keyLast->woCompare(*key, _logic->_ordering); + + // This shouldn't happen ever. We expect keys in sorted order. + if (cmp > 0) { + return Status(ErrorCodes::InternalError, "Bad key order in btree builder"); + } + + // This could easily happen.. + if (!_dupsAllowed && (cmp == 0)) { + return Status(ErrorCodes::DuplicateKey, _logic->dupKeyError(*_keyLast)); + } + } + + if (!_logic->_pushBack(_b, loc, *key, DiskLoc())) { + // bucket was full + newBucket(); + _logic->pushBack(_b, loc, *key, DiskLoc()); + } + + _keyLast = key; + _numAdded++; + mayCommitProgressDurably(); + return Status::OK(); + } + + template <class BtreeLayout> + unsigned long long BtreeLogic<BtreeLayout>::Builder::commit(bool mayInterrupt) { + buildNextLevel(_first, mayInterrupt); + _committed = true; + return _numAdded; + } + + // + // Private Builder logic + // + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::Builder::newBucket() { + DiskLoc newBucketLoc = _logic->_addBucket(_txn); + _b->parent = newBucketLoc; + _cur = newBucketLoc; + _b = _getModifiableBucket(_cur); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::Builder::buildNextLevel(DiskLoc loc, bool mayInterrupt) { + for (;;) { + if (_getBucket(loc)->parent.isNull()) { + // only 1 bucket at this level. we are done. + _logic->_headManager->setHead(_txn, loc); + break; + } + + DiskLoc upLoc = _logic->_addBucket(_txn); + DiskLoc upStart = upLoc; + BucketType* up = _getModifiableBucket(upLoc); + + DiskLoc xloc = loc; + while (!xloc.isNull()) { + if (_txn->recoveryUnit()->commitIfNeeded()) { + _b = _getModifiableBucket(_cur); + up = _getModifiableBucket(upLoc); + } + + if (mayInterrupt) { + _txn->checkForInterrupt(); + } + + BucketType* x = _getModifiableBucket(xloc); + KeyDataType k; + DiskLoc r; + _logic->popBack(x, &r, &k); + bool keepX = (x->n != 0); + DiskLoc keepLoc = keepX ? 
xloc : x->nextChild; + + if (!_logic->_pushBack(up, r, k, keepLoc)) { + // current bucket full + DiskLoc n = _logic->_addBucket(_txn); + up->parent = n; + upLoc = n; + up = _getModifiableBucket(upLoc); + _logic->pushBack(up, r, k, keepLoc); + } + + DiskLoc nextLoc = x->parent; + if (keepX) { + x->parent = upLoc; + } + else { + if (!x->nextChild.isNull()) { + DiskLoc ll = x->nextChild; + _getModifiableBucket(ll)->parent = upLoc; + } + _logic->deallocBucket(_txn, x, xloc); + } + xloc = nextLoc; + } + + loc = upStart; + mayCommitProgressDurably(); + } + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::Builder::mayCommitProgressDurably() { + if (_txn->recoveryUnit()->commitIfNeeded()) { + _b = _getModifiableBucket(_cur); + } + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::Builder::_getModifiableBucket(DiskLoc loc) { + return _logic->btreemod(_txn, _logic->getBucket(loc)); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::Builder::_getBucket(DiskLoc loc) { + return _logic->getBucket(loc); + } + + // + // BtreeLogic logic + // + + // static + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::FullKey + BtreeLogic<BtreeLayout>::getFullKey(const BucketType* bucket, int i) { + if (i >= bucket->n) { + int code = 13000; + massert(code, + (string)"invalid keyNode: " + BSON( "i" << i << "n" << bucket->n ).jsonString(), + i < bucket->n ); + } + return FullKey(bucket, i); + } + + // static + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::KeyHeaderType& + BtreeLogic<BtreeLayout>::getKeyHeader(BucketType* bucket, int i) { + return ((KeyHeaderType*)bucket->data)[i]; + } + + // static + template <class BtreeLayout> + const typename BtreeLogic<BtreeLayout>::KeyHeaderType& + BtreeLogic<BtreeLayout>::getKeyHeader(const BucketType* bucket, int i) { + return ((const KeyHeaderType*)bucket->data)[i]; + } + + template 
<class BtreeLayout> + void BtreeLogic<BtreeLayout>::markUnused(BucketType* bucket, int keyPos) { + invariant(keyPos >= 0 && keyPos < bucket->n); + getKeyHeader(bucket, keyPos).setUnused(); + } + + template <class BtreeLayout> + char* BtreeLogic<BtreeLayout>::dataAt(BucketType* bucket, short ofs) { + return bucket->data + ofs; + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::btreemod(OperationContext* txn, BucketType* bucket) { + txn->recoveryUnit()->writingPtr(bucket, BtreeLayout::BucketSize); + return bucket; + } + + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::totalDataSize(BucketType* bucket) { + return (int) (BtreeLayout::BucketSize - (bucket->data - (char*)bucket)); + } + + // We define this value as the maximum number of bytes such that, if we have + // fewer than this many bytes, we must be able to either merge with or receive + // keys from any neighboring node. If our utilization goes below this value we + // know we can bring up the utilization with a simple operation. Ignoring the + // 90/10 split policy which is sometimes employed and our 'unused' nodes, this + // is a lower bound on bucket utilization for non root buckets. + // + // Note that the exact value here depends on the implementation of + // _rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as + // follows: We know we cannot merge with the neighbor, so the total data size + // for us, the neighbor, and the separator must be at least + // BucketType::bodySize() + 1. We must be able to accept one key of any + // allowed size, so our size plus storage for that additional key must be + // <= BucketType::bodySize() / 2. This way, with the extra key we'll have a + // new bucket data size < half the total data size and by the implementation + // of _rebalancedSeparatorPos() the key must be added. 
+ template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::lowWaterMark() { + return BtreeLayout::BucketBodySize / 2 - BtreeLayout::KeyMax - sizeof(KeyHeaderType) + 1; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::init(BucketType* bucket) { + BtreeLayout::initBucket(bucket); + bucket->parent.Null(); + bucket->nextChild.Null(); + bucket->flags = Packed; + bucket->n = 0; + bucket->emptySize = totalDataSize(bucket); + bucket->topSize = 0; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::_unalloc(BucketType* bucket, int bytes) { + bucket->topSize -= bytes; + bucket->emptySize += bytes; + } + + /** + * We allocate space from the end of the buffer for data. The keynodes grow from the front. + */ + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::_alloc(BucketType* bucket, int bytes) { + invariant(bucket->emptySize >= bytes); + bucket->topSize += bytes; + bucket->emptySize -= bytes; + int ofs = totalDataSize(bucket) - bucket->topSize; + invariant(ofs > 0); + return ofs; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::setNotPacked(BucketType* bucket) { + bucket->flags &= ~Packed; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::setPacked(BucketType* bucket) { + bucket->flags |= Packed; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::_delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty) { + invariant(keypos >= 0 && keypos <= bucket->n); + invariant(childLocForPos(bucket, keypos).isNull()); + invariant((mayEmpty && bucket->n > 0) || bucket->n > 1 || bucket->nextChild.isNull()); + + bucket->emptySize += sizeof(KeyHeaderType); + bucket->n--; + + for (int j = keypos; j < bucket->n; j++) { + getKeyHeader(bucket, j) = getKeyHeader(bucket, j + 1); + } + + setNotPacked(bucket); + } + + /** + * Pull rightmost key from the bucket. This version requires its right child to be null so it + * does not bother returning that value. 
+ */ + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::popBack(BucketType* bucket, + DiskLoc* recordLocOut, + KeyDataType *keyDataOut) { + + massert(17435, "n==0 in btree popBack()", bucket->n > 0 ); + + invariant(getKeyHeader(bucket, bucket->n - 1).isUsed()); + + FullKey kn = getFullKey(bucket, bucket->n - 1); + *recordLocOut = kn.recordLoc; + keyDataOut->assign(kn.data); + int keysize = kn.data.dataSize(); + + massert(17436, "rchild not null in btree popBack()", bucket->nextChild.isNull()); + + // Weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't + // full. + bucket->nextChild = kn.prevChildBucket; + bucket->n--; + // This is risky because the key we are returning points to this unalloc'ed memory, + // and we are assuming that the last key points to the last allocated + // bson region. + bucket->emptySize += sizeof(KeyHeaderType); + _unalloc(bucket, keysize); + } + + /** + * Add a key. Must be > all existing. Be careful to set next ptr right. + */ + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::_pushBack(BucketType* bucket, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc prevChild) { + + int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType); + if (bytesNeeded > bucket->emptySize) { + return false; + } + invariant(bytesNeeded <= bucket->emptySize); + + if (bucket->n) { + const FullKey klast = getFullKey(bucket, bucket->n - 1); + if (klast.data.woCompare(key, _ordering) > 0) { + log() << "btree bucket corrupt? 
" + "consider reindexing or running validate command" << endl; + log() << " klast: " << klast.data.toString() << endl; + log() << " key: " << key.toString() << endl; + invariant(false); + } + } + + bucket->emptySize -= sizeof(KeyHeaderType); + KeyHeaderType& kn = getKeyHeader(bucket, bucket->n++); + kn.prevChildBucket = prevChild; + kn.recordLoc = recordLoc; + kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize())); + short ofs = kn.keyDataOfs(); + char *p = dataAt(bucket, ofs); + memcpy(p, key.data(), key.dataSize()); + return true; + } + + /** + * Durability note: + * + * We do separate intent declarations herein. Arguably one could just declare the whole bucket + * given we do group commits. This is something we could investigate later as to what is + * faster. + **/ + + /** + * Insert a key in a bucket with no complexity -- no splits required + * Returns false if a split is required. + */ + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::basicInsert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int& keypos, + const KeyDataType& key, + const DiskLoc recordLoc) { + invariant(bucket->n < 1024); + invariant(keypos >= 0 && keypos <= bucket->n); + + int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType); + if (bytesNeeded > bucket->emptySize) { + _pack(txn, bucket, bucketLoc, keypos); + if (bytesNeeded > bucket->emptySize) { + return false; + } + } + + invariant(getBucket(bucketLoc) == bucket); + + { + // declare that we will write to [k(keypos),k(n)] + char* start = reinterpret_cast<char*>(&getKeyHeader(bucket, keypos)); + char* end = reinterpret_cast<char*>(&getKeyHeader(bucket, bucket->n + 1)); + + // Declare that we will write to [k(keypos),k(n)] + txn->recoveryUnit()->writingPtr(start, end - start); + } + + // e.g. 
for n==3, keypos==2 + // 1 4 9 -> 1 4 _ 9 + for (int j = bucket->n; j > keypos; j--) { + getKeyHeader(bucket, j) = getKeyHeader(bucket, j - 1); + } + + size_t writeLen = sizeof(bucket->emptySize) + sizeof(bucket->topSize) + sizeof(bucket->n); + txn->recoveryUnit()->writingPtr(&bucket->emptySize, writeLen); + bucket->emptySize -= sizeof(KeyHeaderType); + bucket->n++; + + // This _KeyNode was marked for writing above. + KeyHeaderType& kn = getKeyHeader(bucket, keypos); + kn.prevChildBucket.Null(); + kn.recordLoc = recordLoc; + kn.setKeyDataOfs((short) _alloc(bucket, key.dataSize())); + char *p = dataAt(bucket, kn.keyDataOfs()); + txn->recoveryUnit()->writingPtr(p, key.dataSize()); + memcpy(p, key.data(), key.dataSize()); + return true; + } + + /** + * With this implementation, refPos == 0 disregards effect of refPos. index > 0 prevents + * creation of an empty bucket. + */ + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::mayDropKey(BucketType* bucket, int index, int refPos) { + return index > 0 + && (index != refPos) + && getKeyHeader(bucket, index).isUnused() + && getKeyHeader(bucket, index).prevChildBucket.isNull(); + } + + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::_packedDataSize(BucketType* bucket, int refPos) { + if (bucket->flags & Packed) { + return BtreeLayout::BucketSize - bucket->emptySize - BucketType::HeaderSize; + } + + int size = 0; + for (int j = 0; j < bucket->n; ++j) { + if (mayDropKey(bucket, j, refPos)) { + continue; + } + size += getFullKey(bucket, j).data.dataSize() + sizeof(KeyHeaderType); + } + + return size; + } + + /** + * When we delete things, we just leave empty space until the node is full and then we repack + * it. 
     */
    // Packs 'bucket' if it is not already packed.  Declares write intent on the whole
    // bucket (via btreemod()) before mutating it; no-op when the Packed flag is set.
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::_pack(OperationContext* txn,
                                        BucketType* bucket,
                                        const DiskLoc thisLoc,
                                        int &refPos) {

        invariant(getBucket(thisLoc) == bucket);

        // Already compact; nothing to do.
        if (bucket->flags & Packed) {
            return;
        }

        // btreemod() declares write intent for the bucket before we shuffle its contents.
        _packReadyForMod(btreemod(txn, bucket), refPos);
    }

    /**
     * Version when write intent already declared.
     *
     * Compacts the bucket in place: live key headers are slid left over droppable entries
     * (unused keys with no left child, per mayDropKey()), and each surviving key's bytes
     * are re-laid-out contiguously toward offset totalDataSize() using the 'temp' staging
     * buffer.  'refPos' is remapped so the caller's index keeps tracking the same logical
     * position across the compaction.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::_packReadyForMod(BucketType* bucket, int &refPos) {
        if (bucket->flags & Packed) {
            return;
        }

        int tdz = totalDataSize(bucket);
        char temp[BtreeLayout::BucketSize];
        int ofs = tdz;
        bucket->topSize = 0;

        // 'i' is the write cursor (next packed slot), 'j' the read cursor over all slots.
        int i = 0;
        for (int j = 0; j < bucket->n; j++) {
            if (mayDropKey(bucket, j, refPos)) {
                // key is unused and has no children - drop it
                continue;
            }

            if (i != j) {
                if (refPos == j) {
                    // i < j so j will never be refPos again
                    refPos = i;
                }
                getKeyHeader(bucket, i) = getKeyHeader(bucket, j);
            }

            // Copy this key's data bytes into the staging buffer at its new (packed) offset.
            short ofsold = getKeyHeader(bucket, i).keyDataOfs();
            int sz = getFullKey(bucket, i).data.dataSize();
            ofs -= sz;
            bucket->topSize += sz;
            memcpy(temp + ofs, dataAt(bucket, ofsold), sz);
            getKeyHeader(bucket, i).setKeyDataOfsSavingUse(ofs);
            ++i;
        }

        // refPos pointing one-past-the-end stays one-past-the-(new)-end.
        if (refPos == bucket->n) {
            refPos = i;
        }

        bucket->n = i;
        int dataUsed = tdz - ofs;
        memcpy(bucket->data + ofs, temp + ofs, dataUsed);

        bucket->emptySize = tdz - dataUsed - bucket->n * sizeof(KeyHeaderType);
        // NOTE(review): emptySize is read into a signed temp before the check -- presumably
        // so the invariant catches wraparound if the field is unsigned; confirm field type.
        int foo = bucket->emptySize;
        invariant( foo >= 0 );
        setPacked(bucket);
        assertValid(_indexName, bucket, _ordering);
    }

    // Keeps only the first N keys of 'bucket'.  Marks the bucket unpacked and immediately
    // repacks so the truncated keys' data bytes are reclaimed; 'refPos' is remapped by the
    // repack.
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::truncateTo(BucketType* bucket,
                                             int N,
                                             int &refPos) {
        bucket->n = N;
        setNotPacked(bucket);
        _packReadyForMod(bucket, refPos);
    }

    /**
     * In the standard btree algorithm, we would split based on the
     * existing keys _and_ the new key.  But that's more work to
     * implement, so we split the existing keys and then add the new key.
     *
     * There are several published heuristic algorithms for doing splits, but basically what you
     * want are (1) even balancing between the two sides and (2) a small split key so the parent can
     * have a larger branching factor.
     *
     * We just have a simple algorithm right now: if a key includes the halfway point (or 10% way
     * point) in terms of bytes, split on that key; otherwise split on the key immediately to the
     * left of the halfway point (or 10% point).
     *
     * This function is expected to be called on a packed bucket.
     */
    template <class BtreeLayout>
    int BtreeLogic<BtreeLayout>::splitPos(BucketType* bucket, int keypos) {
        invariant(bucket->n > 2);
        int split = 0;
        int rightSize = 0;

        // When splitting a btree node, if the new key is greater than all the other keys, we should
        // not do an even split, but a 90/10 split.  see SERVER-983.  TODO I think we only want to
        // do the 90% split on the rhs node of the tree.
        int rightSizeLimit = (bucket->topSize + sizeof(KeyHeaderType) * bucket->n)
                           / (keypos == bucket->n ? 10 : 2);

        // Walk from the rightmost key, accumulating byte sizes, until the right-hand side
        // would exceed its budget; split at that key.
        for (int i = bucket->n - 1; i > -1; --i) {
            rightSize += getFullKey(bucket, i).data.dataSize() + sizeof(KeyHeaderType);
            if (rightSize > rightSizeLimit) {
                split = i;
                break;
            }
        }

        // safeguards - we must not create an empty bucket
        if (split < 1) {
            split = 1;
        }
        else if (split > bucket->n - 2) {
            split = bucket->n - 2;
        }

        return split;
    }

    /**
     * Opens 'nAdd' key-header slots at the front of the bucket by shifting every existing
     * header right.  The new slots are uninitialized; the caller must fill them (e.g. via
     * setKey()).  Requires enough emptySize for the new headers.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::reserveKeysFront(BucketType* bucket, int nAdd) {
        invariant(bucket->emptySize >= int(sizeof(KeyHeaderType) * nAdd));
        bucket->emptySize -= sizeof(KeyHeaderType) * nAdd;
        for (int i = bucket->n - 1; i > -1; --i) {
            getKeyHeader(bucket, i + nAdd) = getKeyHeader(bucket, i);
        }
        bucket->n += nAdd;
    }

    /**
     * Overwrites slot 'i' of 'bucket' with the given record/child locs and copies the key
     * bytes into space obtained from _alloc().  Does not adjust bucket->n.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::setKey(BucketType* bucket,
                                         int i,
                                         const DiskLoc recordLoc,
                                         const KeyDataType& key,
                                         const DiskLoc prevChildBucket) {
        KeyHeaderType &kn = getKeyHeader(bucket, i);
        kn.recordLoc = recordLoc;
        kn.prevChildBucket = prevChildBucket;
        short ofs = (short) _alloc(bucket, key.dataSize());
        kn.setKeyDataOfs(ofs);
        char *p = dataAt(bucket, ofs);
        memcpy(p, key.data(), key.dataSize());
    }

    /**
     * Removes the first 'nDrop' keys by shifting the remaining headers left, then repacks
     * so the dropped keys' data bytes are reclaimed.  'refpos' is remapped by the repack.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::dropFront(BucketType* bucket,
                                            int nDrop,
                                            int &refpos) {
        for (int i = nDrop; i < bucket->n; ++i) {
            getKeyHeader(bucket, i - nDrop) = getKeyHeader(bucket, i);
        }
        bucket->n -= nDrop;
        setNotPacked(bucket);
        _packReadyForMod(bucket, refpos);
    }

    // Convenience overload for callers that don't need the 'bestParent' out-parameter of
    // the full customLocate(); delegates with a throwaway pair.
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
                                               DiskLoc* locInOut,
                                               int* keyOfsInOut,
                                               const BSONObj& keyBegin,
                                               int keyBeginLen,
                                               bool afterKey,
                                               const vector<const BSONElement*>& keyEnd,
                                               const vector<bool>& keyEndInclusive,
                                               int direction) const {
        pair<DiskLoc, int> unused;

        customLocate(txn,
                     locInOut,
                     keyOfsInOut,
                     keyBegin,
                     keyBeginLen,
                     afterKey,
                     keyEnd,
                     keyEndInclusive,
direction, + unused); + + skipUnusedKeys(txn, locInOut, keyOfsInOut, direction); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::advance(OperationContext* txn, + DiskLoc* bucketLocInOut, + int* posInOut, + int direction) const { + + *bucketLocInOut = advance(txn, *bucketLocInOut, posInOut, direction); + skipUnusedKeys(txn, bucketLocInOut, posInOut, direction); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::skipUnusedKeys(OperationContext* txn, + DiskLoc* loc, + int* pos, + int direction) const { + while (!loc->isNull() && !keyIsUsed(*loc, *pos)) { + *loc = advance(txn, *loc, pos, direction); + } + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::advanceTo(OperationContext* txn, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const { + + advanceToImpl(txn, + thisLocInOut, + keyOfsInOut, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + direction); + + skipUnusedKeys(txn, thisLocInOut, keyOfsInOut, direction); + } + + /** + * find smallest/biggest value greater-equal/less-equal than specified + * + * starting thisLoc + keyOfs will be strictly less than/strictly greater than + * keyBegin/keyBeginLen/keyEnd + * + * All the direction checks below allowed me to refactor the code, but possibly separate forward + * and reverse implementations would be more efficient + */ + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::advanceToImpl(OperationContext* txn, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const { + + BucketType* bucket = getBucket(*thisLocInOut); + + int l, h; + bool dontGoUp; + + if (direction > 0) { + l = *keyOfsInOut; + h = bucket->n - 1; + int 
cmpResult = customBSONCmp(getFullKey(bucket, h).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + dontGoUp = (cmpResult >= 0); + } + else { + l = 0; + h = *keyOfsInOut; + int cmpResult = customBSONCmp(getFullKey(bucket, l).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + dontGoUp = (cmpResult <= 0); + } + + pair<DiskLoc, int> bestParent; + + if (dontGoUp) { + // this comparison result assures h > l + if (!customFind(l, + h, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction, + thisLocInOut, + keyOfsInOut, + bestParent)) { + return; + } + } + else { + // go up parents until rightmost/leftmost node is >=/<= target or at top + while (!bucket->parent.isNull()) { + *thisLocInOut = bucket->parent; + bucket = getBucket(*thisLocInOut); + + if (direction > 0) { + if (customBSONCmp(getFullKey(bucket, bucket->n - 1).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction) >= 0 ) { + break; + } + } + else { + if (customBSONCmp(getFullKey(bucket, 0).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction) <= 0) { + break; + } + } + } + } + + customLocate(txn, + thisLocInOut, + keyOfsInOut, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + direction, + bestParent); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn, + DiskLoc* locInOut, + int* keyOfsInOut, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction, + pair<DiskLoc, int>& bestParent) const { + + BucketType* bucket = getBucket(*locInOut); + + if (0 == bucket->n) { + *locInOut = DiskLoc(); + return; + } + + // go down until find smallest/biggest >=/<= target + for (;;) { + int 
l = 0; + int h = bucket->n - 1; + + // +direction: 0, -direction: h + int z = (direction > 0) ? 0 : h; + + // leftmost/rightmost key may possibly be >=/<= search key + int res = customBSONCmp(getFullKey(bucket, z).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + + + if (direction * res >= 0) { + DiskLoc next; + *keyOfsInOut = z; + + if (direction > 0) { + dassert(z == 0); + next = getKeyHeader(bucket, 0).prevChildBucket; + } + else { + next = bucket->nextChild; + } + + if (!next.isNull()) { + bestParent = pair<DiskLoc, int>(*locInOut, *keyOfsInOut); + *locInOut = next; + bucket = getBucket(*locInOut); + continue; + } + else { + return; + } + } + + res = customBSONCmp(getFullKey(bucket, h - z).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + + if (direction * res < 0) { + DiskLoc next; + if (direction > 0) { + next = bucket->nextChild; + } + else { + next = getKeyHeader(bucket, 0).prevChildBucket; + } + + if (next.isNull()) { + // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc() + *locInOut = bestParent.first; + *keyOfsInOut = bestParent.second; + return; + } + else { + *locInOut = next; + bucket = getBucket(*locInOut); + continue; + } + } + + if (!customFind(l, + h, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction, + locInOut, + keyOfsInOut, + bestParent)) { + return; + } + + bucket = getBucket(*locInOut); + } + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::customFind(int low, + int high, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + const Ordering& order, + int direction, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + pair<DiskLoc, int>& bestParent) const { + + const BucketType* bucket = getBucket(*thisLocInOut); + + for (;;) { + if (low + 1 == high) 
{ + *keyOfsInOut = (direction > 0) ? high : low; + DiskLoc next = getKeyHeader(bucket, high).prevChildBucket; + if (!next.isNull()) { + bestParent = make_pair(*thisLocInOut, *keyOfsInOut); + *thisLocInOut = next; + return true; + } + else { + return false; + } + } + + int middle = low + (high - low) / 2; + + int cmp = customBSONCmp(getFullKey(bucket, middle).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + order, + direction); + + if (cmp < 0) { + low = middle; + } + else if (cmp > 0) { + high = middle; + } + else { + if (direction < 0) { + low = middle; + } + else { + high = middle; + } + } + } + } + + /** + * NOTE: Currently the Ordering implementation assumes a compound index will not have more keys + * than an unsigned variable has bits. The same assumption is used in the implementation below + * with respect to the 'mask' variable. + * + * 'l' is a regular bsonobj + * + * 'rBegin' is composed partly of an existing bsonobj, and the remaining keys are taken from a + * vector of elements that frequently changes + * + * see https://jira.mongodb.org/browse/SERVER-371 + */ + // static + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::customBSONCmp(const BSONObj& l, + const BSONObj& rBegin, + int rBeginLen, + bool rSup, + const vector<const BSONElement*>& rEnd, + const vector<bool>& rEndInclusive, + const Ordering& o, + int direction) const { + // XXX: make this readable + BSONObjIterator ll( l ); + BSONObjIterator rr( rBegin ); + vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); + vector< bool >::const_iterator inc = rEndInclusive.begin(); + unsigned mask = 1; + for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { + BSONElement lll = ll.next(); + BSONElement rrr = rr.next(); + ++rr2; + ++inc; + + int x = lll.woCompare( rrr, false ); + if ( o.descending( mask ) ) + x = -x; + if ( x != 0 ) + return x; + } + if ( rSup ) { + return -direction; + } + for( ; ll.more(); mask <<= 1 ) { + BSONElement lll = 
ll.next(); + BSONElement rrr = **rr2; + ++rr2; + int x = lll.woCompare( rrr, false ); + if ( o.descending( mask ) ) + x = -x; + if ( x != 0 ) + return x; + if ( !*inc ) { + return -direction; + } + ++inc; + } + return 0; + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::exists(OperationContext* txn, const KeyDataType& key) const { + int position = 0; + + // Find the DiskLoc + bool found; + + DiskLoc bucket = _locate(txn, getRootLoc(), key, &position, &found, minDiskLoc, 1); + + while (!bucket.isNull()) { + FullKey fullKey = getFullKey(getBucket(bucket), position); + if (fullKey.header.isUsed()) { + return fullKey.data.woEqual(key); + } + bucket = advance(txn, bucket, &position, 1); + } + + return false; + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) const { + KeyDataOwnedType theKey(key); + if (!wouldCreateDup(txn, theKey, loc)) { + return Status::OK(); + } + + return Status(ErrorCodes::DuplicateKey, dupKeyError(theKey)); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::wouldCreateDup(OperationContext* txn, + const KeyDataType& key, + const DiskLoc self) const { + int position; + bool found; + + DiskLoc posLoc = _locate(txn, getRootLoc(), key, &position, &found, minDiskLoc, 1); + + while (!posLoc.isNull()) { + FullKey fullKey = getFullKey(getBucket(posLoc), position); + if (fullKey.header.isUsed()) { + // TODO: we may not need fullKey.data until we know fullKey.header.isUsed() here + // and elsewhere. 
+ if (fullKey.data.woEqual(key)) { + return fullKey.recordLoc != self; + } + break; + } + + posLoc = advance(txn, posLoc, &position, 1); + } + return false; + } + + template <class BtreeLayout> + string BtreeLogic<BtreeLayout>::dupKeyError(const KeyDataType& key) const { + stringstream ss; + ss << "E11000 duplicate key error "; + ss << "index: " << _indexName << " "; + ss << "dup key: " << key.toString(); + return ss.str(); + } + + /** + * Find a key within this btree bucket. + * + * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the + * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, our + * performance is still good. + * + * assertIfDup: if the key exists (ignoring the recordLoc), uassert + * + * pos: for existing keys k0...kn-1. + * returns # it goes BEFORE. so key[pos-1] < key < key[pos] + * returns n if it goes after the last existing key. + * note result might be an Unused location! + */ + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::_find(OperationContext* txn, + BucketType* bucket, + const KeyDataType& key, + const DiskLoc& recordLoc, + bool errorIfDup, + int* keyPositionOut, + bool* foundOut) const { + + // XXX: fix the ctor for DiskLoc56bit so we can just convert w/o assignment operator + LocType genericRecordLoc; + genericRecordLoc = recordLoc; + + bool dupsChecked = false; + + int low = 0; + int high = bucket->n - 1; + int middle = (low + high) / 2; + + while (low <= high) { + FullKey fullKey = getFullKey(bucket, middle); + int cmp = key.woCompare(fullKey.data, _ordering); + + // The key data is the same. + if (0 == cmp) { + // Found the key in this bucket. If we're checking for dups... + if (errorIfDup) { + if (fullKey.header.isUnused()) { + // It's ok that the key is there if it is unused. We need to check that + // there aren't other entries for the key then. 
as it is very rare that + // we get here, we don't put any coding effort in here to make this + // particularly fast + if (!dupsChecked) { + // This is expensive and we only want to do it once(? -- when would + // it happen twice). + dupsChecked = true; + if (exists(txn, key)) { + if (wouldCreateDup(txn, key, genericRecordLoc)) { + return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000); + } + else { + return Status(ErrorCodes::UniqueIndexViolation, "FIXME"); + } + } + } + } + else { + if (fullKey.recordLoc == recordLoc) { + return Status(ErrorCodes::UniqueIndexViolation, "FIXME"); + } + else { + return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000); + } + } + } + + // If we're here dup keys are allowed, or the key is a dup but unused. + LocType recordLocCopy = fullKey.recordLoc; + + // We clear this bit so we can test equality without the used bit messing us up. + // XXX: document this + // XXX: kill this GETOFS stuff + recordLocCopy.GETOFS() &= ~1; + + // Set 'cmp' to the comparison w/the DiskLoc and fall through below. + cmp = recordLoc.compare(recordLocCopy); + } + + if (cmp < 0) { + high = middle - 1; + } + else if (cmp > 0) { + low = middle + 1; + } + else { + // Found it! + *keyPositionOut = middle; + *foundOut = true; + return Status::OK(); + } + + middle = (low + high) / 2; + } + + // Not found. + *keyPositionOut = low; + + // Some debugging checks. 
+ if (low != bucket->n) { + wassert(key.woCompare(getFullKey(bucket, low).data, _ordering) <= 0); + + if (low > 0) { + if (getFullKey(bucket, low - 1).data.woCompare(key, _ordering) > 0) { + DEV { + log() << key.toString() << endl; + log() << getFullKey(bucket, low - 1).data.toString() << endl; + } + wassert(false); + } + } + } + + *foundOut = false; + return Status::OK(); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::delBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) { + invariant(bucketLoc != getRootLoc()); + + _bucketDeletion->aboutToDeleteBucket(bucketLoc); + + BucketType* p = getBucket(bucket->parent); + int parentIdx = indexInParent(txn, bucket, bucketLoc); + *txn->recoveryUnit()->writing(&childLocForPos(p, parentIdx)) = DiskLoc(); + deallocBucket(txn, bucket, bucketLoc); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::deallocBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) { + bucket->n = BtreeLayout::INVALID_N_SENTINEL; + bucket->parent.Null(); + _recordStore->deleteRecord(txn, bucketLoc); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::restorePosition(OperationContext* txn, + const BSONObj& savedKey, + const DiskLoc& savedLoc, + int direction, + DiskLoc* bucketLocInOut, + int* keyOffsetInOut) const { + + // _keyOffset is -1 if the bucket was deleted. When buckets are deleted the Btree calls + // a clientcursor function that calls down to all BTree buckets. Really, this deletion + // thing should be kept BTree-internal. This'll go away with finer grained locking: we + // can hold on to a bucket for as long as we need it. 
        // Offset of -1 means the bucket was deleted while we were away (see comment above);
        // fall back to a full locate by the saved key/loc.
        if (-1 == *keyOffsetInOut) {
            locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
            return;
        }

        invariant(*keyOffsetInOut >= 0);

        BucketType* bucket = getBucket(*bucketLocInOut);
        invariant(bucket);
        invariant(BtreeLayout::INVALID_N_SENTINEL != bucket->n);

        // Fast path: the saved entry is still in the same slot.
        if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
            skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
            return;
        }

        // The saved key may have moved one slot left (presumably after a deletion before
        // it); check the previous slot before resorting to a full relocate.
        if (*keyOffsetInOut > 0) {
            (*keyOffsetInOut)--;
            if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
                skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
                return;
            }
        }

        // Slow path: relocate the saved key/loc from scratch.
        locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
    }

    /**
     * Returns true if slot 'keyPos' of 'bucket' still holds exactly the saved
     * (key, recordLoc) pair; key bytes are compared with binaryEqual().
     */
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::_keyIsAt(const BSONObj& savedKey,
                                           const DiskLoc& savedLoc,
                                           BucketType* bucket,
                                           int keyPos) const {
        if (keyPos >= bucket->n) {
            return false;
        }

        FullKey key = getFullKey(bucket, keyPos);
        if (!key.data.toBson().binaryEqual(savedKey)) {
            return false;
        }
        return key.header.recordLoc == savedLoc;
    }

    /**
     * Deletes the key at index 'p' from 'bucket'.
     *
     * May delete the bucket 'bucket' rendering 'bucketLoc' invalid.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::delKeyAtPos(OperationContext* txn,
                                              BucketType* bucket,
                                              const DiskLoc bucketLoc,
                                              int p) {
        invariant(bucket->n > 0);
        DiskLoc left = childLocForPos(bucket, p);
        if (bucket->n == 1) {
            if (left.isNull() && bucket->nextChild.isNull()) {
                _delKeyAtPos(bucket, p);
                if (isHead(bucket)) {
                    // we don't delete the top bucket ever
                }
                else {
                    if (!mayBalanceWithNeighbors(txn, bucket, bucketLoc)) {
                        // An empty bucket is only allowed as a transient state.  If
                        // there are no neighbors to balance with, we delete ourselves.
                        // This condition is only expected in legacy btrees.
+ delBucket(txn, bucket, bucketLoc); + } + } + return; + } + deleteInternalKey(txn, bucket, bucketLoc, p); + return; + } + + if (left.isNull()) { + _delKeyAtPos(bucket, p); + mayBalanceWithNeighbors(txn, bucket, bucketLoc); + } + else { + deleteInternalKey(txn, bucket, bucketLoc, p); + } + } + + /** + * This function replaces the specified key (k) by either the prev or next key in the btree + * (k'). We require that k have either a left or right child. If k has a left child, we set k' + * to the prev key of k, which must be a leaf present in the left child. If k does not have a + * left child, we set k' to the next key of k, which must be a leaf present in the right child. + * When we replace k with k', we copy k' over k (which may cause a split) and then remove k' + * from its original location. Because k' is stored in a descendent of k, replacing k by k' + * will not modify the storage location of the original k', and we can easily remove k' from its + * original location. + * + * This function is only needed in cases where k has a left or right child; in other cases a + * simpler key removal implementation is possible. + * + * NOTE on noncompliant BtreeBuilder btrees: It is possible (though likely rare) for btrees + * created by BtreeBuilder to have k' that is not a leaf, see SERVER-2732. These cases are + * handled in the same manner as described in the "legacy btree structures" note below. + * + * NOTE on legacy btree structures: In legacy btrees, k' can be a nonleaf. In such a case we + * 'delete' k by marking it as an unused node rather than replacing it with k'. Also, k' may be + * a leaf but marked as an unused node. In such a case we replace k by k', preserving the key's + * unused marking. This function is only expected to mark a key as unused when handling a + * legacy btree. 
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::deleteInternalKey(OperationContext* txn,
                                                    BucketType* bucket,
                                                    const DiskLoc bucketLoc,
                                                    int keypos) {
        DiskLoc lchild = childLocForPos(bucket, keypos);
        DiskLoc rchild = childLocForPos(bucket, keypos + 1);
        invariant(!lchild.isNull() || !rchild.isNull());
        // Walk toward the in-order predecessor (left child present) or successor (only a
        // right child) of the key being deleted -- the k' of the comment above.
        int advanceDirection = lchild.isNull() ? 1 : -1;
        int advanceKeyOfs = keypos;
        DiskLoc advanceLoc = advance(txn, bucketLoc, &advanceKeyOfs, advanceDirection);
        // advanceLoc must be a descendant of thisLoc, because thisLoc has a
        // child in the proper direction and all descendants of thisLoc must be
        // nonempty because they are not the root.
        BucketType* advanceBucket = getBucket(advanceLoc);

        // k' is not a leaf (legacy / BtreeBuilder btrees, see the note above): mark k as
        // unused instead of replacing it.
        if (!childLocForPos(advanceBucket, advanceKeyOfs).isNull()
            || !childLocForPos(advanceBucket, advanceKeyOfs + 1).isNull()) {

            markUnused(bucket, keypos);
            return;
        }

        FullKey kn = getFullKey(advanceBucket, advanceKeyOfs);
        // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
        // not affect packing or keys of advanceLoc and kn will be stable
        // during the following setInternalKey()
        setInternalKey(txn, bucket, bucketLoc, keypos, kn.recordLoc, kn.data,
                       childLocForPos(bucket, keypos),
                       childLocForPos(bucket, keypos + 1));
        // Now remove k' from its original (descendant) location.
        delKeyAtPos(txn, btreemod(txn, advanceBucket), advanceLoc, advanceKeyOfs);
    }

    /**
     * Replaces an empty bucket (n == 0) that still has a right child with that child:
     * the head pointer (when this bucket is the root) or the parent's child pointer is
     * redirected to nextChild.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::replaceWithNextChild(OperationContext* txn,
                                                       BucketType* bucket,
                                                       const DiskLoc bucketLoc) {

        invariant(bucket->n == 0 && !bucket->nextChild.isNull() );
        if (bucket->parent.isNull()) {
            // This bucket is the root: promote nextChild to be the new head.
            invariant(getRootLoc() == bucketLoc);
            _headManager->setHead(txn, bucket->nextChild);
        }
        else {
            // Redirect the parent's pointer at this bucket to point at nextChild instead.
            BucketType* parentBucket = getBucket(bucket->parent);
            int bucketIndexInParent = indexInParent(txn, bucket, bucketLoc);
            *txn->recoveryUnit()->writing(&childLocForPos(parentBucket, bucketIndexInParent)) =
                bucket->nextChild;
        }

*txn->recoveryUnit()->writing(&getBucket(bucket->nextChild)->parent) = bucket->parent; + _bucketDeletion->aboutToDeleteBucket(bucketLoc); + deallocBucket(txn, bucket, bucketLoc); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::canMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const int leftIndex) { + invariant(leftIndex >= 0 && leftIndex < bucket->n); + + DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex); + DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1); + + if (leftNodeLoc.isNull() || rightNodeLoc.isNull()) { + return false; + } + + int pos = 0; + + BucketType* leftBucket = getBucket(leftNodeLoc); + BucketType* rightBucket = getBucket(rightNodeLoc); + + int sum = BucketType::HeaderSize + + _packedDataSize(leftBucket, pos) + + _packedDataSize(rightBucket, pos) + + getFullKey(bucket, leftIndex).data.dataSize() + + sizeof(KeyHeaderType); + + return sum <= BtreeLayout::BucketSize; + } + + /** + * This implementation must respect the meaning and value of lowWaterMark. Also see comments in + * splitPos(). + */ + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::_rebalancedSeparatorPos(OperationContext* txn, + BucketType* bucket, + int leftIndex) { + int split = -1; + int rightSize = 0; + + const BucketType* l = childForPos(bucket, leftIndex); + const BucketType* r = childForPos(bucket, leftIndex + 1); + + int KNS = sizeof(KeyHeaderType); + int rightSizeLimit = ( l->topSize + + l->n * KNS + + getFullKey(bucket, leftIndex).data.dataSize() + + KNS + + r->topSize + + r->n * KNS ) / 2; + + // This constraint should be ensured by only calling this function + // if we go below the low water mark. 
+ invariant(rightSizeLimit < BtreeLayout::BucketBodySize); + + for (int i = r->n - 1; i > -1; --i) { + rightSize += getFullKey(r, i).data.dataSize() + KNS; + if (rightSize > rightSizeLimit) { + split = l->n + 1 + i; + break; + } + } + + if (split == -1) { + rightSize += getFullKey(bucket, leftIndex).data.dataSize() + KNS; + if (rightSize > rightSizeLimit) { + split = l->n; + } + } + + if (split == -1) { + for (int i = l->n - 1; i > -1; --i) { + rightSize += getFullKey(l, i).data.dataSize() + KNS; + if (rightSize > rightSizeLimit) { + split = i; + break; + } + } + } + + // safeguards - we must not create an empty bucket + if (split < 1) { + split = 1; + } + else if (split > l->n + 1 + r->n - 2) { + split = l->n + 1 + r->n - 2; + } + + return split; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::doMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex) { + + DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex); + DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1); + + BucketType* l = btreemod(txn, getBucket(leftNodeLoc)); + BucketType* r = btreemod(txn, getBucket(rightNodeLoc)); + + int pos = 0; + _packReadyForMod(l, pos); + _packReadyForMod(r, pos); + + // We know the additional keys below will fit in l because canMergeChildren() must be true. 
+ int oldLNum = l->n; + // left child's right child becomes old parent key's left child + FullKey knLeft = getFullKey(bucket, leftIndex); + pushBack(l, knLeft.recordLoc, knLeft.data, l->nextChild); + + for (int i = 0; i < r->n; ++i) { + FullKey kn = getFullKey(r, i); + pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket); + } + + l->nextChild = r->nextChild; + fixParentPtrs(txn, l, leftNodeLoc, oldLNum); + delBucket(txn, r, rightNodeLoc); + + childLocForPos(bucket, leftIndex + 1) = leftNodeLoc; + childLocForPos(bucket, leftIndex) = DiskLoc(); + _delKeyAtPos(bucket, leftIndex, true); + + if (bucket->n == 0) { + // Will trash bucket and bucketLoc. + // + // TODO To ensure all leaves are of equal height, we should ensure this is only called + // on the root. + replaceWithNextChild(txn, bucket, bucketLoc); + } + else { + mayBalanceWithNeighbors(txn, bucket, bucketLoc); + } + } + + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::indexInParent(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) const { + invariant(!bucket->parent.isNull()); + const BucketType* p = getBucket(bucket->parent); + if (p->nextChild == bucketLoc) { + return p->n; + } + + for (int i = 0; i < p->n; ++i) { + if (getKeyHeader(p, i).prevChildBucket == bucketLoc) { + return i; + } + } + + log() << "ERROR: can't find ref to child bucket.\n"; + log() << "child: " << bucketLoc << "\n"; + //dump(); + log() << "Parent: " << bucket->parent << "\n"; + //p->dump(); + invariant(false); + return -1; // just to compile + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::tryBalanceChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex) { + + // If we can merge, then we must merge rather than balance to preserve bucket utilization + // constraints. 
        // NOTE(review): tail of tryBalanceChildren() — its signature is above this excerpt.
        // If the two children can be merged we prefer the (cheaper) merge path, so report
        // "no balance performed" and let the caller fall through to doMergeChildren().
        if (canMergeChildren(txn, bucket, bucketLoc, leftIndex)) {
            return false;
        }

        doBalanceChildren(txn, btreemod(txn, bucket), bucketLoc, leftIndex);
        return true;
    }

    /**
     * Moves keys from the (over-full) left child 'l' into the (under-full) right child 'r',
     * rotating the old separator key at bucket[leftIndex] down into 'r' and promoting the key
     * at position 'split' in 'l' as the new separator.  All three buckets must already be
     * write-intent declared by the caller.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::doBalanceLeftToRight(OperationContext* txn,
                                                       BucketType* bucket,
                                                       const DiskLoc bucketLoc,
                                                       int leftIndex,
                                                       int split,
                                                       BucketType* l,
                                                       const DiskLoc lchild,
                                                       BucketType* r,
                                                       const DiskLoc rchild) {

        // TODO maybe do some audits the same way pushBack() does? As a precondition, rchild + the
        // old separator are <= half a body size, and lchild is at most completely full. Based on
        // the value of split, rchild will get <= half of the total bytes which is at most 75% of a
        // full body. So rchild will have room for the following keys:
        int rAdd = l->n - split;
        reserveKeysFront(r, rAdd);

        // Copy keys (split, l->n) from the left child into the freshly reserved slots of 'r',
        // keeping each key's left-child pointer.
        for (int i = split + 1, j = 0; i < l->n; ++i, ++j) {
            FullKey kn = getFullKey(l, i);
            setKey(r, j, kn.recordLoc, kn.data, kn.prevChildBucket);
        }

        // The old separator becomes the last of the moved keys; its left child is l's old
        // nextChild so no subtree is orphaned.
        FullKey leftIndexKN = getFullKey(bucket, leftIndex);
        setKey(r, rAdd - 1, leftIndexKN.recordLoc, leftIndexKN.data, l->nextChild);

        // The moved subtrees now hang off 'r'; update their parent pointers.
        fixParentPtrs(txn, r, rchild, 0, rAdd - 1);

        FullKey kn = getFullKey(l, split);
        l->nextChild = kn.prevChildBucket;

        // Because lchild is a descendant of thisLoc, updating thisLoc will not affect packing or
        // keys of lchild and kn will be stable during the following setInternalKey()
        setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);

        // lchild and rchild cannot be merged, so there must be >0 (actually more) keys to the left
        // of split.
        int zeropos = 0;
        truncateTo(l, split, zeropos);
    }

    /**
     * Mirror image of doBalanceLeftToRight(): moves keys from the (over-full) right child 'r'
     * into the (under-full) left child 'l', rotating the old separator down into 'l' and
     * promoting the appropriate key from 'r' as the new separator.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::doBalanceRightToLeft(OperationContext* txn,
                                                       BucketType* bucket,
                                                       const DiskLoc bucketLoc,
                                                       int leftIndex,
                                                       int split,
                                                       BucketType* l,
                                                       const DiskLoc lchild,
                                                       BucketType* r,
                                                       const DiskLoc rchild) {
        // As a precondition, lchild + the old separator are <= half a body size,
        // and rchild is at most completely full. Based on the value of split,
        // lchild will get less than half of the total bytes which is at most 75%
        // of a full body. So lchild will have room for the following keys:
        int lN = l->n;

        {
            // left child's right child becomes old parent key's left child
            FullKey kn = getFullKey(bucket, leftIndex);
            pushBack(l, kn.recordLoc, kn.data, l->nextChild);
        }

        // Append the first (split - lN - 1) keys of the right child onto 'l'.
        for (int i = 0; i < split - lN - 1; ++i) {
            FullKey kn = getFullKey(r, i);
            pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket);
        }

        {
            // The key at (split - lN - 1) in 'r' is promoted as the new separator; its left
            // child becomes l's new nextChild.
            FullKey kn = getFullKey(r, split - lN - 1);
            l->nextChild = kn.prevChildBucket;
            // Child lN was lchild's old nextChild, and don't need to fix that one.
            fixParentPtrs(txn, l, lchild, lN + 1, l->n);
            // Because rchild is a descendant of thisLoc, updating thisLoc will
            // not affect packing or keys of rchild and kn will be stable
            // during the following setInternalKey()
            setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
        }

        // lchild and rchild cannot be merged, so there must be >0 (actually more)
        // keys to the right of split.
        int zeropos = 0;
        dropFront(r, split - lN, zeropos);
    }

    /**
     * Rebalances the two children on either side of the separator at bucket[leftIndex],
     * packing both children first and then shifting keys toward whichever side is light.
     * Caller guarantees the children cannot simply be merged (see tryBalanceChildren()).
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::doBalanceChildren(OperationContext* txn,
                                                    BucketType* bucket,
                                                    const DiskLoc bucketLoc,
                                                    int leftIndex) {

        DiskLoc lchild = childLocForPos(bucket, leftIndex);
        DiskLoc rchild = childLocForPos(bucket, leftIndex + 1);

        int zeropos = 0;
        BucketType* l = btreemod(txn, getBucket(lchild));
        _packReadyForMod(l, zeropos);

        BucketType* r = btreemod(txn, getBucket(rchild));
        _packReadyForMod(r, zeropos);

        int split = _rebalancedSeparatorPos(txn, bucket, leftIndex);

        // By definition, if we are below the low water mark and cannot merge
        // then we must actively balance.
        invariant(split != l->n);
        if (split < l->n) {
            doBalanceLeftToRight(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
        }
        else {
            doBalanceRightToLeft(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
        }
    }

    /**
     * Called when 'bucket' has fallen below the low water mark.  Tries, in order: balance with
     * the right neighbor, balance with the left neighbor, merge with the right neighbor, merge
     * with the left neighbor.  Returns true if any restructuring took place.
     */
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::mayBalanceWithNeighbors(OperationContext* txn,
                                                          BucketType* bucket,
                                                          const DiskLoc bucketLoc) {
        // The root has no siblings; nothing to balance with.
        if (bucket->parent.isNull()) {
            return false;
        }

        // Still sufficiently full — leave it alone.
        if (_packedDataSize(bucket, 0) >= lowWaterMark()) {
            return false;
        }

        BucketType* p = getBucket(bucket->parent);
        int parentIdx = indexInParent(txn, bucket, bucketLoc);

        // TODO will missing neighbor case be possible long term? Should we try to merge/balance
        // somehow in that case if so?
        bool mayBalanceRight = (parentIdx < p->n) && !childLocForPos(p, parentIdx + 1).isNull();
        bool mayBalanceLeft = ( parentIdx > 0 ) && !childLocForPos(p, parentIdx - 1).isNull();

        // Balance if possible on one side - we merge only if absolutely necessary to preserve btree
        // bucket utilization constraints since that's a more heavy duty operation (especially if we
        // must re-split later).
        if (mayBalanceRight && tryBalanceChildren(txn, p, bucket->parent, parentIdx)) {
            return true;
        }

        if (mayBalanceLeft && tryBalanceChildren(txn, p, bucket->parent, parentIdx - 1)) {
            return true;
        }

        BucketType* pm = btreemod(txn, getBucket(bucket->parent));
        if (mayBalanceRight) {
            doMergeChildren(txn, pm, bucket->parent, parentIdx);
            return true;
        }
        else if (mayBalanceLeft) {
            doMergeChildren(txn, pm, bucket->parent, parentIdx - 1);
            return true;
        }

        return false;
    }

    /**
     * Removes the entry <key, recordLoc> from the index if present.
     * Returns true if the key was found (and deleted), false otherwise.
     */
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::unindex(OperationContext* txn,
                                          const BSONObj& key,
                                          const DiskLoc& recordLoc) {
        int pos;
        bool found = false;
        KeyDataOwnedType ownedKey(key);

        DiskLoc loc = _locate(txn, getRootLoc(), ownedKey, &pos, &found, recordLoc, 1);
        if (found) {
            BucketType* bucket = btreemod(txn, getBucket(loc));
            delKeyAtPos(txn, bucket, loc, pos);
            assertValid(_indexName, getRoot(), _ordering);
        }
        return found;
    }

    // The tree is empty iff the root bucket holds no keys.
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::isEmpty() const {
        return getRoot()->n == 0;
    }

    /**
     * Rewrites the parent pointer of every child bucket in [firstIndex, lastIndex] to point
     * back at 'bucketLoc'.  lastIndex == -1 means "through nextChild" (index bucket->n).
     *
     * This can cause a lot of additional page writes when we assign buckets to different parents.
     * Maybe get rid of parent ptrs?
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::fixParentPtrs(OperationContext* txn,
                                                BucketType* bucket,
                                                const DiskLoc bucketLoc,
                                                int firstIndex,
                                                int lastIndex) {

        invariant(getBucket(bucketLoc) == bucket);

        if (lastIndex == -1) {
            lastIndex = bucket->n;
        }

        for (int i = firstIndex; i <= lastIndex; i++) {
            const DiskLoc childLoc = childLocForPos(bucket, i);
            if (!childLoc.isNull()) {
                *txn->recoveryUnit()->writing(&getBucket(childLoc)->parent) = bucketLoc;
            }
        }
    }

    /**
     * Replaces the key at 'keypos' with <key, recordLoc> and wires up 'lchild'/'rchild' as the
     * new key's children.  Used when promoting a separator during merge/balance.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::setInternalKey(OperationContext* txn,
                                                 BucketType* bucket,
                                                 const DiskLoc bucketLoc,
                                                 int keypos,
                                                 const DiskLoc recordLoc,
                                                 const KeyDataType& key,
                                                 const DiskLoc lchild,
                                                 const DiskLoc rchild) {
        childLocForPos(bucket, keypos).Null();
        // This may leave the bucket empty (n == 0) which is ok only as a transient state. In the
        // instant case, the implementation of insertHere behaves correctly when n == 0 and as a
        // side effect increments n.
        _delKeyAtPos(bucket, keypos, true);

        // Ensure we do not orphan neighbor's old child.
        invariant(childLocForPos(bucket, keypos ) == rchild);

        // Just set temporarily - required to pass validation in insertHere()
        childLocForPos(bucket, keypos) = lchild;

        insertHere(txn, bucketLoc, keypos, key, recordLoc, lchild, rchild);
    }

    /**
     * insert a key in this bucket, splitting if necessary.
     *
     * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. NOTE
     * this function may free some data, and as a result the value passed for keypos may be invalid
     * after calling insertHere()
     *
     * Some of the write intent signaling below relies on the implementation of the optimized write
     * intent code in basicInsert().
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::insertHere(OperationContext* txn,
                                             const DiskLoc bucketLoc,
                                             int pos,
                                             const KeyDataType& key,
                                             const DiskLoc recordLoc,
                                             const DiskLoc leftChildLoc,
                                             const DiskLoc rightChildLoc) {

        BucketType* bucket = getBucket(bucketLoc);

        if (!basicInsert(txn, bucket, bucketLoc, pos, key, recordLoc)) {
            // If basicInsert() fails, the bucket will be packed as required by split().
            split(txn, btreemod(txn, bucket), bucketLoc, pos, recordLoc, key, leftChildLoc, rightChildLoc);
            return;
        }

        KeyHeaderType* kn = &getKeyHeader(bucket, pos);
        if (pos + 1 == bucket->n) {
            // It's the last key.
            if (bucket->nextChild != leftChildLoc) {
                // XXX log more
                invariant(false);
            }
            kn->prevChildBucket = bucket->nextChild;
            invariant(kn->prevChildBucket == leftChildLoc);
            *txn->recoveryUnit()->writing(&bucket->nextChild) = rightChildLoc;
            if (!rightChildLoc.isNull()) {
                *txn->recoveryUnit()->writing(&getBucket(rightChildLoc)->parent) = bucketLoc;
            }
        }
        else {
            kn->prevChildBucket = leftChildLoc;
            if (getKeyHeader(bucket, pos + 1).prevChildBucket != leftChildLoc) {
                // XXX: log more
                invariant(false);
            }
            const LocType *pc = &getKeyHeader(bucket, pos + 1).prevChildBucket;
            // Intent declared in basicInsert()
            *const_cast<LocType*>(pc) = rightChildLoc;
            if (!rightChildLoc.isNull()) {
                *txn->recoveryUnit()->writing(&getBucket(rightChildLoc)->parent) = bucketLoc;
            }
        }
    }

    /**
     * Splits 'bucket' around splitPos(), moving the upper keys into a freshly allocated right
     * bucket, promoting the key at the split point into the parent (creating a new root if
     * needed), and finally inserting <key, recordLoc> into whichever half it belongs to.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::split(OperationContext* txn,
                                        BucketType* bucket,
                                        const DiskLoc bucketLoc,
                                        int keypos,
                                        const DiskLoc recordLoc,
                                        const KeyDataType& key,
                                        const DiskLoc lchild,
                                        const DiskLoc rchild) {

        int split = splitPos(bucket, keypos);
        DiskLoc rLoc = _addBucket(txn);
        BucketType* r = btreemod(txn, getBucket(rLoc));

        // Keys above the split point move to the new right bucket.
        for (int i = split + 1; i < bucket->n; i++) {
            FullKey kn = getFullKey(bucket, i);
            pushBack(r, kn.recordLoc, kn.data, kn.prevChildBucket);
        }
        r->nextChild = bucket->nextChild;
        assertValid(_indexName, r, _ordering);

        r = NULL;
        fixParentPtrs(txn, getBucket(rLoc), rLoc);

        FullKey splitkey = getFullKey(bucket, split);
        // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
        bucket->nextChild = splitkey.prevChildBucket;

        // Because thisLoc is a descendant of parent, updating parent will not affect packing or
        // keys of thisLoc and splitkey will be stable during the following:

        if (bucket->parent.isNull()) {
            // promote splitkey to a parent node; make a new parent if we were the root
            DiskLoc L = _addBucket(txn);
            BucketType* p = btreemod(txn, getBucket(L));
            pushBack(p, splitkey.recordLoc, splitkey.data, bucketLoc);
            p->nextChild = rLoc;
            assertValid(_indexName, p, _ordering);
            bucket->parent = L;
            _headManager->setHead(txn, L);
            *txn->recoveryUnit()->writing(&getBucket(rLoc)->parent) = bucket->parent;
        }
        else {
            // set this before calling _insert - if it splits it will do fixParent() logic and
            // change the value.
            *txn->recoveryUnit()->writing(&getBucket(rLoc)->parent) = bucket->parent;
            _insert(txn,
                    getBucket(bucket->parent),
                    bucket->parent,
                    splitkey.data,
                    splitkey.recordLoc,
                    true, // dupsallowed
                    bucketLoc,
                    rLoc);
        }

        int newpos = keypos;
        // note this may trash splitkey.key. thus we had to promote it before finishing up here.
        truncateTo(bucket, split, newpos);

        // add our new key, there is room now
        if (keypos <= split) {
            insertHere(txn, bucketLoc, newpos, key, recordLoc, lchild, rchild);
        }
        else {
            // Key lands in the new right bucket; translate keypos past the promoted separator.
            int kp = keypos - split - 1;
            invariant(kp >= 0);
            insertHere(txn, rLoc, kp, key, recordLoc, lchild, rchild);
        }
    }

    /**
     * DocWriter that reserves '_sz' bytes in the record store without writing anything —
     * used to allocate raw space for a new btree bucket.
     */
    class DummyDocWriter : public DocWriter {
    public:
        DummyDocWriter(size_t sz) : _sz(sz) { }
        virtual void writeDocument(char* buf) const { /* no-op */ }
        virtual size_t documentSize() const { return _sz; }
    private:
        size_t _sz;
    };

    /**
     * Allocates and installs the root bucket.
     * Returns OK if the index was uninitialized before, InternalError otherwise.
     */
    template <class BtreeLayout>
    Status BtreeLogic<BtreeLayout>::initAsEmpty(OperationContext* txn) {
        if (!_headManager->getHead().isNull()) {
            return Status(ErrorCodes::InternalError, "index already initialized");
        }

        _headManager->setHead(txn, _addBucket(txn));
        return Status::OK();
    }

    /**
     * Allocates one BucketSize record in the record store and initializes it as an empty
     * bucket.  uasserts on allocation failure.
     */
    template <class BtreeLayout>
    DiskLoc BtreeLogic<BtreeLayout>::_addBucket(OperationContext* txn) {
        DummyDocWriter docWriter(BtreeLayout::BucketSize);
        StatusWith<DiskLoc> loc = _recordStore->insertRecord(txn, &docWriter, false);
        // XXX: remove this(?) or turn into massert or sanely bubble it back up.
        uassertStatusOK(loc.getStatus());

        // this is a new bucket, not referenced by anyone, probably don't need this lock
        BucketType* b = btreemod(txn, getBucket(loc.getValue()));
        init(b);
        return loc.getValue();
    }

    // Debug helper: logs every key (and child pointer offset) in 'bucket', indented by
    // 'indentLength' to show tree depth.
    // static
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::dumpBucket(const BucketType* bucket, int indentLength) {
        log() << "BUCKET n:" << bucket->n << ", parent:" << hex << bucket->parent.getOfs() << dec;

        const string indent = string(indentLength, ' ');

        for (int i = 0; i < bucket->n; i++) {
            log() << '\n' << indent;
            FullKey k = getFullKey(bucket, i);
            string ks = k.data.toString();
            log() << " " << hex << k.prevChildBucket.getOfs() << "<-- prevChildBucket for " << i << '\n';
            log() << indent << " " << i << ' ' << ks.substr(0, 30)
                  << " Loc:" << k.recordLoc.toString() << dec;
            if (getKeyHeader(bucket, i).isUnused()) {
                log() << " UNUSED";
            }
        }

        log() << "\n" << indent << " " << hex << bucket->nextChild.getOfs() << dec << endl;
    }

    // Returns the record location stored for the key at 'keyOffset' in the given bucket.
    template <class BtreeLayout>
    DiskLoc BtreeLogic<BtreeLayout>::getDiskLoc(const DiskLoc& bucketLoc, const int keyOffset) const {
        invariant(!bucketLoc.isNull());
        BucketType* bucket = getBucket(bucketLoc);
        return getKeyHeader(bucket, keyOffset).recordLoc;
    }

    // Returns the key at 'keyOffset' as a BSONObj, with sanity invariants on the bucket's
    // key count first.
    template <class BtreeLayout>
    BSONObj BtreeLogic<BtreeLayout>::getKey(const DiskLoc& bucketLoc, const int keyOffset) const {
        invariant(!bucketLoc.isNull());
        BucketType* bucket = getBucket(bucketLoc);
        int n = bucket->n;
        invariant(n != BtreeLayout::INVALID_N_SENTINEL);
        invariant(n >= 0);
        invariant(n < 10000);
        invariant(n != 0xffff);

        invariant(keyOffset >= 0);
        invariant(keyOffset < n);

        // XXX: should we really return an empty obj if keyOffset>=n?
        // NOTE(review): unreachable given the invariant(keyOffset < n) above.
        if (keyOffset >= n) {
            return BSONObj();
        }
        else {
            return getFullKey(bucket, keyOffset).data.toBson();
        }
    }

    // Advises the record store to warm all of this index's pages into memory.
    template <class BtreeLayout>
    Status BtreeLogic<BtreeLayout>::touch(OperationContext* txn) const {
        return _recordStore->touch( txn, NULL );
    }

    // Entry point for full-tree validation; see _fullValidate() for semantics.
    template <class BtreeLayout>
    long long BtreeLogic<BtreeLayout>::fullValidate(OperationContext* txn,
                                                    long long *unusedCount,
                                                    bool strict,
                                                    bool dumpBuckets,
                                                    unsigned depth) {
        return _fullValidate(txn, getRootLoc(), unusedCount, strict, dumpBuckets, depth);
    }

    /**
     * Recursively validates the subtree rooted at 'bucketLoc' and returns the number of used
     * keys in it.  Unused keys are tallied into *unusedCount (if non-NULL).  'strict' turns
     * parent-pointer mismatches into fatal invariants instead of wasserts; 'dumpBuckets'
     * logs every bucket visited, indented by 'depth'.
     */
    template <class BtreeLayout>
    long long BtreeLogic<BtreeLayout>::_fullValidate(OperationContext* txn,
                                                     const DiskLoc bucketLoc,
                                                     long long *unusedCount,
                                                     bool strict,
                                                     bool dumpBuckets,
                                                     unsigned depth) {
        BucketType* bucket = getBucket(bucketLoc);
        assertValid(_indexName, bucket, _ordering, true);

        if (dumpBuckets) {
            log() << bucketLoc.toString() << ' ';
            dumpBucket(bucket, depth);
        }

        long long keyCount = 0;

        for (int i = 0; i < bucket->n; i++) {
            KeyHeaderType& kn = getKeyHeader(bucket, i);

            if (kn.isUsed()) {
                keyCount++;
            }
            else if (NULL != unusedCount) {
                ++(*unusedCount);
            }

            if (!kn.prevChildBucket.isNull()) {
                DiskLoc left = kn.prevChildBucket;
                BucketType* b = getBucket(left);

                // Every child must point back at this bucket.
                if (strict) {
                    invariant(b->parent == bucketLoc);
                }
                else {
                    wassert(b->parent == bucketLoc);
                }

                keyCount += _fullValidate(txn, left, unusedCount, strict, dumpBuckets, depth + 1);
            }
        }

        if (!bucket->nextChild.isNull()) {
            BucketType* b = getBucket(bucket->nextChild);
            if (strict) {
                invariant(b->parent == bucketLoc);
            }
            else {
                wassert(b->parent == bucketLoc);
            }

            keyCount += _fullValidate(txn, bucket->nextChild, unusedCount, strict, dumpBuckets, depth + 1);
        }

        return keyCount;
    }

    // XXX: remove this(?) used to not dump every key in assertValid.
+ int nDumped = 0; + + // static + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::assertValid(const std::string& ns, + BucketType* bucket, + const Ordering& ordering, + bool force) { + if (!force) { + return; + } + + // this is very slow so don't do often + { + static int _k; + if (++_k % 128) { + return; + } + } + + DEV { + // slow: + for (int i = 0; i < bucket->n - 1; i++) { + FullKey firstKey = getFullKey(bucket, i); + FullKey secondKey = getFullKey(bucket, i + 1); + int z = firstKey.data.woCompare(secondKey.data, ordering); + if (z > 0) { + log() << "ERROR: btree key order corrupt. Keys:" << endl; + if (++nDumped < 5) { + for (int j = 0; j < bucket->n; j++) { + log() << " " << getFullKey(bucket, j).data.toString() << endl; + } + dumpBucket(bucket); + } + wassert(false); + break; + } + else if (z == 0) { + if (!(firstKey.header.recordLoc < secondKey.header.recordLoc)) { + log() << "ERROR: btree key order corrupt (recordlocs wrong):" << endl; + log() << " k(" << i << ")" << firstKey.data.toString() + << " RL:" << firstKey.header.recordLoc.toString() << endl; + log() << " k(" << i + 1 << ")" << secondKey.data.toString() + << " RL:" << secondKey.header.recordLoc.toString() << endl; + wassert(firstKey.header.recordLoc < secondKey.header.recordLoc); + } + } + } + } + else { + //faster: + if (bucket->n > 1) { + FullKey k1 = getFullKey(bucket, 0); + FullKey k2 = getFullKey(bucket, bucket->n - 1); + int z = k1.data.woCompare(k2.data, ordering); + //wassert( z <= 0 ); + if (z > 0) { + log() << "Btree keys out of order in collection " << ns; + ONCE { + dumpBucket(bucket); + } + invariant(false); + } + } + } + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::insert(OperationContext* txn, + const BSONObj& rawKey, + const DiskLoc& value, + bool dupsAllowed) { + KeyDataOwnedType key(rawKey); + + if (key.dataSize() > BtreeLayout::KeyMax) { + string msg = str::stream() << "Btree::insert: key too large to index, failing " + << _indexName << ' ' 
+ << key.dataSize() << ' ' << key.toString(); + return Status(ErrorCodes::KeyTooLong, msg); + } + + Status status = _insert(txn, + getRoot(), + getRootLoc(), + key, + value, + dupsAllowed, + DiskLoc(), + DiskLoc()); + + assertValid(_indexName, getRoot(), _ordering); + return status; + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::_insert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const KeyDataType& key, + const DiskLoc recordLoc, + bool dupsAllowed, + const DiskLoc leftChild, + const DiskLoc rightChild) { + invariant( key.dataSize() > 0 ); + + int pos; + bool found; + Status findStatus = _find(txn, bucket, key, recordLoc, !dupsAllowed, &pos, &found); + if (!findStatus.isOK()) { + return findStatus; + } + + if (found) { + static KeyHeaderType& header = getKeyHeader(bucket, pos); + if (header.isUnused()) { + LOG(4) << "btree _insert: reusing unused key" << endl; + massert(17433, "_insert: reuse key but lchild is not null", leftChild.isNull()); + massert(17434, "_insert: reuse key but rchild is not null", rightChild.isNull()); + txn->recoveryUnit()->writing(&header)->setUsed(); + return Status::OK(); + } + return Status(ErrorCodes::UniqueIndexViolation, "FIXME"); + } + + DiskLoc childLoc = childLocForPos(bucket, pos); + + // In current usage, rightChild is NULL for a new key and is not NULL when we are + // promoting a split key. These are the only two cases where _insert() is called + // currently. 
+ if (childLoc.isNull() || !rightChild.isNull()) { + insertHere(txn, bucketLoc, pos, key, recordLoc, leftChild, rightChild); + return Status::OK(); + } + else { + return _insert(txn, + getBucket(childLoc), + childLoc, + key, + recordLoc, + dupsAllowed, + DiskLoc(), + DiskLoc()); + } + } + + template <class BtreeLayout> + DiskLoc BtreeLogic<BtreeLayout>::advance(OperationContext* txn, + const DiskLoc& bucketLoc, + int* posInOut, + int direction) const { + BucketType* bucket = getBucket(bucketLoc); + + if (*posInOut < 0 || *posInOut >= bucket->n ) { + log() << "ASSERT failure advancing btree bucket" << endl; + log() << " thisLoc: " << bucketLoc.toString() << endl; + log() << " keyOfs: " << *posInOut << " n:" << bucket->n << " direction: " << direction << endl; + // log() << bucketSummary() << endl; + invariant(false); + } + + // XXX document + int adj = direction < 0 ? 1 : 0; + int ko = *posInOut + direction; + + // Look down if we need to. + DiskLoc nextDownLoc = childLocForPos(bucket, ko + adj); + BucketType* nextDown = getBucket(nextDownLoc); + if (NULL != nextDown) { + for (;;) { + if (direction > 0) { + *posInOut = 0; + } + else { + *posInOut = nextDown->n - 1; + } + DiskLoc newNextDownLoc = childLocForPos(nextDown, *posInOut + adj); + BucketType* newNextDownBucket = getBucket(newNextDownLoc); + if (NULL == newNextDownBucket) { + break; + } + nextDownLoc = newNextDownLoc; + nextDown = newNextDownBucket; + } + return nextDownLoc; + } + + // Looking down isn't the right choice, move forward. + if (ko < bucket->n && ko >= 0) { + *posInOut = ko; + return bucketLoc; + } + + // Hit the end of the bucket, move up and over. 
+ DiskLoc childLoc = bucketLoc; + DiskLoc ancestor = getBucket(bucketLoc)->parent; + for (;;) { + if (ancestor.isNull()) { + break; + } + BucketType* an = getBucket(ancestor); + for (int i = 0; i < an->n; i++) { + if (childLocForPos(an, i + adj) == childLoc) { + *posInOut = i; + return ancestor; + } + } + invariant(direction < 0 || an->nextChild == childLoc); + // parent exhausted also, keep going up + childLoc = ancestor; + ancestor = an->parent; + } + + return DiskLoc(); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::keyIsUsed(const DiskLoc& loc, const int& pos) const { + return getKeyHeader(getBucket(loc), pos).isUsed(); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::locate(OperationContext* txn, + const BSONObj& key, + const DiskLoc& recordLoc, + const int direction, + int* posOut, + DiskLoc* bucketLocOut) const { + // Clear out any data. + *posOut = 0; + *bucketLocOut = DiskLoc(); + + bool found = false; + KeyDataOwnedType owned(key); + + *bucketLocOut = _locate(txn, getRootLoc(), owned, posOut, &found, recordLoc, direction); + + if (!found) { + return false; + } + + skipUnusedKeys(txn, bucketLocOut, posOut, direction); + + return found; + } + + /** + * Recursively walk down the btree, looking for a match of key and recordLoc. + * Caller should have acquired lock on bucketLoc. + */ + template <class BtreeLayout> + DiskLoc BtreeLogic<BtreeLayout>::_locate(OperationContext* txn, + const DiskLoc& bucketLoc, + const KeyDataType& key, + int* posOut, + bool* foundOut, + const DiskLoc& recordLoc, + const int direction) const { + int position; + BucketType* bucket = getBucket(bucketLoc); + // XXX: owned to not owned conversion(?) + _find(txn, bucket, key, recordLoc, false, &position, foundOut); + + // Look in our current bucket. + if (*foundOut) { + *posOut = position; + return bucketLoc; + } + + // Not in our current bucket. 'position' tells us where there may be a child. 
+ DiskLoc childLoc = childLocForPos(bucket, position); + + if (!childLoc.isNull()) { + DiskLoc inChild = _locate(txn, childLoc, key, posOut, foundOut, recordLoc, direction); + if (!inChild.isNull()) { + return inChild; + } + } + + *posOut = position; + + if (direction < 0) { + // The key *would* go to our left. + (*posOut)--; + if (-1 == *posOut) { + // But there's no space for that in our bucket. + return DiskLoc(); + } + else { + return bucketLoc; + } + } + else { + // The key would go to our right... + if (bucket->n == *posOut) { + return DiskLoc(); + } + else { + // But only if there is space. + return bucketLoc; + } + } + } + + // TODO relcoate + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::isHead(BucketType* bucket) { + return bucket->parent.isNull(); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::getBucket(const DiskLoc dl) const { + if (dl.isNull()) { + return NULL; + } + + RecordData recordData = _recordStore->dataFor(dl); + + // we need to be working on the raw bytes, not a transient copy + invariant(!recordData.isOwned()); + + return reinterpret_cast<BucketType*>(const_cast<char*>(recordData.data())); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::getRoot() const { + return getBucket(_headManager->getHead()); + } + + template <class BtreeLayout> + DiskLoc + BtreeLogic<BtreeLayout>::getRootLoc() const { + return _headManager->getHead(); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::childForPos(BucketType* bucket, int pos) const { + DiskLoc loc = childLocForPos(bucket, pos); + return getBucket(loc); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::LocType& + BtreeLogic<BtreeLayout>::childLocForPos(BucketType* bucket, int pos) { + if (bucket->n == pos) { + return bucket->nextChild; + } + else { + return getKeyHeader(bucket, 
pos).prevChildBucket; + } + } + + // + // And, template stuff. + // + + // V0 format. + template struct FixedWidthKey<DiskLoc>; + template class BtreeLogic<BtreeLayoutV0>; + + // V1 format. + template struct FixedWidthKey<DiskLoc56Bit>; + template class BtreeLogic<BtreeLayoutV1>; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h new file mode 100644 index 00000000000..ff7d7718de9 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h @@ -0,0 +1,593 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. 
If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/catalog/head_manager.h" +#include "mongo/db/catalog/index_catalog_entry.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/btree/btree_ondisk.h" +#include "mongo/db/storage/mmap_v1/btree/key.h" +#include "mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h" + + +namespace mongo { + + class BucketDeletionNotification; + class RecordStore; + + // Used for unit-testing only + template <class BtreeLayout> class BtreeLogicTestBase; + template <class BtreeLayout> class ArtificialTreeBuilder; + + /** + * This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk + * format. + */ + template <class BtreeLayout> + class BtreeLogic { + public: + // AKA _keyNode + typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType; + + // AKA Key + typedef typename BtreeLayout::KeyType KeyDataType; + + // AKA KeyOwned + typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType; + + // AKA Loc + typedef typename BtreeLayout::LocType LocType; + + // AKA BucketBasics or BtreeBucket, either one. + typedef typename BtreeLayout::BucketType BucketType; + + /** + * 'head' manages the catalog information. + * 'store' allocates and frees buckets. + * 'ordering' is meta-information we store in the catalog. + * 'indexName' is a string identifying the index that we use to print errors with. 
+ */ + BtreeLogic(HeadManager* head, + RecordStore* store, + const Ordering& ordering, + const string& indexName, + BucketDeletionNotification* bucketDeletion) + : _headManager(head), + _recordStore(store), + _ordering(ordering), + _indexName(indexName), + _bucketDeletion(bucketDeletion) { + + } + + // + // Public-facing + // + + class Builder { + public: + typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType; + typedef typename BtreeLayout::KeyType KeyDataType; + + Status addKey(const BSONObj& key, const DiskLoc& loc); + + // XXX: status, outparam for # keys? + unsigned long long commit(bool mayInterrupt); + + private: + friend class BtreeLogic; + + Builder(BtreeLogic* logic, OperationContext* txn, bool dupsAllowed); + + // Direct ports of functionality + void newBucket(); + void buildNextLevel(DiskLoc loc, bool mayInterrupt); + void mayCommitProgressDurably(); + BucketType* _getModifiableBucket(DiskLoc loc); + BucketType* _getBucket(DiskLoc loc); + // Direct ports of functionality + + // Not owned. + BtreeLogic* _logic; + + // Direct port of names. + DiskLoc _cur; + DiskLoc _first; + BucketType* _b; + bool _committed; + bool _dupsAllowed; + long long _numAdded; + auto_ptr<KeyDataOwnedType> _keyLast; + + // Not owned. + OperationContext* _txn; + }; + + /** + * Caller owns the returned pointer. + * 'this' must outlive the returned pointer. + */ + Builder* newBuilder(OperationContext* txn, bool dupsAllowed); + + Status dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) const; + + Status insert(OperationContext* txn, + const BSONObj& rawKey, + const DiskLoc& value, + bool dupsAllowed); + + /** + * Navigates down the tree and locates the bucket and position containing a record with + * the specified <key, recordLoc> combination. + * + * @return true if the exact <key, recordLoc> was found. 
Otherwise, false and the + * bucketLocOut would contain the bucket containing key which is before or after the + * searched one (dependent on the direction). + */ + bool locate(OperationContext* txn, + const BSONObj& key, + const DiskLoc& recordLoc, + const int direction, + int* posOut, + DiskLoc* bucketLocOut) const; + + void advance(OperationContext* txn, + DiskLoc* bucketLocInOut, + int* posInOut, + int direction) const; + + bool exists(OperationContext* txn, const KeyDataType& key) const; + + bool unindex(OperationContext* txn, + const BSONObj& key, + const DiskLoc& recordLoc); + + bool isEmpty() const; + + long long fullValidate(OperationContext*, + long long *unusedCount, + bool strict, + bool dumpBuckets, + unsigned depth); + + DiskLoc getDiskLoc(const DiskLoc& bucketLoc, const int keyOffset) const; + + BSONObj getKey(const DiskLoc& bucketLoc, const int keyOffset) const; + + DiskLoc getHead() const { return _headManager->getHead(); } + + Status touch(OperationContext* txn) const; + + // + // Composite key navigation methods + // + + void customLocate(OperationContext* txn, + DiskLoc* locInOut, + int* keyOfsInOut, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const; + + void advanceTo(OperationContext*, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const; + + void restorePosition(OperationContext* txn, + const BSONObj& savedKey, + const DiskLoc& savedLoc, + int direction, + DiskLoc* bucketInOut, + int* keyOffsetInOut) const; + + // + // Creation and deletion + // + + /** + * Returns OK if the index was uninitialized before, error status otherwise. 
+ */ + Status initAsEmpty(OperationContext* txn); + + // + // Size constants + // + + static int lowWaterMark(); + + private: + friend class BtreeLogic::Builder; + + // Used for unit-testing only + friend class BtreeLogicTestBase<BtreeLayout>; + friend class ArtificialTreeBuilder<BtreeLayout>; + + /** + * This is an in memory wrapper for the variable length data associated with a + * KeyHeaderType. It points to on-disk data but is not itself on-disk data. + * + * This object and its BSONObj 'key' will become invalid if the KeyHeaderType data that owns + * this it is moved within the btree. In general, a KeyWrapper should not be expected to be + * valid after a write. + */ + struct FullKey { + FullKey(const BucketType* bucket, int i) + : header(getKeyHeader(bucket, i)), + prevChildBucket(header.prevChildBucket), + recordLoc(header.recordLoc), + data(bucket->data + header.keyDataOfs()) { } + + // This is actually a reference to something on-disk. + const KeyHeaderType& header; + + // These are actually in 'header'. + const LocType& prevChildBucket; + const LocType& recordLoc; + + // This is *not* memory-mapped but its members point to something on-disk. + KeyDataType data; + }; + + // + // Functions that depend on the templated type info but nothing in 'this'. 
+ // + + static LocType& childLocForPos(BucketType* bucket, int pos); + + static FullKey getFullKey(const BucketType* bucket, int i); + + static KeyHeaderType& getKeyHeader(BucketType* bucket, int i); + + static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i); + + static char* dataAt(BucketType* bucket, short ofs); + + static void markUnused(BucketType* bucket, int keypos); + + static int totalDataSize(BucketType* bucket); + + static void init(BucketType* bucket); + + static int _alloc(BucketType* bucket, int bytes); + + static void _unalloc(BucketType* bucket, int bytes); + + static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false); + + static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType *keyDataOut); + + static bool mayDropKey(BucketType* bucket, int index, int refPos); + + static int _packedDataSize(BucketType* bucket, int refPos); + + static void setPacked(BucketType* bucket); + + static void setNotPacked(BucketType* bucket); + + static BucketType* btreemod(OperationContext* txn, BucketType* bucket); + + static int splitPos(BucketType* bucket, int keypos); + + static void reserveKeysFront(BucketType* bucket, int nAdd); + + static void setKey(BucketType* bucket, + int i, + const DiskLoc recordLoc, + const KeyDataType &key, + const DiskLoc prevChildBucket); + + static bool isHead(BucketType* bucket); + + static void dumpBucket(const BucketType* bucket, int indentLength = 0); + + static void assertValid(const std::string& ns, + BucketType* bucket, + const Ordering& ordering, + bool force = false); + + // + // 'this'-specific helpers (require record store, catalog information, or ordering, or type + // information). 
+ // + + bool basicInsert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int& keypos, + const KeyDataType& key, + const DiskLoc recordLoc); + + void dropFront(BucketType* bucket, int nDrop, int& refpos); + + void _pack(OperationContext* txn, BucketType* bucket, const DiskLoc thisLoc, int &refPos); + + void customLocate(OperationContext* txn, + DiskLoc* locInOut, + int* keyOfsInOut, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction, + pair<DiskLoc, int>& bestParent) const; + + Status _find(OperationContext* txn, + BucketType* bucket, + const KeyDataType& key, + const DiskLoc& recordLoc, + bool errorIfDup, + int* keyPositionOut, + bool* foundOut) const; + + bool customFind(int low, + int high, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + const Ordering& order, + int direction, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + pair<DiskLoc, int>& bestParent) const; + + void advanceToImpl(OperationContext* txn, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const; + + bool wouldCreateDup(OperationContext* txn, + const KeyDataType& key, + const DiskLoc self) const; + + bool keyIsUsed(const DiskLoc& loc, const int& pos) const; + + void skipUnusedKeys(OperationContext* txn, + DiskLoc* loc, + int* pos, + int direction) const; + + DiskLoc advance(OperationContext* txn, + const DiskLoc& bucketLoc, + int* posInOut, + int direction) const; + + DiskLoc _locate(OperationContext* txn, + const DiskLoc& bucketLoc, + const KeyDataType& key, + int* posOut, + bool* foundOut, + const DiskLoc& recordLoc, + const int direction) const; + + long long _fullValidate(OperationContext* txn, + const 
DiskLoc bucketLoc, + long long *unusedCount, + bool strict, + bool dumpBuckets, + unsigned depth); + + DiskLoc _addBucket(OperationContext* txn); + + bool canMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const int leftIndex); + + // has to look in children of 'bucket' and requires record store + int _rebalancedSeparatorPos(OperationContext* txn, + BucketType* bucket, + int leftIndex); + + void _packReadyForMod(BucketType* bucket, int &refPos); + + void truncateTo(BucketType* bucket, int N, int &refPos); + + void split(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int keypos, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc lchild, + const DiskLoc rchild); + + Status _insert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const KeyDataType& key, + const DiskLoc recordLoc, + bool dupsAllowed, + const DiskLoc leftChild, + const DiskLoc rightChild); + + // TODO take a BucketType*? 
+ void insertHere(OperationContext* txn, + const DiskLoc bucketLoc, + int pos, + const KeyDataType& key, + const DiskLoc recordLoc, + const DiskLoc leftChild, + const DiskLoc rightChild); + + std::string dupKeyError(const KeyDataType& key) const; + + void setInternalKey(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int keypos, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc lchild, + const DiskLoc rchild); + + void fixParentPtrs(OperationContext* trans, + BucketType* bucket, + const DiskLoc bucketLoc, + int firstIndex = 0, + int lastIndex = -1); + + bool mayBalanceWithNeighbors(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc); + + void doBalanceChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex); + + void doBalanceLeftToRight(OperationContext* txn, + BucketType* bucket, + const DiskLoc thisLoc, + int leftIndex, + int split, + BucketType* l, + const DiskLoc lchild, + BucketType* r, + const DiskLoc rchild); + + void doBalanceRightToLeft(OperationContext* txn, + BucketType* bucket, + const DiskLoc thisLoc, + int leftIndex, + int split, + BucketType* l, + const DiskLoc lchild, + BucketType* r, + const DiskLoc rchild); + + bool tryBalanceChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex); + + int indexInParent(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) const; + + void doMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex); + + void replaceWithNextChild(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc); + + void deleteInternalKey(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int keypos); + + void delKeyAtPos(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int p); + + void delBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc); + + 
void deallocBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc); + + bool _keyIsAt(const BSONObj& savedKey, + const DiskLoc& savedLoc, + BucketType* bucket, + int keyPos) const; + + // TODO 'this' for _ordering(?) + int customBSONCmp(const BSONObj& l, + const BSONObj& rBegin, + int rBeginLen, + bool rSup, + const std::vector<const BSONElement*>& rEnd, + const std::vector<bool>& rEndInclusive, + const Ordering& o, + int direction) const; + + // TODO needs 'this' for _ordering for sanity check + bool _pushBack(BucketType* bucket, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc prevChild); + + void pushBack(BucketType* bucket, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc prevChild) { + invariant(_pushBack(bucket, recordLoc, key, prevChild)); + } + + BucketType* childForPos(BucketType* bucket, int pos) const; + + BucketType* getBucket(const DiskLoc dl) const; + + BucketType* getRoot() const; + + DiskLoc getRootLoc() const; + + // + // Data + // + + // Not owned here. + HeadManager* _headManager; + + // Not owned here. + RecordStore* _recordStore; + + Ordering _ordering; + + string _indexName; + + // Not owned here + BucketDeletionNotification* _bucketDeletion; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp new file mode 100644 index 00000000000..ca6cdce9a9e --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp @@ -0,0 +1,2207 @@ +// btree_logic_test.cpp : Btree unit tests +// + +/** + * Copyright (C) 2014 MongoDB + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +// This file contains simple single-threaded tests, which check various aspects of the Btree logic +// + +#include "mongo/db/instance.h" +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" +#include "mongo/unittest/unittest.h" + + +namespace mongo { + + /** + * This class is made friend of BtreeLogic so we can add whatever private method accesses we + * need to it, to be used by the tests. 
+ */ + template<class BtreeLayoutType> + class BtreeLogicTestBase { + public: + typedef typename BtreeLayoutType::BucketType BucketType; + typedef typename BtreeLayoutType::FixedWidthKeyType FixedWidthKeyType; + + typedef typename BtreeLogic<BtreeLayoutType>::FullKey FullKey; + typedef typename BtreeLogic<BtreeLayoutType>::KeyDataOwnedType KeyDataOwnedType; + + BtreeLogicTestBase() : _helper(BSON("TheKey" << 1)) { + + } + + virtual ~BtreeLogicTestBase() { + + } + + protected: + void checkValidNumKeys(int nKeys) { + OperationContextNoop txn; + ASSERT_EQUALS(nKeys, _helper.btree.fullValidate(&txn, NULL, true, true, 0)); + } + + void insert(const BSONObj &key, const DiskLoc dl) { + OperationContextNoop txn; + _helper.btree.insert(&txn, key, dl, true); + } + + bool unindex(const BSONObj &key) { + OperationContextNoop txn; + return _helper.btree.unindex(&txn, key, _helper.dummyDiskLoc); + } + + void locate(const BSONObj &key, + int expectedPos, + bool expectedFound, + const DiskLoc &expectedLocation, + int direction) { + int pos; + DiskLoc loc; + OperationContextNoop txn; + ASSERT_EQUALS(expectedFound, + _helper.btree.locate(&txn, key, _helper.dummyDiskLoc, direction, &pos, &loc)); + ASSERT_EQUALS(expectedLocation, loc); + ASSERT_EQUALS(expectedPos, pos); + } + + const BucketType* child(const BucketType* bucket, int i) const { + verify(i <= bucket->n); + + DiskLoc diskLoc; + if (i == bucket->n) { + diskLoc = bucket->nextChild; + } + else { + FullKey fullKey = BtreeLogic<BtreeLayoutType>::getFullKey(bucket, i); + diskLoc = fullKey.prevChildBucket; + } + + verify(!diskLoc.isNull()); + + return _helper.btree.getBucket(diskLoc); + } + + BucketType* head() const { + return _helper.btree.getBucket(_helper.headManager.getHead()); + } + + void forcePackBucket(const DiskLoc bucketLoc) { + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + + bucket->topSize += bucket->emptySize; + bucket->emptySize = 0; + BtreeLogic<BtreeLayoutType>::setNotPacked(bucket); + } + + void 
truncateBucket(BucketType* bucket, int N, int &refPos) { + _helper.btree.truncateTo(bucket, N, refPos); + } + + int bucketPackedDataSize(BucketType* bucket, int refPos) { + return _helper.btree._packedDataSize(bucket, refPos); + } + + int bucketRebalancedSeparatorPos(const DiskLoc bucketLoc, int leftIndex) { + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + OperationContextNoop txn; + return _helper.btree._rebalancedSeparatorPos(&txn, bucket, leftIndex); + } + + FullKey getKey(const DiskLoc bucketLoc, int pos) const { + const BucketType* bucket = _helper.btree.getBucket(bucketLoc); + return BtreeLogic<BtreeLayoutType>::getFullKey(bucket, pos); + } + + void markKeyUnused(const DiskLoc bucketLoc, int keyPos) { + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + invariant(keyPos >= 0 && keyPos < bucket->n); + + _helper.btree.getKeyHeader(bucket, keyPos).setUnused(); + } + + DiskLoc newBucket() { + OperationContextNoop txn; + return _helper.btree._addBucket(&txn); + } + + /** + * Sets the nextChild pointer for the bucket at the specified location. 
+ */ + void setBucketNextChild(const DiskLoc bucketLoc, const DiskLoc nextChild) { + OperationContextNoop txn; + + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + bucket->nextChild = nextChild; + + _helper.btree.fixParentPtrs(&txn, bucket, bucketLoc); + } + + protected: + BtreeLogicTestHelper<BtreeLayoutType> _helper; + }; + + // + // TESTS + // + + template<class OnDiskFormat> + class SimpleCreate : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + this->checkValidNumKeys(0); + } + }; + + template<class OnDiskFormat> + class SimpleInsertDelete : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + BSONObj key = simpleKey('z'); + this->insert(key, this->_helper.dummyDiskLoc); + + this->checkValidNumKeys(1); + this->locate(key, 0, true, this->_helper.headManager.getHead(), 1); + + this->unindex(key); + + this->checkValidNumKeys(0); + this->locate(key, 0, false, DiskLoc(), 1); + } + }; + + template<class OnDiskFormat> + class SplitUnevenBucketBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 10; ++i) { + BSONObj shortKey = simpleKey(shortToken(i), 1); + this->insert(shortKey, this->_helper.dummyDiskLoc); + + BSONObj longKey = simpleKey(longToken(i), 800); + this->insert(longKey, this->_helper.dummyDiskLoc); + } + + this->checkValidNumKeys(20); + ASSERT_EQUALS(1, this->head()->n); + checkSplit(); + } + + protected: + virtual char shortToken(int i) const = 0; + virtual char longToken(int i) const = 0; + virtual void checkSplit() = 0; + + static char leftToken(int i) { + return 'a' + i; + } + + static char rightToken(int i) { + return 'z' - i; + } + }; + + template<class OnDiskFormat> + class SplitRightHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> { + private: 
+ virtual char shortToken(int i) const { + return this->leftToken(i); + } + virtual char longToken(int i) const { + return this->rightToken(i); + } + virtual void checkSplit() { + ASSERT_EQUALS(15, this->child(this->head(), 0)->n); + ASSERT_EQUALS(4, this->child(this->head(), 1)->n); + } + }; + + template<class OnDiskFormat> + class SplitLeftHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> { + private: + virtual char shortToken(int i) const { + return this->rightToken(i); + } + virtual char longToken(int i) const { + return this->leftToken(i); + } + virtual void checkSplit() { + ASSERT_EQUALS(4, this->child(this->head(), 0)->n); + ASSERT_EQUALS(15, this->child(this->head(), 1)->n); + } + }; + + template<class OnDiskFormat> + class MissingLocate : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 3; ++i) { + BSONObj k = simpleKey('b' + 2 * i); + this->insert(k, this->_helper.dummyDiskLoc); + } + + locateExtended(1, 'a', 'b', this->_helper.headManager.getHead()); + locateExtended(1, 'c', 'd', this->_helper.headManager.getHead()); + locateExtended(1, 'e', 'f', this->_helper.headManager.getHead()); + locateExtended(1, 'g', 'g' + 1, DiskLoc()); // of course, 'h' isn't in the index. 
+ + // old behavior + // locateExtended( -1, 'a', 'b', dl() ); + // locateExtended( -1, 'c', 'd', dl() ); + // locateExtended( -1, 'e', 'f', dl() ); + // locateExtended( -1, 'g', 'f', dl() ); + + locateExtended(-1, 'a', 'a' - 1, DiskLoc()); // of course, 'a' - 1 isn't in the index + locateExtended(-1, 'c', 'b', this->_helper.headManager.getHead()); + locateExtended(-1, 'e', 'd', this->_helper.headManager.getHead()); + locateExtended(-1, 'g', 'f', this->_helper.headManager.getHead()); + } + + private: + void locateExtended( + int direction, char token, char expectedMatch, DiskLoc expectedLocation) { + const BSONObj k = simpleKey(token); + int expectedPos = (expectedMatch - 'b') / 2; + + this->locate(k, expectedPos, false, expectedLocation, direction); + } + }; + + template<class OnDiskFormat> + class MissingLocateMultiBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc); + + // This causes split + this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc); + + int pos; + DiskLoc loc; + + // 'E' is the split point and should be in the head the rest should be ~50/50 + const BSONObj splitPoint = simpleKey('E', 800); + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc); + ASSERT_EQUALS(this->_helper.headManager.getHead(), loc); + ASSERT_EQUALS(0, pos); + + // Find 
the one before 'E' + int largePos; + DiskLoc largeLoc; + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc); + this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1); + + // Find the one after 'E' + int smallPos; + DiskLoc smallLoc; + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc); + this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1); + + ASSERT_NOT_EQUALS(smallLoc, largeLoc); + ASSERT_NOT_EQUALS(smallLoc, loc); + ASSERT_NOT_EQUALS(largeLoc, loc); + } + }; + + /** + * Validates that adding keys incrementally produces buckets, which are 90%/10% full. + */ + template<class OnDiskFormat> + class SERVER983 : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc); + + // This will cause split + this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc); + + int pos; + DiskLoc loc; + + // 'H' is the maximum 'large' interval key, 90% should be < 'H' and 10% larger + const BSONObj splitPoint = simpleKey('H', 800); + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc); + ASSERT_EQUALS(this->_helper.headManager.getHead(), loc); + ASSERT_EQUALS(0, pos); + + // Find the one before 'H' + int largePos; + DiskLoc largeLoc; + this->_helper.btree.locate(&txn, + splitPoint, 
this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc); + this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1); + + // Find the one after 'H' + int smallPos; + DiskLoc smallLoc; + this->_helper.btree.locate(&txn, + splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc); + this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1); + + ASSERT_NOT_EQUALS(smallLoc, largeLoc); + ASSERT_NOT_EQUALS(smallLoc, loc); + ASSERT_NOT_EQUALS(largeLoc, loc); + } + }; + + template<class OnDiskFormat> + class DontReuseUnused : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 10; ++i) { + const BSONObj k = simpleKey('b' + 2 * i, 800); + this->insert(k, this->_helper.dummyDiskLoc); + } + + const BSONObj root = simpleKey('p', 800); + this->unindex(root); + + this->insert(root, this->_helper.dummyDiskLoc); + this->locate(root, 0, true, this->head()->nextChild, 1); + } + }; + + template<class OnDiskFormat> + class MergeBucketsTestBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 10; ++i) { + const BSONObj k = simpleKey('b' + 2 * i, 800); + this->insert(k, this->_helper.dummyDiskLoc); + } + + // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords() - 1); + + long long expectedCount = 10 - unindexKeys(); + ASSERT_EQUALS(1, this->_helper.recordStore.numRecords() - 1); + + long long unusedCount = 0; + ASSERT_EQUALS(expectedCount, this->_helper.btree.fullValidate(&txn, &unusedCount, true, true, 0)); + ASSERT_EQUALS(0, unusedCount); + } + + protected: + virtual int unindexKeys() = 0; + }; + + template<class OnDiskFormat> + class MergeBucketsLeft : public MergeBucketsTestBase<OnDiskFormat> { + virtual int unindexKeys() { + BSONObj k = simpleKey('b', 800); + 
this->unindex(k); + + k = simpleKey('b' + 2, 800); + this->unindex(k); + + k = simpleKey('b' + 4, 800); + this->unindex(k); + + k = simpleKey('b' + 6, 800); + this->unindex(k); + + return 4; + } + }; + + template<class OnDiskFormat> + class MergeBucketsRight : public MergeBucketsTestBase<OnDiskFormat> { + virtual int unindexKeys() { + const BSONObj k = simpleKey('b' + 2 * 9, 800); + this->unindex(k); + return 1; + } + }; + + template<class OnDiskFormat> + class MergeBucketsDontReplaceHead : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 18; ++i) { + const BSONObj k = simpleKey('a' + i, 800); + this->insert(k, this->_helper.dummyDiskLoc); + } + + // numRecords() - 1, because fixedDiskLoc is actually in the record store too + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords() - 1); + + const BSONObj k = simpleKey('a' + 17, 800); + this->unindex(k); + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords() - 1); + + long long unusedCount = 0; + ASSERT_EQUALS(17, this->_helper.btree.fullValidate(&txn, &unusedCount, true, true, 0)); + ASSERT_EQUALS(0, unusedCount); + } + }; + + template<class OnDiskFormat> + class MergeBucketsDelInternal : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}"); + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}"); + } + }; + + template<class OnDiskFormat> + class MergeBucketsRightNull : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}"); + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}"); + } + }; + + // This comment was here during porting, not sure what it means: + // + // "Not yet handling this case" + template<class OnDiskFormat> + class DontMergeSingleBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},c:null}}"); + + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{d:{b:{a:null}}}"); + } + }; + + template<class OnDiskFormat> + 
class ParentMergeNonRightToLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}"); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // Child does not currently replace parent in this case. Also, the tree + // has 6 buckets + 1 for the this->_helper.dummyDiskLoc. + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class ParentMergeNonRightToRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}"); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ff"); + verify(this->unindex(k)); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // Child does not currently replace parent in this case. Also, the tree + // has 6 buckets + 1 for the this->_helper.dummyDiskLoc. 
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class CantMergeRightNoMerge : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}}," + "dd:null," + "_:{f:{e:null},h:{g:null}}}"); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{d:{b:{a:null},cc:{c:null}}," + "dd:null," + "_:{f:{e:null},h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class CantMergeLeftNoMerge : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}"); + + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "g"); + verify(this->unindex(k)); + + ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{c:{b:{a:null}},d:null,_:{f:{e:null}}}"); + } + }; + + template<class 
OnDiskFormat> + class MergeOption : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}"); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ee"); + verify(this->unindex(k)); + + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class ForceMergeLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}"); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ee"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class ForceMergeRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + 
builder.makeTree("{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}"); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ee"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMerge : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}"); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + // Height is not currently reduced in this case + builder.checkStructure("{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMergeRightBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}"); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, 
true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMergeDoubleRightBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}"); + + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + // no recursion currently in this case + builder.checkStructure("{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}"); + } + }; + + template<class OnDiskFormat> + class MergeSizeTestBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + MergeSizeTestBase() : _count(0) { + + } + + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + const BSONObj& topKey = biggestKey('m'); + + DiskLoc leftChild = this->newBucket(); + builder.push(this->_helper.headManager.getHead(), topKey, leftChild); + _count++; + + DiskLoc rightChild = this->newBucket(); + 
this->setBucketNextChild(this->_helper.headManager.getHead(), rightChild); + + _count += builder.fillBucketToExactSize(leftChild, leftSize(), 'a'); + _count += builder.fillBucketToExactSize(rightChild, rightSize(), 'n'); + + ASSERT(leftAdditional() <= 2); + if (leftAdditional() >= 2) { + builder.push(leftChild, bigKey('k'), DiskLoc()); + } + if (leftAdditional() >= 1) { + builder.push(leftChild, bigKey('l'), DiskLoc()); + } + + ASSERT(rightAdditional() <= 2); + if (rightAdditional() >= 2) { + builder.push(rightChild, bigKey('y'), DiskLoc()); + } + if (rightAdditional() >= 1) { + builder.push(rightChild, bigKey('z'), DiskLoc()); + } + + _count += leftAdditional() + rightAdditional(); + + initCheck(); + + const char *keys = delKeys(); + for (const char *i = keys; *i; ++i) { + long long unused = 0; + ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + ASSERT_EQUALS(0, unused); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + const BSONObj k = bigKey(*i); + this->unindex(k); + + --_count; + } + + long long unused = 0; + ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + ASSERT_EQUALS(0, unused); + + validate(); + + if (!merge()) { + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + } + else { + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + } + } + + protected: + virtual int leftAdditional() const { return 2; } + virtual int rightAdditional() const { return 2; } + virtual void initCheck() {} + virtual void validate() {} + virtual int leftSize() const = 0; + virtual int rightSize() const = 0; + virtual const char * delKeys() const { return "klyz"; } + virtual bool merge() const { return true; } + + static BSONObj bigKey(char a) { + return simpleKey(a, 801); + } 
+ + static BSONObj biggestKey(char a) { + int size = OnDiskFormat::KeyMax - bigSize() + 801; + return simpleKey(a, size); + } + + static int bigSize() { + return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(bigKey('a')).dataSize(); + } + + static int biggestSize() { + return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(biggestKey('a')).dataSize(); + } + + int _count; + }; + + template<class OnDiskFormat> + class MergeSizeJustRightRight : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightSize() const { + return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1; + } + + virtual int leftSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) - + (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1); + } + }; + + template<class OnDiskFormat> + class MergeSizeJustRightLeft : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int leftSize() const { + return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1; + } + + virtual int rightSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) - + (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1); + } + + virtual const char * delKeys() const { return "yzkl"; } + }; + + template<class OnDiskFormat> + class MergeSizeRight : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() - 1; } + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; } + }; + + template<class OnDiskFormat> + class MergeSizeLeft : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; } + virtual int leftSize() const { return 
MergeSizeJustRightLeft<OnDiskFormat>::leftSize() - 1; } + }; + + template<class OnDiskFormat> + class NoMergeBelowMarkRight : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; } + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() - 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() - 1; } + virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class MergeSizeRightTooBig : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class MergeSizeLeftTooBig : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class MergeRightEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 1; } + virtual const char * delKeys() const { return "lz"; } + virtual int rightSize() const { return 0; } + virtual int leftSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class MergeMinRightEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightAdditional() const { return 1; } + virtual 
int leftAdditional() const { return 0; } + virtual const char * delKeys() const { return "z"; } + virtual int rightSize() const { return 0; } + virtual int leftSize() const { + return MergeSizeTestBase<OnDiskFormat>::bigSize() + + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class MergeLeftEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 1; } + virtual const char * delKeys() const { return "zl"; } + virtual int leftSize() const { return 0; } + virtual int rightSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class MergeMinLeftEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int leftAdditional() const { return 1; } + virtual int rightAdditional() const { return 0; } + virtual const char * delKeys() const { return "l"; } + virtual int leftSize() const { return 0; } + virtual int rightSize() const { + return MergeSizeTestBase<OnDiskFormat>::bigSize() + + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class BalanceRightEmpty : public MergeRightEmpty<OnDiskFormat> { + protected: + virtual int leftSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1; + } + + virtual bool merge() const { return false; } + + virtual void initCheck() { + _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_NOT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + + private: + BSONObj _oldTop; + }; + + template<class 
OnDiskFormat> + class BalanceLeftEmpty : public MergeLeftEmpty<OnDiskFormat> { + protected: + virtual int rightSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1; + } + + virtual bool merge() const { return false; } + + virtual void initCheck() { + _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_TRUE(_oldTop != this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + + private: + BSONObj _oldTop; + }; + + template<class OnDiskFormat> + class BalanceOneLeftToRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "b:{$20:null,$30:null,$40:null,$50:null,a:null}," + "_:{c:null}}"); + + ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x40, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null}," + "b:{$10:null,$20:null,$30:null,$50:null,a:null}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceOneRightToLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null}," + 
"b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null}," + "_:{c:null}}"); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x3, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$20:{$1:null,$2:null,$4:null,$10:null}," + "b:{$30:null,$40:null,$50:null,$60:null,$70:null}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceThreeLeftToRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null}," + "$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}}," + "b:{$30:null,$40:{$35:null},$50:{$45:null}}," + "_:{c:null}}"); + + ASSERT_EQUALS(23, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(15, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x30, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(15, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$9:{$1:{$0:null},$3:{$2:null}," + "$5:{$4:null},$7:{$6:null},_:{$8:null}}," + "b:{$11:{$10:null},$13:{$12:null},$20:{$14:null}," + "$40:{$35:null},$50:{$45:null}}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceThreeRightToLeft : public 
BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}}," + "b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null}," + "$70:{$65:null},$80:{$75:null}," + "$90:{$85:null},$100:{$95:null}}," + "_:{c:null}}"); + + ASSERT_EQUALS(25, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(16, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x5, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(24, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(16, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null}," + "$30:{$25:null},$40:{$35:null},_:{$45:null}}," + "b:{$60:{$55:null},$70:{$65:null},$80:{$75:null}," + "$90:{$85:null},$100:{$95:null}}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceSingleParentKey : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "_:{$20:null,$30:null,$40:null,$50:null,a:null}}"); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x40, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null}," + "_:{$10:null,$20:null,$30:null,$50:null,a:null}}"); + } + }; + + template<class OnDiskFormat> + class PackEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null}"); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + this->forcePackBucket(this->_helper.headManager.getHead()); + + typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head(); + + ASSERT_EQUALS(0, headBucket->n); + ASSERT_FALSE(headBucket->flags & Packed); + + int unused = 0; + this->truncateBucket(headBucket, 0, unused); + + ASSERT_EQUALS(0, headBucket->n); + ASSERT_EQUALS(0, headBucket->topSize); + ASSERT_EQUALS((int)OnDiskFormat::BucketBodySize, headBucket->emptySize); + ASSERT_TRUE(headBucket->flags & Packed); + } + }; + + template<class OnDiskFormat> + class PackedDataSizeEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null}"); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + this->forcePackBucket(this->_helper.headManager.getHead()); + + typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head(); + + ASSERT_EQUALS(0, headBucket->n); + ASSERT_FALSE(headBucket->flags & Packed); + ASSERT_EQUALS(0, this->bucketPackedDataSize(headBucket, 0)); + ASSERT_FALSE(headBucket->flags & Packed); + } + }; + + template<class OnDiskFormat> + class BalanceSingleParentKeyPackParent : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + 
builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "_:{$20:null,$30:null,$40:null,$50:null,a:null}}"); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + // force parent pack + this->forcePackBucket(this->_helper.headManager.getHead()); + + const BSONObj k = BSON("" << bigNumString(0x40, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null}," + "_:{$10:null,$20:null,$30:null,$50:null,a:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceSplitParent : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree( + "{$10$10:{$1:null,$2:null,$3:null,$4:null}," + "$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null}," + "$200:null,$300:null,$400:null,$500:null,$600:null," + "$700:null,$800:null,$900:null,_:{c:null}}"); + + ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x3, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(21, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$500:{ $30:{$1:null,$2:null,$4:null,$10$10:null,$20:null}," + "$100:{$40:null,$50:null,$60:null,$70:null,$80:null}," 
+ "$200:null,$300:null,$400:null}," + "_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}"); + } + }; + + template<class OnDiskFormat> + class RebalancedSeparatorBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree(treeSpec()); + modTree(); + + ASSERT_EQUALS(expectedSeparator(), + this->bucketRebalancedSeparatorPos( + this->_helper.headManager.getHead(), 0)); + } + + virtual string treeSpec() const = 0; + virtual int expectedSeparator() const = 0; + virtual void modTree() {} + }; + + template<class OnDiskFormat> + class EvenRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$7:{$1:null,$2$31f:null,$3:null," + "$4$31f:null,$5:null,$6:null}," + "_:{$8:null,$9:null,$10$31e:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceLeftCusp : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { + return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null}," + "_:{$7:null,$8:null,$9$31e:null,$10:null}}"; + } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceRightCusp : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceCenter : public 
RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class OddRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class OddRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class OddRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class RebalanceEmptyRight : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$b:null}}"; } + virtual void modTree() { + BSONObj k = BSON("" << bigNumString(0xb, 800)); + ASSERT(this->unindex(k)); + } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class RebalanceEmptyLeft : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$18:null,$19:null}}"; } + virtual void modTree() { + BSONObj k = BSON("" << bigNumString(0x1, 800)); + ASSERT(this->unindex(k)); + } + virtual int expectedSeparator() const { return 4; } + }; + + 
template<class OnDiskFormat> + class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; } + + virtual void initCheck() { + _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + + virtual bool merge() const { return false; } + + protected: + BSONObj _oldTop; + }; + + template<class OnDiskFormat> + class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize(); } + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; } + + virtual void validate() { + // Different top means we rebalanced + ASSERT_NOT_EQUALS(this->_oldTop, + this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + }; + + template<class OnDiskFormat> + class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; } + virtual void initCheck() { + this->_oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_EQUALS(this->_oldTop, + this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + virtual bool merge() const { return false; } + + protected: + BSONObj _oldTop; + }; + + template<class OnDiskFormat> + class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft<OnDiskFormat> { + virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize(); } + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; } + + virtual void validate() { + // Different top means we rebalanced + ASSERT_NOT_EQUALS(this->_oldTop, 
+ this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + }; + + template<class OnDiskFormat> + class PreferBalanceLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "$20:{$11:null,$12:null,$13:null,$14:null}," + "_:{$30:null}}"); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x12, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$5:{$1:null,$2:null,$3:null,$4:null}," + "$20:{$6:null,$10:null,$11:null,$13:null,$14:null}," + "_:{$30:null}}"); + } + }; + + template<class OnDiskFormat> + class PreferBalanceRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null}," + "$20:{$11:null,$12:null,$13:null,$14:null}," + "_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}"); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x12, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$10:{$1:null}," + "$31:{$11:null,$13:null,$14:null,$20:null}," + "_:{$32:null,$33:null,$34:null,$35:null,$36:null}}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMergeThenBalance : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}}," + "_:{$20:null,$30:null,$40:null,$50:null," + "$60:null,$70:null,$80:null,$90:null}}"); + + ASSERT_EQUALS(15, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x7, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure( + "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null}," + "_:{$50:null,$60:null,$70:null,$80:null,$90:null}}"); + } + }; + + template<class OnDiskFormat> + class DelEmptyNoNeighbors : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{b:{a:null}}"); + + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{b:null}"); + } + }; + + template<class OnDiskFormat> + class DelEmptyEmptyNeighbors : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,c:{b:null},d:null}"); + + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "b"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{a:null,c:null,d:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternal : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,c:{b:null},d:null}"); + + long long unused = 0; + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "c"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{a:null,b:null,d:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalReplaceWithUnused : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop 
txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,c:{b:null},d:null}"); + + const DiskLoc prevChildBucket = + this->getKey(this->_helper.headManager.getHead(), 1).prevChildBucket; + this->markKeyUnused(prevChildBucket, 0); + + long long unused = 0; + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + const BSONObj k = BSON("" << "c"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + // doesn't discriminate between used and unused + builder.checkStructure("{a:null,b:null,d:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalReplaceRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,_:{b:null}}"); + + long long unused = 0; + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{b:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalPromoteKey : public BtreeLogicTestBase<OnDiskFormat> 
{ + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}"); + + long long unused = 0; + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "y"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{a:null,e:{c:{b:null},d:null},z:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalPromoteRightKey : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,_:{e:{c:null},_:{f:null}}}"); + + long long unused = 0; + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{c:null,_:{e:null,f:null}}"); + } + }; + + template<class OnDiskFormat> + class DelInternalReplacementPrevNonNull : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + 
ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,d:{c:{b:null}},e:null}"); + + long long unused = 0; + ASSERT_EQUALS(5, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "d"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + builder.checkStructure("{a:null,d:{c:{b:null}},e:null}"); + + // Check 'unused' key + ASSERT(this->getKey(this->_helper.headManager.getHead(), 1).recordLoc.getOfs() & 1); + } + }; + + template<class OnDiskFormat> + class DelInternalReplacementNextNonNull : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,_:{c:null,_:{d:null}}}"); + + long long unused = 0; + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + builder.checkStructure("{a:null,_:{c:null,_:{d:null}}}"); + + // Check 'unused' key + ASSERT(this->getKey(this->_helper.headManager.getHead(), 0).recordLoc.getOfs() & 1); + } + }; + + template<class OnDiskFormat> + class 
DelInternalSplitPromoteLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}}," + "$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}"); + + long long unused = 0; + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << bigNumString(0x30, 0x10)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{$60:{$10:null,$20:null," + "$27:{$23:null,$25:null},$40:null,$50:null}," + "_:{$70:null,$80:null,$90:null,$100:null}}"); + } + }; + + template<class OnDiskFormat> + class DelInternalSplitPromoteRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null," + "$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}"); + + long long unused = 0; + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << bigNumString(0x100, 0x10)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the 
this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure( + "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null}," + "_:{$90:null,$97:{$93:null,$95:null}}}"); + } + }; + + /* This test requires the entire server to be linked-in and it is better implemented using + the JS framework. Disabling here and will put in jsCore. + + template<class OnDiskFormat> + class SignedZeroDuplication : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + ASSERT_EQUALS(0.0, -0.0); + DBDirectClient c; + + static const string ns("unittests.SignedZeroDuplication"); + + c.ensureIndex(ns, BSON("b" << 1), true); + c.insert(ns, BSON("b" << 0.0)); + c.insert(ns, BSON("b" << 1.0)); + c.update(ns, BSON("b" << 1.0), BSON("b" << -0.0)); + + ASSERT_EQUALS(1U, c.count(ns, BSON("b" << 0.0))); + } + }; + */ + +/* +// QUERY_MIGRATION: port later + class PackUnused : public Base { + public: + void run() { + for ( long long i = 0; i < 1000000; i += 1000 ) { + insert( i ); + } + string orig, after; + { + stringstream ss; + bt()->shape( ss ); + orig = ss.str(); + } + vector< string > toDel; + vector< string > other; + BSONObjBuilder start; + start.appendMinKey( "a" ); + BSONObjBuilder end; + end.appendMaxKey( "a" ); + auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), + id(), + start.done(), + end.done(), + false, + 1 ) ); + while( c->ok() ) { + bool has_child = + c->getBucket().btree()->keyNode(c->getKeyOfs()).prevChildBucket.isNull(); + + if (has_child) { + toDel.push_back( c->currKey().firstElement().valuestr() ); + } + else { + other.push_back( c->currKey().firstElement().valuestr() ); + } + c->advance(); + } + ASSERT( toDel.size() > 0 ); + for( vector< string >::const_iterator i = toDel.begin(); i != toDel.end(); ++i ) { + BSONObj o = BSON( "a" << *i ); + this->unindex( o ); + } + ASSERT( other.size() > 0 ); + for( vector< string >::const_iterator i = other.begin(); i 
!= other.end(); ++i ) { + BSONObj o = BSON( "a" << *i ); + this->unindex( o ); + } + + long long unused = 0; + ASSERT_EQUALS( 0, bt()->fullValidate(&txn, dl(), order(), &unused, true ) ); + + for ( long long i = 50000; i < 50100; ++i ) { + insert( i ); + } + + long long unused2 = 0; + ASSERT_EQUALS( 100, bt()->fullValidate(&txn, dl(), order(), &unused2, true ) ); + +// log() << "old unused: " << unused << ", new unused: " << unused2 << endl; +// + ASSERT( unused2 <= unused ); + } + protected: + void insert( long long n ) { + string val = bigNumString( n ); + BSONObj k = BSON( "a" << val ); + Base::insert( k ); + } + }; + + class DontDropReferenceKey : public PackUnused { + public: + void run() { + // with 80 root node is full + for ( long long i = 0; i < 80; i += 1 ) { + insert( i ); + } + + BSONObjBuilder start; + start.appendMinKey( "a" ); + BSONObjBuilder end; + end.appendMaxKey( "a" ); + BSONObj l = bt()->keyNode( 0 ).key.toBson(); + string toInsert; + auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), + id(), + start.done(), + end.done(), + false, + 1 ) ); + while( c->ok() ) { + if ( c->currKey().woCompare( l ) > 0 ) { + toInsert = c->currKey().firstElement().valuestr(); + break; + } + c->advance(); + } + // too much work to try to make this happen through inserts and deletes + // we are intentionally manipulating the btree bucket directly here + BtreeBucket::Loc* L = const_cast< BtreeBucket::Loc* >( &bt()->keyNode( 1 ).prevChildBucket ); + getDur().writing(L)->Null(); + getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused + BSONObj k = BSON( "a" << toInsert ); + Base::insert( k ); + } + }; + */ + + // + // TEST SUITE DEFINITION + // + + template<class OnDiskFormat> + class BtreeLogicTestSuite : public unittest::Suite { + public: + BtreeLogicTestSuite(const std::string& name) : Suite(name) { + + } + + void setupTests() { + add< SimpleCreate<OnDiskFormat> >(); + add< 
SimpleInsertDelete<OnDiskFormat> >(); + add< SplitRightHeavyBucket<OnDiskFormat> >(); + add< SplitLeftHeavyBucket<OnDiskFormat> >(); + add< MissingLocate<OnDiskFormat> >(); + add< MissingLocateMultiBucket<OnDiskFormat> >(); + add< SERVER983<OnDiskFormat> >(); + add< DontReuseUnused<OnDiskFormat> >(); + add< MergeBucketsLeft<OnDiskFormat> >(); + add< MergeBucketsRight<OnDiskFormat> >(); + add< MergeBucketsDontReplaceHead<OnDiskFormat> >(); + add< MergeBucketsDelInternal<OnDiskFormat> >(); + add< MergeBucketsRightNull<OnDiskFormat> >(); + add< DontMergeSingleBucket<OnDiskFormat> >(); + add< ParentMergeNonRightToLeft<OnDiskFormat> >(); + add< ParentMergeNonRightToRight<OnDiskFormat> >(); + add< CantMergeRightNoMerge<OnDiskFormat> >(); + add< CantMergeLeftNoMerge<OnDiskFormat> >(); + add< MergeOption<OnDiskFormat> >(); + add< ForceMergeLeft<OnDiskFormat> >(); + add< ForceMergeRight<OnDiskFormat> >(); + add< RecursiveMerge<OnDiskFormat> >(); + add< RecursiveMergeRightBucket<OnDiskFormat> >(); + add< RecursiveMergeDoubleRightBucket<OnDiskFormat> >(); + + add< MergeSizeJustRightRight<OnDiskFormat> >(); + add< MergeSizeJustRightLeft<OnDiskFormat> >(); + add< MergeSizeRight<OnDiskFormat> >(); + add< MergeSizeLeft<OnDiskFormat> >(); + add< NoMergeBelowMarkRight<OnDiskFormat> >(); + add< NoMergeBelowMarkLeft<OnDiskFormat> >(); + add< MergeSizeRightTooBig<OnDiskFormat> >(); + add< MergeSizeLeftTooBig<OnDiskFormat> >(); + add< MergeRightEmpty<OnDiskFormat> >(); + add< MergeMinRightEmpty<OnDiskFormat> >(); + add< MergeLeftEmpty<OnDiskFormat> >(); + add< MergeMinLeftEmpty<OnDiskFormat> >(); + add< BalanceRightEmpty<OnDiskFormat> >(); + add< BalanceLeftEmpty<OnDiskFormat> >(); + + add< BalanceOneLeftToRight<OnDiskFormat> >(); + add< BalanceOneRightToLeft<OnDiskFormat> >(); + add< BalanceThreeLeftToRight<OnDiskFormat> >(); + add< BalanceThreeRightToLeft<OnDiskFormat> >(); + add< BalanceSingleParentKey<OnDiskFormat> >(); + + add< PackEmptyBucket<OnDiskFormat> >(); + add< 
PackedDataSizeEmptyBucket<OnDiskFormat> >(); + + add< BalanceSingleParentKeyPackParent<OnDiskFormat> >(); + add< BalanceSplitParent<OnDiskFormat> >(); + add< EvenRebalanceLeft<OnDiskFormat> >(); + add< EvenRebalanceLeftCusp<OnDiskFormat> >(); + add< EvenRebalanceRight<OnDiskFormat> >(); + add< EvenRebalanceRightCusp<OnDiskFormat> >(); + add< EvenRebalanceCenter<OnDiskFormat> >(); + add< OddRebalanceLeft<OnDiskFormat> >(); + add< OddRebalanceRight<OnDiskFormat> >(); + add< OddRebalanceCenter<OnDiskFormat> >(); + add< RebalanceEmptyRight<OnDiskFormat> >(); + add< RebalanceEmptyLeft<OnDiskFormat> >(); + + add< NoMoveAtLowWaterMarkRight<OnDiskFormat> >(); + add< MoveBelowLowWaterMarkRight<OnDiskFormat> >(); + add< NoMoveAtLowWaterMarkLeft<OnDiskFormat> >(); + add< MoveBelowLowWaterMarkLeft<OnDiskFormat> >(); + + add< PreferBalanceLeft<OnDiskFormat> >(); + add< PreferBalanceRight<OnDiskFormat> >(); + add< RecursiveMergeThenBalance<OnDiskFormat> >(); + add< DelEmptyNoNeighbors<OnDiskFormat> >(); + add< DelEmptyEmptyNeighbors<OnDiskFormat> >(); + add< DelInternal<OnDiskFormat> >(); + add< DelInternalReplaceWithUnused<OnDiskFormat> >(); + add< DelInternalReplaceRight<OnDiskFormat> >(); + add< DelInternalPromoteKey<OnDiskFormat> >(); + add< DelInternalPromoteRightKey<OnDiskFormat> >(); + add< DelInternalReplacementPrevNonNull<OnDiskFormat> >(); + add< DelInternalReplacementNextNonNull<OnDiskFormat> >(); + add< DelInternalSplitPromoteLeft<OnDiskFormat> >(); + add< DelInternalSplitPromoteRight<OnDiskFormat> >(); + } + }; + + // Test suite for both V0 and V1 + static BtreeLogicTestSuite<BtreeLayoutV0> SUITE_V0("BTreeLogicTests_V0"); + static BtreeLogicTestSuite<BtreeLayoutV1> SUITE_V1("BTreeLogicTests_V1"); +} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h new file mode 100644 index 00000000000..7f91cd2fb27 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h @@ -0,0 +1,380 @@ +/** + * 
Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/storage/mmap_v1/btree/key.h" + +namespace mongo { + + const int OldBucketSize = 8192; + + // + // On-disk index format + // + +#pragma pack(1) + /** + * This is the fixed width data component for storage of a key within a bucket. It contains an + * offset pointer to the variable width bson data component. This may be 'unused', please see + * below. + * + * Why is this templated on Loc? 
Because V0 and V1 have different size DiskLoc(s) but otherwise + * the same layout. + */ + template <class LocType> + struct FixedWidthKey { + // + // Data + // + + /** + * The 'left' child bucket of this key. If this is the i-th key, it points to the i index + * child bucket. + */ + LocType prevChildBucket; + + /** + * The location of the record associated with this key. + */ + LocType recordLoc; + + /** + * Offset within current bucket of the variable width bson key for this _KeyNode. + */ + unsigned short _kdo; + + // + // Accessors / mutators + // + + short keyDataOfs() const { + return static_cast<short>(_kdo); + } + + void setKeyDataOfs(short s) { + _kdo = s; + invariant(s>=0); + } + + void setKeyDataOfsSavingUse(short s) { + // XXX kill this func + setKeyDataOfs(s); + } + + /** + * Unused keys are not returned by read operations. Keys may be marked + * as unused in cases where it is difficult to delete them while + * maintaining the constraints required of a btree. + * + * Setting ofs to odd is the sentinel for unused, as real recordLoc's + * are always even numbers. Note we need to keep its value basically + * the same as we use the recordLoc as part of the key in the index + * (to handle duplicate keys efficiently). + * + * Flagging keys as unused is a feature that is being phased out in favor + * of deleting the keys outright. The current btree implementation is + * not expected to mark a key as unused in a non legacy btree. + */ + void setUnused() { + recordLoc.GETOFS() |= 1; + } + + void setUsed() { recordLoc.GETOFS() &= ~1; } + + int isUnused() const { + return recordLoc.getOfs() & 1; + } + + int isUsed() const { + return !isUnused(); + } + }; + + /** + * This structure represents header data for a btree bucket. An object of + * this type is typically allocated inside of a buffer of size BucketSize, + * resulting in a full bucket with an appropriate header. 
+ * + * The body of a btree bucket contains an array of _KeyNode objects starting + * from its lowest indexed bytes and growing to higher indexed bytes. The + * body also contains variable width bson keys, which are allocated from the + * highest indexed bytes toward lower indexed bytes. + * + * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb| + * h = header data + * k = KeyNode data + * - = empty space + * b = bson key data + * u = unused (old) bson key data, that may be garbage collected + */ + struct BtreeBucketV0 { + /** + * Parent bucket of this bucket, which isNull() for the root bucket. + */ + DiskLoc parent; + + /** + * Given that there are n keys, this is the n index child. + */ + DiskLoc nextChild; + + /** + * Can be reused, value is 8192 in current pdfile version Apr2010 + */ + unsigned short _wasSize; + + /** + * zero + */ + unsigned short _reserved1; + + int flags; + + /** basicInsert() assumes the next three members are consecutive and in this order: */ + + /** Size of the empty region. */ + int emptySize; + + /** Size used for bson storage, including storage of old keys. */ + int topSize; + + /* Number of keys in the bucket. */ + int n; + + int reserved; + + /* Beginning of the bucket's body */ + char data[4]; + + // Precalculated size constants + enum { HeaderSize = 40 }; + }; + + // BtreeBucketV0 is part of the on-disk format, so it should never be changed + BOOST_STATIC_ASSERT( + sizeof(BtreeBucketV0) - sizeof(reinterpret_cast<BtreeBucketV0*>(NULL)->data) + == BtreeBucketV0::HeaderSize); + + /** + * A variant of DiskLoc used by the V1 bucket type. 
+ */ + struct DiskLoc56Bit { + // + // Data + // + + int ofs; + + unsigned char _a[3]; + + // + // Accessors XXX rename these, this is terrible + // + + int& GETOFS() { return ofs; } + + int getOfs() const { return ofs; } + + // + // Comparison + // + + bool isNull() const { return ofs < 0; } + + unsigned long long toLongLong() const { + // endian + unsigned long long result = ofs; + char* cursor = reinterpret_cast<char *>(&result); + *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]); + *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]); + *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0); + return result; + } + + bool operator<(const DiskLoc56Bit& rhs) const { + // the ordering of dup keys in btrees isn't too critical, but we'd like to put items + // that are close together on disk close together in the tree, so we do want the file # + // to be the most significant bytes + return toLongLong() < rhs.toLongLong(); + } + + int compare(const DiskLoc56Bit& rhs) const { + unsigned long long a = toLongLong(); + unsigned long long b = rhs.toLongLong(); + if ( a < b ) { + return -1; + } + else { + return a == b ? 0 : 1; + } + } + + bool operator==(const DiskLoc56Bit& rhs) const { + return toLongLong() == rhs.toLongLong(); + } + + bool operator!=(const DiskLoc56Bit& rhs) const { + return toLongLong() != rhs.toLongLong(); + } + + bool operator==(const DiskLoc& rhs) const { + return DiskLoc(*this) == rhs; + } + + bool operator!=(const DiskLoc& rhs) const { + return !(*this==rhs); + } + + // + // Mutation + // + + enum { + // first bit of offsets used in _KeyNode we don't use -1 here. 
+ OurNullOfs = -2 + }; + + void Null() { + ofs = OurNullOfs; + _a[0] = _a[1] = _a[2] = 0; + } + + void operator=(const DiskLoc& loc) { + ofs = loc.getOfs(); + int la = loc.a(); + invariant( la <= 0xffffff ); // must fit in 3 bytes + if( la < 0 ) { + if ( la != -1 ) { + log() << "btree diskloc isn't negative 1: " << la << std::endl; + invariant ( la == -1 ); + } + la = 0; + ofs = OurNullOfs; + } + memcpy(_a, &la, 3); // endian + } + + // + // Type Conversion + // + + operator const DiskLoc() const { + // endian + if( isNull() ) return DiskLoc(); + unsigned a = *((unsigned *) (_a-1)); + return DiskLoc(a >> 8, ofs); + } + + std::string toString() const { return DiskLoc(*this).toString(); } + }; + + struct BtreeBucketV1 { + /** Parent bucket of this bucket, which isNull() for the root bucket. */ + DiskLoc56Bit parent; + + /** Given that there are n keys, this is the n index child. */ + DiskLoc56Bit nextChild; + + unsigned short flags; + + /** Size of the empty region. */ + unsigned short emptySize; + + /** Size used for bson storage, including storage of old keys. */ + unsigned short topSize; + + /* Number of keys in the bucket. */ + unsigned short n; + + /* Beginning of the bucket's body */ + char data[4]; + + // Precalculated size constants + enum { HeaderSize = 22 }; + }; + + // BtreeBucketV1 is part of the on-disk format, so it should never be changed + BOOST_STATIC_ASSERT( + sizeof(BtreeBucketV1) - sizeof(reinterpret_cast<BtreeBucketV1*>(NULL)->data) + == BtreeBucketV1::HeaderSize); + + enum Flags { + Packed = 1 + }; + + struct BtreeLayoutV0 { + typedef FixedWidthKey<DiskLoc> FixedWidthKeyType; + typedef DiskLoc LocType; + typedef KeyBson KeyType; + typedef KeyBson KeyOwnedType; + typedef BtreeBucketV0 BucketType; + + enum { BucketSize = 8192, + BucketBodySize = BucketSize - BucketType::HeaderSize + }; + + // largest key size we allow. note we very much need to support bigger keys (somehow) in + // the future. 
+ + static const int KeyMax = OldBucketSize / 10; + + // A sentinel value sometimes used to identify a deallocated bucket. + static const int INVALID_N_SENTINEL = -1; + + static void initBucket(BucketType* bucket) { + bucket->_reserved1 = 0; + bucket->_wasSize = BucketSize; + bucket->reserved = 0; + } + }; + + struct BtreeLayoutV1 { + typedef FixedWidthKey<DiskLoc56Bit> FixedWidthKeyType; + typedef KeyV1 KeyType; + typedef KeyV1Owned KeyOwnedType; + typedef DiskLoc56Bit LocType; + typedef BtreeBucketV1 BucketType; + + enum { BucketSize = 8192 - 16, // The -16 is to leave room for the Record header + BucketBodySize = BucketSize - BucketType::HeaderSize + }; + + static const int KeyMax = 1024; + + // A sentinel value sometimes used to identify a deallocated bucket. + static const unsigned short INVALID_N_SENTINEL = 0xffff; + + static void initBucket(BucketType* bucket) { } + }; + +#pragma pack() + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp new file mode 100644 index 00000000000..99385d46e86 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp @@ -0,0 +1,247 @@ +// btree_test_help.cpp : Helper functions for Btree unit-testing +// + +/** + * Copyright (C) 2014 MongoDB + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" + +#include "mongo/db/operation_context_noop.h" +#include "mongo/unittest/unittest.h" + + +namespace mongo { + + string bigNumString(long long n, int len) { + char sub[17]; + sprintf(sub, "%.16llx", n); + string val(len, ' '); + for (int i = 0; i < len; ++i) { + val[i] = sub[i % 16]; + } + return val; + } + + BSONObj simpleKey(char c, int n) { + BSONObjBuilder builder; + string val(n, c); + builder.append("a", val); + return builder.obj(); + } + + // + // BtreeLogicTestHelper + // + + static BucketDeletionNotification dummyBucketDeletionNotification; + + template <class OnDiskFormat> + BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order) + : recordStore("TestRecordStore"), + btree(&headManager, + &recordStore, + Ordering::make(order), + "TestIndex", + &dummyBucketDeletionNotification) { + + static const string randomData("RandomStuff"); + + // Generate a valid record location for a "fake" record, which we will repeatedly use + // thoughout the tests. 
+ OperationContextNoop txn; + StatusWith<DiskLoc> s = + recordStore.insertRecord(&txn, randomData.c_str(), randomData.length(), false); + + ASSERT_TRUE(s.isOK()); + ASSERT_EQUALS(1, recordStore.numRecords()); + + dummyDiskLoc = s.getValue(); + } + + + // + // ArtificialTreeBuilder + // + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::makeTree(const string &spec) { + _helper->headManager.setHead(_txn, makeTree(fromjson(spec))); + } + + template <class OnDiskFormat> + DiskLoc ArtificialTreeBuilder<OnDiskFormat>::makeTree(const BSONObj &spec) { + DiskLoc bucketLoc = _helper->btree._addBucket(_txn); + BucketType* bucket = _helper->btree.getBucket(bucketLoc); + + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + DiskLoc child; + if (e.type() == Object) { + child = makeTree(e.embeddedObject()); + } + + if (e.fieldName() == string("_")) { + bucket->nextChild = child; + } + else { + KeyDataOwnedType key(BSON("" << expectedKey(e.fieldName()))); + _helper->btree._pushBack(bucket, _helper->dummyDiskLoc, key, child); + } + } + + _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc); + return bucketLoc; + } + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const string &spec) const { + checkStructure(fromjson(spec), _helper->headManager.getHead()); + } + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::push( + const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child) { + KeyDataOwnedType k(key); + BucketType* bucket = _helper->btree.getBucket(bucketLoc); + + _helper->btree._pushBack(bucket, _helper->dummyDiskLoc, k, child); + _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc); + } + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::checkStructure( + const BSONObj &spec, const DiskLoc node) const { + BucketType* bucket = _helper->btree.getBucket(node); + + BSONObjIterator j(spec); + for (int i = 0; i < bucket->n; ++i) { 
+ ASSERT(j.more()); + BSONElement e = j.next(); + KeyHeaderType kn = BtreeLogic<OnDiskFormat>::getKeyHeader(bucket, i); + string expected = expectedKey(e.fieldName()); + ASSERT(isPresent(BSON("" << expected), 1)); + ASSERT(isPresent(BSON("" << expected), -1)); + + // ASSERT_EQUALS(expected, kn.key.toBson().firstElement().valuestr()); + if (kn.prevChildBucket.isNull()) { + ASSERT(e.type() == jstNULL); + } + else { + ASSERT(e.type() == Object); + checkStructure(e.embeddedObject(), kn.prevChildBucket); + } + } + if (bucket->nextChild.isNull()) { + // maybe should allow '_' field with null value? + ASSERT(!j.more()); + } + else { + BSONElement e = j.next(); + ASSERT_EQUALS(string("_"), e.fieldName()); + ASSERT(e.type() == Object); + checkStructure(e.embeddedObject(), bucket->nextChild); + } + ASSERT(!j.more()); + } + + template <class OnDiskFormat> + bool ArtificialTreeBuilder<OnDiskFormat>::isPresent(const BSONObj &key, int direction) const { + int pos; + DiskLoc loc; + OperationContextNoop txn; + return _helper->btree.locate(&txn, key, _helper->dummyDiskLoc, direction, &pos, &loc); + } + + // Static + template <class OnDiskFormat> + string ArtificialTreeBuilder<OnDiskFormat>::expectedKey(const char *spec) { + if (spec[0] != '$') { + return spec; + } + char *endPtr; + + // parsing a long long is a pain, so just allow shorter keys for now + unsigned long long num = strtol(spec + 1, &endPtr, 16); + int len = 800; + if (*endPtr == '$') { + len = strtol(endPtr + 1, 0, 16); + } + + return bigNumString(num, len); + } + + template <class OnDiskFormat> + int ArtificialTreeBuilder<OnDiskFormat>::fillBucketToExactSize( + const DiskLoc bucketLoc, int targetSize, char startKey) { + ASSERT_FALSE(bucketLoc.isNull()); + + BucketType* bucket = _helper->btree.getBucket(bucketLoc); + ASSERT_EQUALS(0, bucket->n); + + static const int bigSize = KeyDataOwnedType(simpleKey('a', 801)).dataSize(); + + int size = 0; + int keyCount = 0; + while (size < targetSize) { + int space = targetSize - 
size; + int nextSize = space - sizeof(FixedWidthKeyType); + verify(nextSize > 0); + + BSONObj newKey; + if (nextSize >= bigSize) { + newKey = simpleKey(startKey++, 801); + } + else { + newKey = simpleKey(startKey++, nextSize - (bigSize - 801)); + } + + push(bucketLoc, newKey, DiskLoc()); + + size += KeyDataOwnedType(newKey).dataSize() + + sizeof(FixedWidthKeyType); + keyCount += 1; + } + + ASSERT_EQUALS(_helper->btree._packedDataSize(bucket, 0), targetSize); + + return keyCount; + } + + // + // This causes actual code to be generated for the usages of the templates in this file. + // + + // V0 format. + template struct BtreeLogicTestHelper<BtreeLayoutV0>; + template class ArtificialTreeBuilder<BtreeLayoutV0>; + + // V1 format. + template struct BtreeLogicTestHelper<BtreeLayoutV1>; + template class ArtificialTreeBuilder<BtreeLayoutV1>; +} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h new file mode 100644 index 00000000000..52d468f053a --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h @@ -0,0 +1,154 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the GNU Affero General Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#pragma once

#include <string>

#include "mongo/db/json.h"
#include "mongo/db/storage/heap1/record_store_heap.h" // XXX why is this here?
#include "mongo/db/storage/mmap_v1//btree/btree_logic.h"
#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h"


namespace mongo {

    /**
     * Generates a string of the specified length containing repeated concatenation of the
     * hexadecimal representation of the input value.
     */
    std::string bigNumString(long long n, int len);

    /**
     * Generates key on a field 'a', with the specified number of repetitions of the character.
     */
    BSONObj simpleKey(char c, int n = 1);

    /**
     * Simple head manager, which performs no validity checking or persistence.
     */
    class TestHeadManager : public HeadManager {
    public:
        virtual const DiskLoc getHead() const {
            return _head;
        }

        // Updates the in-memory head only; nothing is journaled or persisted.
        virtual void setHead(OperationContext* txn, const DiskLoc newHead) {
            _head = newHead;
        }

    private:
        DiskLoc _head;
    };


    /**
     * This structure encapsulates a Btree and all the infrastructure needed by it (head manager,
     * record store and a valid disk location to use by the tests).
     */
    template <class OnDiskFormat>
    struct BtreeLogicTestHelper {
        BtreeLogicTestHelper(const BSONObj& order);

        // Everything needed for a fully-functional Btree logic
        TestHeadManager headManager;
        HeapRecordStore recordStore;
        BtreeLogic<OnDiskFormat> btree;
        DiskLoc dummyDiskLoc;
    };


    /**
     * Tool to construct custom tree shapes for tests.
     */
    template <class OnDiskFormat>
    class ArtificialTreeBuilder {
    public:

        typedef typename BtreeLogic<OnDiskFormat>::BucketType BucketType;
        typedef typename BtreeLogic<OnDiskFormat>::KeyDataOwnedType KeyDataOwnedType;
        typedef typename BtreeLogic<OnDiskFormat>::KeyHeaderType KeyHeaderType;

        typedef typename OnDiskFormat::FixedWidthKeyType FixedWidthKeyType;

        /**
         * The tree builder wraps around the passed-in helper and will invoke methods on it. It
         * does not do any cleanup, so constructing multiple trees over the same helper will
         * cause leaked records.
         */
        ArtificialTreeBuilder(OperationContext* txn,
                              BtreeLogicTestHelper<OnDiskFormat>* helper)
            : _txn(txn), _helper(helper) {

        }

        /**
         * Causes the specified tree shape to be built on the associated helper and the tree's
         * root installed as the head. Uses a custom JSON-based language with the following
         * syntax:
         *
         *  Btree := BTreeBucket
         *  BtreeBucket := { Child_1_Key: <BtreeBucket | null>,
         *                   Child_2_Key: <BtreeBucket | null>,
         *                   ...,
         *                   _: <BtreeBucket | null> }
         *
         * The _ key name specifies the content of the nextChild pointer. The value null means
         * use a fixed disk loc.
         */
        void makeTree(const std::string& spec);

        /**
         * Validates that the structure of the Btree in the helper matches the specification.
         */
        void checkStructure(const std::string& spec) const;

        /**
         * Adds the following key to the bucket and fixes up the child pointers.
         */
        void push(const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child);

        /**
         * @return The number of keys inserted.
         */
        int fillBucketToExactSize(const DiskLoc bucketLoc, int targetSize, char startKey);

    private:
        // Recursive helpers operating on one bucket of the parsed spec at a time.
        DiskLoc makeTree(const BSONObj& spec);

        void checkStructure(const BSONObj& spec, const DiskLoc node) const;

        bool isPresent(const BSONObj& key, int direction) const;

        static string expectedKey(const char* spec);

        OperationContext* _txn;
        BtreeLogicTestHelper<OnDiskFormat>* _helper;
    };

} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h b/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h
new file mode 100644
index 00000000000..5d6fa99434f
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h
@@ -0,0 +1,54 @@
/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so.
If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/db/diskloc.h" + +namespace mongo { + + /** + * Notifies interested parties before a bucket is about to be deleted. Currently used by + * the cursor manager, so the appropriate cursors can be invalidated. + * + * The default implementation is a no-op. + */ + class BucketDeletionNotification { + public: + + /** + * If the same object is passed in to different BtreeLogic implementations, this + * notification may be invoked on multiple threads, so it is up to the implementor + * to ensure thread-safety. + */ + virtual void aboutToDeleteBucket(const DiskLoc& bucket) { } + + virtual ~BucketDeletionNotification() { } + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/btree/key.cpp b/src/mongo/db/storage/mmap_v1/btree/key.cpp new file mode 100644 index 00000000000..a6ccd61d2cf --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/key.cpp @@ -0,0 +1,691 @@ +/** + * Copyright (C) 2011 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the GNU Affero General Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#include "mongo/db/storage/mmap_v1/btree/key.h"

#include "mongo/bson/util/builder.h"
#include "mongo/platform/float_utils.h"
#include "mongo/util/startup_test.h"


namespace mongo {

    extern const Ordering nullOrdering = Ordering::make(BSONObj());

    // KeyBson is for V0 (version #0) indexes

    int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);

    // "old" = pre signed dates & such; i.e. btree V0
    /* must be same canon type when called */
    int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
        dassert( l.canonicalType() == r.canonicalType() );
        int f;
        double x;

        switch ( l.type() ) {
        case EOO:
        case Undefined: // EOO and Undefined are same canonicalType
        case jstNULL:
        case MaxKey:
        case MinKey:
            return 0;
        case Bool:
            return *l.value() - *r.value();
        case Timestamp:
        case Date:
            // unsigned dates for old version
            if ( l.date() < r.date() )
                return -1;
            return l.date() == r.date() ? 0 : 1;
        case NumberLong:
            if( r.type() == NumberLong ) {
                long long L = l._numberLong();
                long long R = r._numberLong();
                if( L < R ) return -1;
                if( L == R ) return 0;
                return 1;
            }
            // else fall through
            // (deliberate: mixed long/int/double comparisons go through doubles below)
        case NumberInt:
        case NumberDouble: {
            double left = l.number();
            double right = r.number();
            // NaN detection without isnan(): NaN fails both range comparisons.
            bool lNan = !( left <= numeric_limits< double >::max() &&
                           left >= -numeric_limits< double >::max() );
            bool rNan = !( right <= numeric_limits< double >::max() &&
                           right >= -numeric_limits< double >::max() );
            if ( lNan ) {
                if ( rNan ) {
                    return 0;
                }
                else {
                    return -1;
                }
            }
            else if ( rNan ) {
                return 1;
            }
            x = left - right;
            if ( x < 0 ) return -1;
            return x == 0 ? 0 : 1;
        }
        case jstOID:
            return memcmp(l.value(), r.value(), 12);
        case Code:
        case Symbol:
        case String:
            // nulls not allowed in the middle of strings in the old version
            return strcmp(l.valuestr(), r.valuestr());
        case Object:
        case Array:
            return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
        case DBRef: {
            int lsz = l.valuesize();
            int rsz = r.valuesize();
            if ( lsz - rsz != 0 ) return lsz - rsz;
            return memcmp(l.value(), r.value(), lsz);
        }
        case BinData: {
            int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
            int rsz = r.objsize();
            if ( lsz - rsz != 0 ) return lsz - rsz;
            return memcmp(l.value()+4, r.value()+4, lsz+1);
        }
        case RegEx: {
            int c = strcmp(l.regex(), r.regex());
            if ( c )
                return c;
            return strcmp(l.regexFlags(), r.regexFlags());
        }
        case CodeWScope : {
            f = l.canonicalType() - r.canonicalType();
            if ( f )
                return f;
            f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
            if ( f )
                return f;
            f = strcmp( l.codeWScopeScopeDataUnsafe() , r.codeWScopeScopeDataUnsafe() );
            if ( f )
                return f;
            return 0;
        }
        default:
            log() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
            verify(false);
        }
        return -1;
    }

    // Compares two elements V0-style: canonical type first, then value.
    int oldElemCompare(const BSONElement&l , const BSONElement& r) {
        int lt = (int) l.canonicalType();
        int rt = (int) r.canonicalType();
        int x = lt - rt;
        if( x )
            return x;
        return oldCompareElementValues(l, r);
    }

    // pre signed dates & such
    int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
        BSONObjIterator i(l);
        BSONObjIterator j(r);
        unsigned mask = 1;  // tracks which field of the Ordering applies (shifted per field)
        while ( 1 ) {
            // so far, equal...

            BSONElement l = i.next();
            BSONElement r = j.next();
            if ( l.eoo() )
                return r.eoo() ? 0 : -1;
            if ( r.eoo() )
                return 1;

            int x;
            {
                x = oldElemCompare(l, r);
                if( o.descending(mask) )
                    x = -x;
            }
            if ( x != 0 )
                return x;
            mask <<= 1;
        }
        return -1;
    }

    /* old style compares:
       - dates are unsigned
       - strings no nulls
    */
    int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
        return oldCompare(_o, r._o, o);
    }

    // woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort
    bool KeyBson::woEqual(const KeyBson& r) const {
        return oldCompare(_o, r._o, nullOrdering) == 0;
    }

    // Layout of a KeyV1 type byte: [ ][HASMORE][x][y][canontype_4bits]
    enum CanonicalsEtc {
        cminkey=1,
        cnull=2,
        cdouble=4,
        cstring=6,
        cbindata=7,
        coid=8,
        cfalse=10,
        ctrue=11,
        cdate=12,
        cmaxkey=14,
        cCANONTYPEMASK = 0xf,
        cY = 0x10,
        cint = cY | cdouble,
        cX = 0x20,
        clong = cX | cdouble,
        cHASMORE = 0x40,
        cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
    };

    // bindata bson type
    const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
    const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
    const int BinDataLenMax = 32;
    // Maps a bindata length (0..32) to its 4-bit length code (in the high nibble);
    // -1 marks lengths that cannot be encoded compactly.
    const int BinDataLengthToCode[] = {
        0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
        0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
        0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
        0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
        0xf0/*32*/
    };
    // Inverse of the table above, indexed by the high nibble of the code byte.
    const int BinDataCodeToLength[] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
    };

    int binDataCodeToLength(int codeByte) {
        return BinDataCodeToLength[codeByte >> 4];
    }

    /** object cannot be represented in compact format.  so store in traditional bson format
        with a leading sentinel byte IsBSON to indicate it's in that format.

        Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
        so that we don't have to do an extra malloc.
    */
    void KeyV1Owned::traditional(const BSONObj& obj) {
        b.reset();
        b.appendUChar(IsBSON);
        b.appendBuf(obj.objdata(), obj.objsize());
        _keyData = (const unsigned char *) b.buf();
    }

    // Copy constructor from an unowned key: memcpy's the raw key bytes.
    KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
        b.appendBuf( rhs.data(), rhs.dataSize() );
        _keyData = (const unsigned char *) b.buf();
        dassert( b.len() == dataSize() ); // check datasize method is correct
        dassert( (*_keyData & cNOTUSED) == 0 );
    }

    // fromBSON to Key format
    KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
        BSONObj::iterator i(obj);
        unsigned char bits = 0;
        while( 1 ) {
            BSONElement e = i.next();
            if( i.more() )
                bits |= cHASMORE;
            switch( e.type() ) {
            case MinKey:
                b.appendUChar(cminkey|bits);
                break;
            case jstNULL:
                b.appendUChar(cnull|bits);
                break;
            case MaxKey:
                b.appendUChar(cmaxkey|bits);
                break;
            case Bool:
                b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
                break;
            case jstOID:
                b.appendUChar(coid|bits);
                b.appendBuf(&e.__oid(), sizeof(OID));
                break;
            case BinData:
                {
                    int t = e.binDataType();
                    // 0-7 and 0x80 to 0x87 are supported by Key
                    if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
                        int len;
                        const char * d = e.binData(len);
                        if( len <= BinDataLenMax ) {
                            int code = BinDataLengthToCode[len];
                            if( code >= 0 ) {
                                // 0x80..0x87 subtypes are folded into bits 3 of the code byte
                                if( t >= 128 )
                                    t = (t-128) | 0x08;
                                dassert( (code&t) == 0 );
                                b.appendUChar( cbindata|bits );
                                b.appendUChar( code | t );
                                b.appendBuf(d, len);
                                break;
                            }
                        }
                    }
                    traditional(obj);
                    return;
                }
            case Date:
                b.appendUChar(cdate|bits);
                b.appendStruct(e.date());
                break;
            case String:
                {
                    b.appendUChar(cstring|bits);
                    // note we do not store the terminating null, to save space.
                    unsigned x = (unsigned) e.valuestrsize() - 1;
                    if( x > 255 ) {
                        // string too long for the 1-byte length prefix; fall back to bson
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(x);
                    b.appendBuf(e.valuestr(), x);
                    break;
                }
            case NumberInt:
                b.appendUChar(cint|bits);
                b.appendNum((double) e._numberInt());
                break;
            case NumberLong:
                {
                    long long n = e._numberLong();
                    long long m = 2LL << 52;  // 2**53: limit of exact double representation
                    DEV {
                        long long d = m-1;
                        verify( ((long long) ((double) -d)) == -d );
                    }
                    if( n >= m || n <= -m ) {
                        // can't represent exactly as a double
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(clong|bits);
                    b.appendNum((double) n);
                    break;
                }
            case NumberDouble:
                {
                    double d = e._numberDouble();
                    if( isNaN(d) ) {
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(cdouble|bits);
                    b.appendNum(d);
                    break;
                }
            default:
                // if other types involved, store as traditional BSON
                traditional(obj);
                return;
            }
            if( !i.more() )
                break;
            bits = 0;
        }
        _keyData = (const unsigned char *) b.buf();
        dassert( b.len() == dataSize() ); // check datasize method is correct
        dassert( (*_keyData & cNOTUSED) == 0 );
    }

    // Decodes the compact key format back into a (fieldname-less) BSONObj.
    BSONObj KeyV1::toBson() const {
        verify( _keyData != 0 );
        if( !isCompactFormat() )
            return bson();

        BSONObjBuilder b(512);
        const unsigned char *p = _keyData;
        while( 1 ) {
            unsigned bits = *p++;

            switch( bits & 0x3f ) {
            case cminkey: b.appendMinKey(""); break;
            case cnull: b.appendNull(""); break;
            case cfalse: b.appendBool("", false); break;
            case ctrue: b.appendBool("", true); break;
            case cmaxkey:
                b.appendMaxKey("");
                break;
            case cstring:
                {
                    unsigned sz = *p++;
                    // we build the element ourself as we have to null terminate it
                    BufBuilder &bb = b.bb();
                    bb.appendNum((char) String);
                    bb.appendUChar(0); // fieldname ""
                    bb.appendNum(sz+1);
                    bb.appendBuf(p, sz);
                    bb.appendUChar(0); // null char at end of string
                    p += sz;
                    break;
                }
            case coid:
                b.appendOID("", (OID *) p);
                p += sizeof(OID);
                break;
            case cbindata:
                {
                    int len = binDataCodeToLength(*p);
                    int subtype = (*p) & BinDataTypeMask;
                    if( subtype & 0x8 ) {
                        // undo the 0x80..0x87 subtype folding done at encode time
                        subtype = (subtype & 0x7) | 0x80;
                    }
                    b.appendBinData("", len, (BinDataType) subtype, ++p);
                    p += len;
                    break;
                }
            case cdate:
                b.appendDate("", (Date_t&) *p);
                p += 8;
                break;
            case cdouble:
                b.append("", (double&) *p);
                p += sizeof(double);
                break;
            case cint:
                b.append("", static_cast< int >((reinterpret_cast< const PackedDouble& >(*p)).d));
                p += sizeof(double);
                break;
            case clong:
                b.append("", static_cast< long long>((reinterpret_cast< const PackedDouble& >(*p)).d));
                p += sizeof(double);
                break;
            default:
                verify(false);
            }

            if( (bits & cHASMORE) == 0 )
                break;
        }
        return b.obj();
    }

    // Compares one element of each compact key; advances both pointers past the element.
    static int compare(const unsigned char *&l, const unsigned char *&r) {
        int lt = (*l & cCANONTYPEMASK);
        int rt = (*r & cCANONTYPEMASK);
        int x = lt - rt;
        if( x )
            return x;

        l++; r++;

        // same type
        switch( lt ) {
        case cdouble:
            {
                double L = (reinterpret_cast< const PackedDouble* >(l))->d;
                double R = (reinterpret_cast< const PackedDouble* >(r))->d;
                if( L < R )
                    return -1;
                if( L != R )
                    return 1;
                l += 8; r += 8;
                break;
            }
        case cstring:
            {
                int lsz = *l;
                int rsz = *r;
                int common = min(lsz, rsz);
                l++; r++; // skip the size byte
                // use memcmp as we (will) allow zeros in UTF8 strings
                int res = memcmp(l, r, common);
                if( res )
                    return res;
                // longer string is the greater one
                int diff = lsz-rsz;
                if( diff )
                    return diff;
                l += lsz; r += lsz;  // equal here, so lsz == rsz
                break;
            }
        case cbindata:
            {
                int L = *l;
                int R = *r;
                int llen = binDataCodeToLength(L);
                int diff = L-R; // checks length and subtype simultaneously
                if( diff ) {
                    // unfortunately nibbles are backwards to do subtype and len in one check (could bit swap...)
                    int rlen = binDataCodeToLength(R);
                    if( llen != rlen )
                        return llen - rlen;
                    return diff;
                }
                // same length, same type
                l++; r++;
                int res = memcmp(l, r, llen);
                if( res )
                    return res;
                l += llen; r += llen;
                break;
            }
        case cdate:
            {
                long long L = *((long long *) l);
                long long R = *((long long *) r);
                if( L < R )
                    return -1;
                if( L > R )
                    return 1;
                l += 8; r += 8;
                break;
            }
        case coid:
            {
                int res = memcmp(l, r, sizeof(OID));
                if( res )
                    return res;
                l += 12; r += 12;
                break;
            }
        default:
            // all the others are a match -- e.g. null == null
            ;
        }

        return 0;
    }

    // at least one of this and right are traditional BSON format
    int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
        BSONObj L = toBson();
        BSONObj R = right.toBson();
        return L.woCompare(R, order, /*considerfieldname*/false);
    }

    int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
        const unsigned char *l = _keyData;
        const unsigned char *r = right._keyData;

        // If either key is stored as traditional bson, take the slow path.
        if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
            return compareHybrid(right, order);

        unsigned mask = 1;  // tracks which field of the Ordering applies
        while( 1 ) {
            char lval = *l;
            char rval = *r;
            {
                int x = compare(l, r); // updates l and r pointers
                if( x ) {
                    if( order.descending(mask) )
                        x = -x;
                    return x;
                }
            }

            {
                // key with more fields sorts after its prefix
                int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
                if( x )
                    return x;
                if( (lval & cHASMORE) == 0 )
                    break;
            }

            mask <<= 1;
        }

        return 0;
    }

    // Encoded size (including the type byte) for each canonical type; 0 = variable length.
    static unsigned sizes[] = {
        0,
        1, //cminkey=1,
        1, //cnull=2,
        0,
        9, //cdouble=4,
        0,
        0, //cstring=6,
        0,
        13, //coid=8,
        0,
        1, //cfalse=10,
        1, //ctrue=11,
        9, //cdate=12,
        0,
        1, //cmaxkey=14,
        0
    };

    inline unsigned sizeOfElement(const unsigned char *p) {
        unsigned type = *p & cCANONTYPEMASK;
        unsigned sz = sizes[type];
        if( sz == 0 ) {
            // variable-length types carry their length in the following byte
            if( type == cstring ) {
                sz = ((unsigned) p[1]) + 2;
            }
            else {
                verify( type == cbindata );
                sz = binDataCodeToLength(p[1]) + 2;
            }
        }
        return sz;
    }

    int KeyV1::dataSize() const {
        const unsigned char *p = _keyData;
        if( !isCompactFormat() ) {
            // +1 for the leading IsBSON sentinel byte
            return bson().objsize() + 1;
        }

        bool more;
        do {
            unsigned z = sizeOfElement(p);
            more = (*p & cHASMORE) != 0;
            p += z;
        } while( more );
        return p - _keyData;
    }

    bool KeyV1::woEqual(const KeyV1& right) const {
        const unsigned char *l = _keyData;
        const unsigned char *r = right._keyData;

        if( (*l|*r) == IsBSON ) {
            return toBson().equal(right.toBson());
        }

        while( 1 ) {
            char lval = *l;
            char rval = *r;
            if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
                return false;
            l++; r++;
            switch( lval&cCANONTYPEMASK ) {
            case coid:
                if( *((unsigned*) l) != *((unsigned*) r) )
                    return false;
                l += 4; r += 4;
                // deliberate fall-through: the remaining 8 of the OID's 12 bytes
                // are compared by the cdate case below
            case cdate:
                if( *((unsigned long long *) l) != *((unsigned long long *) r) )
                    return false;
                l += 8; r += 8;
                break;
            case cdouble:
                if( (reinterpret_cast< const PackedDouble* > (l))->d != (reinterpret_cast< const PackedDouble* >(r))->d )
                    return false;
                l += 8; r += 8;
                break;
            case cstring:
                {
                    if( *l != *r )
                        return false; // not same length
                    unsigned sz = ((unsigned) *l) + 1;
                    if( memcmp(l, r, sz) )
                        return false;
                    l += sz; r += sz;
                    break;
                }
            case cbindata:
                {
                    if( *l != *r )
                        return false; // len or subtype mismatch
                    int len = binDataCodeToLength(*l) + 1;
                    if( memcmp(l, r, len) )
                        return false;
                    l += len; r += len;
                    break;
                }
            case cminkey:
            case cnull:
            case cfalse:
            case ctrue:
            case cmaxkey:
                break;
            default:
                verify(false);
            }
            if( (lval&cHASMORE) == 0 )
                break;
        }
        return true;
    }

    // Sanity check at startup: strcmp/memcmp must compare bytes as unsigned, which the
    // key comparison code relies on.
    struct CmpUnitTest : public StartupTest {
        void run() {
            char a[2];
            char b[2];
            a[0] = -3;
            a[1] = 0;
            b[0] = 3;
            b[1] = 0;
            verify( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
        }
    } cunittest;

} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/key.h b/src/mongo/db/storage/mmap_v1/btree/key.h
new file mode 100644
index 00000000000..83203b0fee2
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/key.h
@@ -0,0 +1,130 @@
// @file key.h class(es) representing individual keys in a btree

/**
* Copyright (C) 2011 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include "mongo/db/jsobj.h"

namespace mongo {

    /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.

        KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.

        KeyV1 is the new implementation.
    */
    class KeyBson /* "KeyV0" */ {
    public:
        KeyBson() { }
        explicit KeyBson(const char *keyData) : _o(keyData) { }
        explicit KeyBson(const BSONObj& obj) : _o(obj) { }
        int woCompare(const KeyBson& r, const Ordering &o) const;
        BSONObj toBson() const { return _o; }
        std::string toString() const { return _o.toString(); }
        int dataSize() const { return _o.objsize(); }
        const char * data() const { return _o.objdata(); }
        BSONElement _firstElement() const { return _o.firstElement(); }
        // V0 keys are always stored as plain BSON, never in the compact format.
        bool isCompactFormat() const { return false; }
        bool woEqual(const KeyBson& r) const;
        void assign(const KeyBson& rhs) { *this = rhs; }
        bool isValid() const { return true; }
    private:
        BSONObj _o;
    };

    class KeyV1Owned;

    // corresponding to BtreeData_V1
    class KeyV1 {
        void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
        KeyV1(const KeyV1Owned&);     // disallowed as this is not a great idea as KeyV1Owned likely will go out of scope
    public:
        KeyV1() { _keyData = 0; }
        // debug builds poison the pointer so use-after-destroy is caught by isValid()/dasserts
        ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }

        KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
            dassert( _keyData > (const unsigned char *) 1 );
        }

        // explicit version of operator= to be safe
        void assign(const KeyV1& rhs) {
            _keyData = rhs._keyData;
        }

        /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
            when BSON, we are just a wrapper
        */
        explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }

        int woCompare(const KeyV1& r, const Ordering &o) const;
        bool woEqual(const KeyV1& r) const;
        BSONObj toBson() const;
        std::string toString() const { return toBson().toString(); }

        /** get the key data we want to store in the btree bucket */
        const char * data() const { return (const char *) _keyData; }

        /** @return size of data() */
        int dataSize() const;

        /** only used by geo, which always has bson keys */
        BSONElement _firstElement() const { return bson().firstElement(); }
        // A leading IsBSON (0xff) byte marks a key stored as traditional bson.
        bool isCompactFormat() const { return *_keyData != IsBSON; }

        bool isValid() const { return _keyData > (const unsigned char*)1; }
    protected:
        enum { IsBSON = 0xff };
        const unsigned char *_keyData;
        BSONObj bson() const {
            dassert( !isCompactFormat() );
            return BSONObj((const char *) _keyData+1);
        }
    private:
        int compareHybrid(const KeyV1& right, const Ordering& order) const;
    };

    class KeyV1Owned : public KeyV1 {
        void operator=(const KeyV1Owned&);
    public:
        /** @obj a BSON object to be translated to KeyV1 format.  If the object isn't
            representable in KeyV1 format (which happens, intentionally, at times)
            it will stay as bson herein.
        */
        KeyV1Owned(const BSONObj& obj);

        /** makes a copy (memcpy's the whole thing) */
        KeyV1Owned(const KeyV1& rhs);

    private:
        StackBufBuilder b;
        void traditional(const BSONObj& obj); // store as traditional bson not as compact format
    };

};
diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
new file mode 100644
index 00000000000..07916dc873d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
@@ -0,0 +1,180 @@
/* hashtab.h

   Simple, fixed size hash table.  Darn simple.

   Uses a contiguous block of memory, so you can put it in a memory mapped file very easily.
*/

/*    Copyright 2009 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the GNU Affero General Public License in all respects
 *    for all of the code used other than as permitted herein. If you modify
 *    file(s) with this exception, you may extend this exception to your
 *    version of the file(s), but you are not obligated to do so. If you do not
 *    wish to do so, delete this exception statement from your version. If you
 *    delete this exception statement from all source files in the program,
 *    then also delete it in the license file.
 */

#pragma once

#include "mongo/pch.h"
#include <map>
#include "mongo/db/storage/mmap_v1/dur.h"
#include "mongo/db/operation_context.h"
#include "mongo/stdx/functional.h"

namespace mongo {

#pragma pack(1)

    /* you should define:

       int Key::hash() return > 0 always.
    */

    template <class Key,class Type>
    class HashTable : boost::noncopyable {
    public:
        const char *name;
        // One open-addressed bucket. hash == 0 means the slot is unused.
        struct Node {
            int hash;
            Key k;
            Type value;
            bool inUse() {
                return hash != 0;
            }
            void setUnused() {
                hash = 0;
            }
        };
        void* _buf;
        int n; // number of hashtable buckets
        int maxChain;

        // Returns a reference to bucket i within the raw memory block.
        Node& nodes(int i) {
            Node *nodes = (Node *) _buf;
            return nodes[i];
        }

        // Linear-probe lookup. Returns the bucket index holding k (found=true), the
        // first free slot to insert into (found=false), or -1 when the table is full
        // or the probe chain exceeds maxChain with no free slot.
        int _find(const Key& k, bool& found) {
            found = false;
            int h = k.hash();
            int i = h % n;
            int start = i;
            int chain = 0;
            int firstNonUsed = -1;
            while ( 1 ) {
                if ( !nodes(i).inUse() ) {
                    if ( firstNonUsed < 0 )
                        firstNonUsed = i;
                }

                if ( nodes(i).hash == h && nodes(i).k == k ) {
                    if ( chain >= 200 )
                        log() << "warning: hashtable " << name << " long chain " << std::endl;
                    found = true;
                    return i;
                }
                chain++;
                i = (i+1) % n;
                if ( i == start ) {
                    // shouldn't get here / defensive for infinite loops
                    log() << "error: hashtable " << name << " is full n:" << n << std::endl;
                    return -1;
                }
                if( chain >= maxChain ) {
                    if ( firstNonUsed >= 0 )
                        return firstNonUsed;
                    log() << "error: hashtable " << name << " max chain reached:" << maxChain << std::endl;
                    return -1;
                }
            }
        }

    public:
        /* buf must be all zeroes on initialization. */
        HashTable(void* buf, int buflen, const char *_name) : name(_name) {
            int m = sizeof(Node);
            // log() << "hashtab init, buflen:" << buflen << " m:" << m << std::endl;
            n = buflen / m;
            // keep the bucket count odd so probing cycles through all slots
            if ( (n & 1) == 0 )
                n--;
            maxChain = (int) (n * 0.05);
            _buf = buf;
            //nodes = (Node *) buf;

            // Guards the on-disk node layout: a size change would corrupt existing files.
            if ( sizeof(Node) != 628 ) {
                log() << "HashTable() " << _name << " sizeof(node):" << sizeof(Node) << " n:" << n << " sizeof(Key): " << sizeof(Key) << " sizeof(Type):" << sizeof(Type) << std::endl;
                verify( sizeof(Node) == 628 );
            }

        }

        // Returns a pointer to the stored value for k, or 0 if absent.
        Type* get(const Key& k) {
            bool found;
            int i = _find(k, found);
            if ( found )
                return &nodes(i).value;
            return 0;
        }

        // Removes k's entry (if present), journaling the modified node.
        void kill(OperationContext* txn, const Key& k) {
            bool found;
            int i = _find(k, found);
            if ( i >= 0 && found ) {
                Node* n = &nodes(i);
                n = txn->recoveryUnit()->writing(n);
                n->k.kill();
                n->setUnused();
            }
        }

        /** returns false if too full */
        bool put(OperationContext* txn, const Key& k, const Type& value) {
            bool found;
            int i = _find(k, found);
            if ( i < 0 )
                return false;
            Node* n = txn->recoveryUnit()->writing( &nodes(i) );
            if ( !found ) {
                n->k = k;
                n->hash = k.hash();
            }
            else {
                verify( n->hash == k.hash() );
            }
            n->value = value;
            return true;
        }

        typedef stdx::function< void ( const Key& k , Type& v ) > IteratorCallback;
        // Invokes callback for every in-use bucket (unspecified order).
        void iterAll( IteratorCallback callback ) {
            for ( int i=0; i<n; i++ ) {
                if ( nodes(i).inUse() ) {
                    callback( nodes(i).k , nodes(i).value );
                }
            }
        }

    };

#pragma pack()

} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
new file mode 100644
index 00000000000..bc9cc3ee791
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
@@ -0,0 +1,40 @@
// index_details.cpp

/**
*    Copyright (C) 2008 10gen Inc.
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#include "mongo/db/storage/mmap_v1/catalog/index_details.h" + +namespace mongo { + + void IndexDetails::_reset() { + head.setInvalid(); + info.setInvalid(); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.h b/src/mongo/db/storage/mmap_v1/catalog/index_details.h new file mode 100644 index 00000000000..b2f34ec0681 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.h @@ -0,0 +1,69 @@ +// index_details.h + +/** +* Copyright (C) 2008 10gen Inc. 
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include "mongo/db/diskloc.h"

namespace mongo {

    /* Details about a particular index. There is one of these effectively for each object in
       system.namespaces (although this also includes the head pointer, which is not in that
       collection).

       This is an internal part of the catalog. Nothing outside of the catalog should use this.

       ** MemoryMapped Record ** (i.e., this is on disk data -- do not change the
       field layout or size)
     */
    struct IndexDetails {
        /**
         * btree head disk location
         */
        DiskLoc head;

        /* Location of index info object. Format:

             { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
               [, unique: <bool>, background: <bool>, v:<version>]
             }

           This object is in the system.indexes collection. Note that since we
           have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
        */
        DiskLoc info;

        /**
         * makes head and info invalid
         */
        void _reset();

    };

} // namespace mongo

// namespace-inl.h

/**
* Copyright (C) 2009 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version.
If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +namespace mongo { + + inline Namespace& Namespace::operator=(const StringData& ns) { + // we fill the remaining space with all zeroes here. as the full Namespace struct is in + // the datafiles (the .ns files specifically), that is helpful as then they are deterministic + // in the bytes they have for a given sequence of operations. that makes testing and debugging + // the data files easier. + // + // if profiling indicates this method is a significant bottleneck, we could have a version we + // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes. + // + memset( buf, 0, sizeof(buf) ); + uassert( 10080 , "ns name too long, max size is 127 bytes", ns.size() <= MaxNsLen); + uassert( 17380 , "ns name can't contain embedded '\0' byte", ns.find('\0') == std::string::npos); + ns.copyTo( buf, true ); + return *this; + } + + inline std::string Namespace::extraName(int i) const { + char ex[] = "$extra"; + ex[5] += i; + std::string s = std::string(buf) + ex; + massert( 10348 , "$extra: ns name too long", s.size() <= MaxNsLen); + return s; + } + + inline bool Namespace::isExtra() const { + const char *p = strstr(buf, "$extr"); + return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example + } + + inline int Namespace::hash() const { + unsigned x = 0; + const char *p = buf; + while ( *p ) { + x = x * 131 + *p; + p++; + } + return (x & 0x7fffffff) | 0x8000000; // must be > 0 + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp new file mode 100644 index 00000000000..822ed26dedb --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp @@ -0,0 +1,49 @@ +// namespace.cpp + +/** +* Copyright (C) 2008 10gen Inc. 
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/pch.h"

#include "mongo/db/storage/mmap_v1/catalog/namespace.h"

#include <boost/static_assert.hpp>

#include "mongo/db/namespace_string.h"

namespace mongo {
    namespace {
        // Compile-time checks that the on-disk Namespace struct keeps its
        // fixed 128-byte layout and that its length limits stay in lock-step
        // with the limits NamespaceString enforces at the logical layer.
        BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
        BOOST_STATIC_ASSERT( Namespace::MaxNsLenWithNUL == MaxDatabaseNameLen );
        BOOST_STATIC_ASSERT((int)Namespace::MaxNsLenWithNUL == (int)NamespaceString::MaxNsLenWithNUL);
        BOOST_STATIC_ASSERT((int)Namespace::MaxNsLen == (int)NamespaceString::MaxNsLen);
        // Note the typo. ("Colletion" is the historical spelling of the
        // Namespace-side constant; it is kept for source compatibility.)
        BOOST_STATIC_ASSERT((int)Namespace::MaxNsColletionLen == (int)NamespaceString::MaxNsCollectionLen);
    }
}

// namespace.h

/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include <cstring>
#include <string>

#include "mongo/base/string_data.h"

namespace mongo {

#pragma pack(1)
    /**
     * This is used for storing a namespace on disk in a fixed width form
     * it should only be used for that, not for passing internally
     * for that, please use NamespaceString
     */
    class Namespace {
    public:
        Namespace(const StringData& ns) { *this = ns; }
        Namespace& operator=(const StringData& ns);

        // Overwrites the first byte with 0x7f -- presumably a sentinel that no
        // live ns name starts with; NOTE(review): confirm against the readers.
        void kill() { buf[0] = 0x7f; }

        bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
        bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
        bool operator!=(const char *r) const { return strcmp(buf, r) != 0; }
        bool operator!=(const Namespace& r) const { return strcmp(buf, r.buf) != 0; }

        bool hasDollarSign() const { return strchr( buf , '$' ) != NULL; }

        int hash() const; // value returned is always > 0

        size_t size() const { return strlen( buf ); }

        std::string toString() const { return buf; }
        operator std::string() const { return buf; }

        /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes
           (more than 10 IndexDetails).  It's a bit hacky because of this late addition with backward
           file support. */
        std::string extraName(int i) const;
        bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */

        enum MaxNsLenValue {
            // Maximum possible length of name any namespace, including special ones like $extra.
            // This includes room for the NUL byte so it can be used when sizing buffers.
            MaxNsLenWithNUL = 128,

            // MaxNsLenWithNUL excluding the NUL byte. Use this when comparing std::string lengths.
            MaxNsLen = MaxNsLenWithNUL - 1,

            // Maximum allowed length of fully qualified namespace name of any real collection.
            // Does not include NUL so it can be directly compared to std::string lengths.
            // (The identifier keeps its historical misspelling; namespace.cpp
            // static-asserts it against NamespaceString::MaxNsCollectionLen.)
            MaxNsColletionLen = MaxNsLen - 7/*strlen(".$extra")*/,
        };
    private:
        // NUL-terminated name; remaining bytes are zeroed by operator= so the
        // on-disk representation is deterministic.
        char buf[MaxNsLenWithNUL];
    };
#pragma pack()

} // namespace mongo

#include "mongo/db/storage/mmap_v1/catalog/namespace-inl.h"

/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/pch.h"

#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"

#include <algorithm>
#include <list>

#include "mongo/base/counter.h"
#include "mongo/db/catalog/collection.h"
#include "mongo/db/catalog/collection_options.h"
#include "mongo/db/clientcursor.h"
#include "mongo/db/commands/server_status.h"
#include "mongo/db/db.h"
#include "mongo/db/index_legacy.h"
#include "mongo/db/json.h"
#include "mongo/db/ops/delete.h"
#include "mongo/db/ops/update.h"
#include "mongo/db/storage/mmap_v1/catalog/hashtab.h"
#include "mongo/db/operation_context.h"
#include "mongo/scripting/engine.h"
#include "mongo/util/startup_test.h"


namespace mongo {


    BSONObj idKeyPattern = fromjson("{\"_id\":1}");

    /**
     * Initializes an on-disk NamespaceDetails record in place.  This struct
     * lives in a memory-mapped .ns file, so every field must be set here
     * explicitly -- it does not default to zeroes the way we use it.
     *
     * @param loc    initial extent location; used for firstExtent, lastExtent
     *               and capExtent.
     * @param capped whether this namespace is a capped collection.
     */
    NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool capped ) {
        BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );

        /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
        firstExtent = lastExtent = capExtent = loc;
        stats.datasize = stats.nrecords = 0;
        lastExtentSize = 0;
        nIndexes = 0;
        isCapped = capped;
        maxDocsInCapped = 0x7fffffff; // no limit (value is for pre-v2.3.2 compatibility)
        paddingFactor = 1.0;
        systemFlagsOldDoNotUse = 0;
        userFlags = 0;
        capFirstNewRecord = DiskLoc();
        // Signal that we are on first allocation iteration through extents.
        capFirstNewRecord.setInvalid();
        // For capped case, signal that we are doing initial extent allocation.
        if ( capped ) {
            // WAS: cappedLastDelRecLastExtent().setInvalid();
            deletedList[1].setInvalid();
        }
        verify( sizeof(_dataFileVersion) == 2 );
        _dataFileVersion = 0;
        _indexFileVersion = 0;
        multiKeyIndexBits = 0;
        _reservedA = 0;
        _extraOffset = 0;
        indexBuildsInProgress = 0;
        memset(_reserved, 0, sizeof(_reserved));
    }

    /**
     * Allocates the i-th $extra block (i derived from nindexessofar) for this
     * namespace, registers it in the NamespaceIndex under the "<ns>$extra"
     * name, and links it into the Extra chain via journaled writes.
     * Returns the new Extra block.
     */
    NamespaceDetails::Extra* NamespaceDetails::allocExtra( OperationContext* txn,
                                                           const StringData& ns,
                                                           NamespaceIndex& ni,
                                                           int nindexessofar) {
        txn->lockState()->assertWriteLocked(ns);

        // Which extra block this is: 0 for indexes 10..39, 1 for 40..69.
        int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
        verify( i >= 0 && i <= 1 );

        Namespace fullns( ns );
        Namespace extrans( fullns.extraName(i) ); // throws UserException if ns name too long

        massert( 10350, "allocExtra: base ns missing?", this );
        massert( 10351, "allocExtra: extra already exists", ni.details(extrans) == 0 );

        Extra temp;
        temp.init();

        // The Extra is stored inside the same hashtable value slot as a
        // NamespaceDetails, hence the reinterpret_casts (the BOOST_STATIC_ASSERT
        // in the constructor guarantees it fits).
        ni.add_ns( txn, extrans, reinterpret_cast<NamespaceDetails*>( &temp ) );
        Extra* e = reinterpret_cast<NamespaceDetails::Extra*>( ni.details( extrans ) );

        // Link the new block: either as the head (_extraOffset) or at the tail
        // of the existing chain.  Offsets are relative to 'this'.
        long ofs = e->ofsFrom(this);
        if( i == 0 ) {
            verify( _extraOffset == 0 );
            *txn->recoveryUnit()->writing(&_extraOffset) = ofs;
            verify( extra() == e );
        }
        else {
            Extra *hd = extra();
            verify( hd->next(this) == 0 );
            hd->setNext(txn, ofs);
        }
        return e;
    }

    /**
     * Returns the IndexDetails slot for index number idxNo.  Slots 0..9 live
     * inline in _indexes; higher slots live in the chained Extra blocks.
     * When missingExpected is true a missing Extra block raises
     * MsgAssertionException; otherwise it trips a massert.
     */
    IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) {
        if( idxNo < NIndexesBase ) {
            IndexDetails& id = _indexes[idxNo];
            return id;
        }
        Extra *e = extra();
        if ( ! e ) {
            if ( missingExpected )
                throw MsgAssertionException( 13283 , "Missing Extra" );
            massert(14045, "missing Extra", e);
        }
        int i = idxNo - NIndexesBase;
        if( i >= NIndexesExtra ) {
            // Index lives in the second Extra block.
            e = e->next(this);
            if ( ! e ) {
                if ( missingExpected )
                    throw MsgAssertionException( 14823 , "missing extra" );
                massert(14824, "missing Extra", e);
            }
            i -= NIndexesExtra;
        }
        return e->details[i];
    }


    // Const overload of the above; identical slot-resolution logic.
    const IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) const {
        if( idxNo < NIndexesBase ) {
            const IndexDetails& id = _indexes[idxNo];
            return id;
        }
        const Extra *e = extra();
        if ( ! e ) {
            if ( missingExpected )
                throw MsgAssertionException( 17421 , "Missing Extra" );
            massert(17422, "missing Extra", e);
        }
        int i = idxNo - NIndexesBase;
        if( i >= NIndexesExtra ) {
            e = e->next(this);
            if ( ! e ) {
                if ( missingExpected )
                    throw MsgAssertionException( 17423 , "missing extra" );
                massert(17424, "missing Extra", e);
            }
            i -= NIndexesExtra;
        }
        return e->details[i];
    }

    // Iterates the first nIndexes slots, optionally also the
    // indexBuildsInProgress slots that follow them.
    NamespaceDetails::IndexIterator::IndexIterator(const NamespaceDetails *_d,
                                                   bool includeBackgroundInProgress) {
        d = _d;
        i = 0;
        n = d->nIndexes;
        if ( includeBackgroundInProgress )
            n += d->indexBuildsInProgress;
    }

    // must be called when renaming a NS to fix up extra
    // Rebuilds this (destination) record's Extra chain by allocating fresh
    // blocks under the new name and copying each of src's blocks into them.
    void NamespaceDetails::copyingFrom( OperationContext* txn,
                                        const StringData& thisns,
                                        NamespaceIndex& ni,
                                        NamespaceDetails* src) {
        _extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
        Extra *se = src->extra();
        int n = NIndexesBase;
        if( se ) {
            Extra *e = allocExtra(txn, thisns, ni, n);
            while( 1 ) {
                n += NIndexesExtra;
                e->copy(this, *se);
                se = se->next(src);
                if( se == 0 ) break;
                Extra *nxt = allocExtra(txn, thisns, ni, n);
                e->setNext( txn, nxt->ofsFrom(this) );
                e = nxt;
            }
            verify( _extraOffset );
        }
    }

    // Declares intent (to the recovery unit) to write this record only, not
    // its chained Extra blocks.
    NamespaceDetails* NamespaceDetails::writingWithoutExtra( OperationContext* txn ) {
        return txn->recoveryUnit()->writing( this );
    }


    // XXX - this method should go away
    /** Make all linked Extra objects writeable as well */
    NamespaceDetails *NamespaceDetails::writingWithExtra( OperationContext* txn ) {
        for( Extra *e = extra(); e; e = e->next( this ) ) {
            txn->recoveryUnit()->writing( e );
        }
        return writingWithoutExtra( txn );
    }

    // Sets the capped-collection document limit after validating the range
    // (must be < 2^31, or -1 for unlimited).
    void NamespaceDetails::setMaxCappedDocs( OperationContext* txn, long long max ) {
        massert( 16499,
                 "max in a capped collection has to be < 2^31 or -1",
                 CollectionOptions::validMaxCappedDocs( &max ) );
        maxDocsInCapped = max;
    }

    /* ------------------------------------------------------------------------- */


    // Linear scan of the index slots for one whose info document's "name"
    // field matches; returns the slot number or -1.
    int NamespaceDetails::_catalogFindIndexByName(const Collection* coll,
                                                  const StringData& name,
                                                  bool includeBackgroundInProgress) const {
        IndexIterator i = ii(includeBackgroundInProgress);
        while( i.more() ) {
            const BSONObj obj = coll->docFor(i.next().info);
            if ( name == obj.getStringField("name") )
                return i.pos()-1;
        }
        return -1;
    }

    // Journaled update of the chain link (offset relative to the owning
    // NamespaceDetails).
    void NamespaceDetails::Extra::setNext( OperationContext* txn,
                                           long ofs ) {
        *txn->recoveryUnit()->writing(&_next) = ofs;
    }

} // namespace mongo

/**
 * Copyright (C) 2008 10gen Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the GNU Affero General Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#pragma once

#include "mongo/db/diskloc.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/storage/mmap_v1/catalog/index_details.h"
#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h"

namespace mongo {

    class Collection;
    class OperationContext;

    /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
       so you can look for a deleterecord about the right size.
    */
    const int Buckets = 19;
    const int MaxBucket = 18;

    extern int bucketSizes[];

#pragma pack(1)
    /* NamespaceDetails : this is the "header" for a collection that has all its details.
       It's in the .ns file and this is a memory mapped region (thus the pack pragma above).

       The byte offsets noted below are part of the on-disk format: do not
       reorder, resize, or insert fields (total size is static-asserted to 496
       at the bottom of this file).
    */
    class NamespaceDetails {
    public:
        enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase  = 10 };



        /*-------- data fields, as present on disk : */

        DiskLoc firstExtent;
        DiskLoc lastExtent;

        /* NOTE: capped collections v1 override the meaning of deletedList.
                 deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
                 the capped namespace.
                 deletedList[1] points to the last record in the prev extent.  When the "current extent"
                 changes, this value is updated.  !deletedList[1].isValid() when this value is not
                 yet computed.
        */
        DiskLoc deletedList[Buckets];

        // ofs 168 (8 byte aligned)
        struct Stats {
            // datasize and nrecords MUST Be adjacent code assumes!
            long long datasize; // this includes padding, but not record headers
            long long nrecords;
        } stats;


        int lastExtentSize;

        int nIndexes;

        // ofs 192
        IndexDetails _indexes[NIndexesBase];

    public:
        // ofs 352 (16 byte aligned)
        int isCapped;                         // there is wasted space here if I'm right (ERH)

        int maxDocsInCapped;                  // max # of objects for a capped table, -1 for inf.

        double paddingFactor;                 // 1.0 = no padding.
        // ofs 368 (16)
        int systemFlagsOldDoNotUse;           // things that the system sets/cares about

        DiskLoc capExtent;                    // the "current" extent we're writing too for a capped collection
        DiskLoc capFirstNewRecord;

        unsigned short _dataFileVersion;      // NamespaceDetails version.  So we can do backward compatibility in the future. See filever.h
        unsigned short _indexFileVersion;

        unsigned long long multiKeyIndexBits; // bit i set => index i is multikey

        // ofs 400 (16)
        unsigned long long _reservedA;
        long long _extraOffset;               // where the $extra info is located (bytes relative to this)

    public:
        int indexBuildsInProgress;            // Number of indexes currently being built

        int userFlags;

        char _reserved[72];
        /*-------- end data 496 bytes */
    public:
        explicit NamespaceDetails( const DiskLoc &loc, bool _capped );

        /**
         * Overflow storage for indexes beyond the 10 inline slots; chained via
         * _next (byte offset relative to the owning NamespaceDetails, 0 = end).
         * Also on-disk format.
         */
        class Extra {
            long long _next;
        public:
            IndexDetails details[NIndexesExtra];
        private:
            unsigned reserved2;
            unsigned reserved3;
            Extra(const Extra&) { verify(false); }
            Extra& operator=(const Extra& r) { verify(false); return *this; }
        public:
            Extra() { }
            long ofsFrom(NamespaceDetails *d) {
                return ((char *) this) - ((char *) d);
            }
            void init() { memset(this, 0, sizeof(Extra)); }
            Extra* next(const NamespaceDetails *d) const {
                if( _next == 0 ) return 0;
                return (Extra*) (((char *) d) + _next);
            }
            void setNext(OperationContext* txn, long ofs);
            void copy(NamespaceDetails *d, const Extra& e) {
                memcpy(this, &e, sizeof(Extra));
                _next = 0;
            }
        };
        Extra* extra() const {
            if( _extraOffset == 0 ) return 0;
            return (Extra *) (((char *) this) + _extraOffset);
        }
        /* add extra space for indexes when more than 10 */
        Extra* allocExtra( OperationContext* txn,
                           const StringData& ns,
                           NamespaceIndex& ni,
                           int nindexessofar );

        void copyingFrom( OperationContext* txn,
                          const StringData& thisns,
                          NamespaceIndex& ni,
                          NamespaceDetails *src); // must be called when renaming a NS to fix up extra

    public:
        void setMaxCappedDocs( OperationContext* txn, long long max );

        enum UserFlags {
            Flag_UsePowerOf2Sizes = 1 << 0
        };

        IndexDetails& idx(int idxNo, bool missingExpected = false );
        const IndexDetails& idx(int idxNo, bool missingExpected = false ) const;

        /**
         * Forward iterator over the index slots (inline + Extra), in slot
         * order.  See the constructor for which slots are included.
         */
        class IndexIterator {
        public:
            int pos() { return i; } // note this is the next one to come
            bool more() { return i < n; }
            const IndexDetails& next() { return d->idx(i++); }
        private:
            friend class NamespaceDetails;
            int i, n;
            const NamespaceDetails *d;
            IndexIterator(const NamespaceDetails *_d, bool includeBackgroundInProgress);
        };

        IndexIterator ii( bool includeBackgroundInProgress = false ) const {
            return IndexIterator(this, includeBackgroundInProgress);
        }

        /**
         * This fetches the IndexDetails for the next empty index slot. The caller must populate
         * returned object. This handles allocating extra index space, if necessary.
         */
        IndexDetails& getNextIndexDetails(OperationContext* txn, Collection* collection);

        NamespaceDetails *writingWithoutExtra( OperationContext* txn );

        /** Make all linked Extra objects writeable as well */
        NamespaceDetails *writingWithExtra( OperationContext* txn );

        /**
         * Returns the offset of the specified index name within the array of indexes. Must be
         * passed-in the owning collection to resolve the index record entries to objects.
         *
         * @return > 0 if index name was found, -1 otherwise.
         */
        int _catalogFindIndexByName(const Collection* coll,
                                    const StringData& name,
                                    bool includeBackgroundInProgress) const;

    private:

        /**
         * swaps all meta data for 2 indexes
         * a and b are 2 index ids, whose contents will be swapped
         * must have a lock on the entire collection to do this
         */
        void swapIndex( OperationContext* txn, int a, int b );

        friend class IndexCatalog;
        friend class IndexCatalogEntry;

        /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
        void cappedTruncateLastDelUpdate();
        BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
        BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
        BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
    }; // NamespaceDetails
    BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
#pragma pack()

} // namespace mongo

// namespace_details_collection_entry.h

/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h"

#include "mongo/db/index/index_descriptor.h"
#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h"
#include "mongo/db/storage/record_store.h"
#include "mongo/util/startup_test.h"

namespace mongo {
    // Adapter exposing an mmap_v1 NamespaceDetails record (plus the record
    // store holding its index-info documents) through the CollectionCatalogEntry
    // interface.  Does not own 'details', 'indexRecordStore' or 'db'.
    NamespaceDetailsCollectionCatalogEntry::NamespaceDetailsCollectionCatalogEntry( const StringData& ns,
                                                                                    NamespaceDetails* details,
                                                                                    RecordStore* indexRecordStore,
                                                                                    MMAPV1DatabaseCatalogEntry* db )
        : CollectionCatalogEntry( ns ),
          _details( details ),
          _indexRecordStore( indexRecordStore ),
          _db( db ) {
    }

    // Options are owned by the database-level catalog entry; delegate.
    CollectionOptions NamespaceDetailsCollectionCatalogEntry::getCollectionOptions(OperationContext* txn) const {
        return _db->getCollectionOptions( txn, ns().ns() );
    }

    // Completed indexes plus those still being built in the background.
    int NamespaceDetailsCollectionCatalogEntry::getTotalIndexCount() const {
        return _details->nIndexes + _details->indexBuildsInProgress;
    }

    int NamespaceDetailsCollectionCatalogEntry::getCompletedIndexCount() const {
        return _details->nIndexes;
    }

    int NamespaceDetailsCollectionCatalogEntry::getMaxAllowedIndexes() const {
        return NamespaceDetails::NIndexesMax;
    }

    // Appends the "name" field of every index (including in-progress builds)
    // to 'names'.
    void NamespaceDetailsCollectionCatalogEntry::getAllIndexes( std::vector<std::string>* names ) const {
        NamespaceDetails::IndexIterator i = _details->ii( true );
        while ( i.more() ) {
            const IndexDetails& id = i.next();
            const BSONObj obj = _indexRecordStore->dataFor( id.info ).toBson();
            names->push_back( obj.getStringField("name") );
        }
    }

    bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(const StringData& idxName) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        return isIndexMultikey( idxNo );
    }

    // Index idxNo is multikey iff its bit is set in multiKeyIndexBits.
    bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(int idxNo) const {
        return (_details->multiKeyIndexBits & (((unsigned long long) 1) << idxNo)) != 0;
    }

    bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
                                                                    const StringData& indexName,
                                                                    bool multikey ) {

        int idxNo = _findIndexNumber( indexName );
        invariant( idxNo >= 0 );
        return setIndexIsMultikey( txn, idxNo, multikey );
    }

    // Sets or clears index idxNo's multikey bit via a journaled write.
    // Returns false (without writing) when the bit already has the requested
    // value, true when it was changed.
    bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
                                                                    int idxNo,
                                                                    bool multikey ) {
        unsigned long long mask = 1ULL << idxNo;

        if (multikey) {
            // Shortcut if the bit is already set correctly
            if (_details->multiKeyIndexBits & mask) {
                return false;
            }

            *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) |= mask;
        }
        else {
            // Shortcut if the bit is already set correctly
            if (!(_details->multiKeyIndexBits & mask)) {
                return false;
            }

            // Invert mask: all 1's except a 0 at the ith bit
            mask = ~mask;
            *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) &= mask;
        }

        return true;
    }

    DiskLoc NamespaceDetailsCollectionCatalogEntry::getIndexHead( const StringData& idxName ) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        return _details->idx( idxNo ).head;
    }

    // Materializes the index's info document from the index record store.
    BSONObj NamespaceDetailsCollectionCatalogEntry::getIndexSpec( const StringData& idxName ) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        const IndexDetails& id = _details->idx( idxNo );
        return _indexRecordStore->dataFor( id.info ).toBson();
    }

    void NamespaceDetailsCollectionCatalogEntry::setIndexHead( OperationContext* txn,
                                                               const StringData& idxName,
                                                               const DiskLoc& newHead ) {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        *txn->recoveryUnit()->writing( &_details->idx( idxNo ).head) = newHead;
    }

    // An index is ready iff its slot precedes the in-progress range
    // (completed slots come first, builds-in-progress after them).
    bool NamespaceDetailsCollectionCatalogEntry::isIndexReady( const StringData& idxName ) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        return idxNo < getCompletedIndexCount();
    }

    // Scans all index slots (including in-progress) for one whose info
    // document's "name" matches; returns the slot number or -1.
    int NamespaceDetailsCollectionCatalogEntry::_findIndexNumber( const StringData& idxName ) const {
        NamespaceDetails::IndexIterator i = _details->ii( true );
        while ( i.more() ) {
            const IndexDetails& id = i.next();
            int idxNo = i.pos() - 1;
            const BSONObj obj = _indexRecordStore->dataFor( id.info ).toBson();
            if ( idxName == obj.getStringField("name") )
                return idxNo;
        }
        return -1;
    }

    /* remove bit from a bit array - actually remove its slot, not a clear
       note: this function does not work with x == 63 -- that is ok
             but keep in mind in the future if max indexes were extended to
             exactly 64 it would be a problem
    */
    unsigned long long removeAndSlideBit(unsigned long long b, int x) {
        unsigned long long tmp = b;
        return
            (tmp & ((((unsigned long long) 1) << x)-1)) |
            ((tmp >> (x+1)) << x);
    }

    // Startup-time sanity checks for removeAndSlideBit.
    class IndexUpdateTest : public StartupTest {
    public:
        void run() {
            verify( removeAndSlideBit(1, 0) == 0 );
            verify( removeAndSlideBit(2, 0) == 1 );
            verify( removeAndSlideBit(2, 1) == 0 );
            verify( removeAndSlideBit(255, 1) == 127 );
            verify( removeAndSlideBit(21, 2) == 9 );
            verify( removeAndSlideBit(0x4000000000000001ULL, 62) == 1 );
        }
    } iu_unittest;

    Status NamespaceDetailsCollectionCatalogEntry::removeIndex(
OperationContext* txn, + const StringData& indexName ) { + int idxNo = _findIndexNumber( indexName ); + if ( idxNo < 0 ) + return Status( ErrorCodes::NamespaceNotFound, "index not found to remove" ); + + DiskLoc infoLocation = _details->idx( idxNo ).info; + + { // sanity check + BSONObj info = _indexRecordStore->dataFor( infoLocation ).toBson(); + invariant( info["name"].String() == indexName ); + } + + { // drop the namespace + string indexNamespace = IndexDescriptor::makeIndexNamespace( ns().ns(), indexName ); + Status status = _db->dropCollection( txn, indexNamespace ); + if ( !status.isOK() ) { + return status; + } + } + + { // all info in the .ns file + NamespaceDetails* d = _details->writingWithExtra( txn ); + + // fix the _multiKeyIndexBits, by moving all bits above me down one + d->multiKeyIndexBits = removeAndSlideBit(d->multiKeyIndexBits, idxNo); + + if ( idxNo >= d->nIndexes ) + d->indexBuildsInProgress--; + else + d->nIndexes--; + + for ( int i = idxNo; i < getTotalIndexCount(); i++ ) + d->idx(i) = d->idx(i+1); + + d->idx( getTotalIndexCount() ) = IndexDetails(); + } + + // remove from system.indexes + _indexRecordStore->deleteRecord( txn, infoLocation ); + + return Status::OK(); + } + + Status NamespaceDetailsCollectionCatalogEntry::prepareForIndexBuild( OperationContext* txn, + const IndexDescriptor* desc ) { + BSONObj spec = desc->infoObj(); + // 1) entry in system.indexs + StatusWith<DiskLoc> systemIndexesEntry = _indexRecordStore->insertRecord( txn, + spec.objdata(), + spec.objsize(), + -1 ); + if ( !systemIndexesEntry.isOK() ) + return systemIndexesEntry.getStatus(); + + // 2) NamespaceDetails mods + IndexDetails *id; + try { + id = &_details->idx(getTotalIndexCount(), true); + } + catch( DBException& ) { + _details->allocExtra(txn, + ns().ns(), + _db->_namespaceIndex, + getTotalIndexCount()); + id = &_details->idx(getTotalIndexCount(), false); + } + + *txn->recoveryUnit()->writing( &id->info ) = systemIndexesEntry.getValue(); + 
*txn->recoveryUnit()->writing( &id->head ) = DiskLoc(); + + txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) += 1; + + // 3) indexes entry in .ns file + NamespaceIndex& nsi = _db->_namespaceIndex; + invariant( nsi.details( desc->indexNamespace() ) == NULL ); + nsi.add_ns( txn, desc->indexNamespace(), DiskLoc(), false ); + + // 4) system.namespaces entry index ns + _db->_addNamespaceToNamespaceCollection( txn, desc->indexNamespace(), NULL); + + return Status::OK(); + } + + void NamespaceDetailsCollectionCatalogEntry::indexBuildSuccess( OperationContext* txn, + const StringData& indexName ) { + int idxNo = _findIndexNumber( indexName ); + fassert( 17202, idxNo >= 0 ); + + // Make sure the newly created index is relocated to nIndexes, if it isn't already there + if ( idxNo != getCompletedIndexCount() ) { + int toIdxNo = getCompletedIndexCount(); + + //_details->swapIndex( txn, idxNo, toIdxNo ); + + // flip main meta data + IndexDetails temp = _details->idx(idxNo); + *txn->recoveryUnit()->writing(&_details->idx(idxNo)) = _details->idx(toIdxNo); + *txn->recoveryUnit()->writing(&_details->idx(toIdxNo)) = temp; + + // flip multi key bits + bool tempMultikey = isIndexMultikey(idxNo); + setIndexIsMultikey( txn, idxNo, isIndexMultikey(toIdxNo) ); + setIndexIsMultikey( txn, toIdxNo, tempMultikey ); + + idxNo = toIdxNo; + invariant( idxNo = _findIndexNumber( indexName ) ); + } + + txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) -= 1; + txn->recoveryUnit()->writingInt( _details->nIndexes ) += 1; + + invariant( isIndexReady( indexName ) ); + } + + void NamespaceDetailsCollectionCatalogEntry::updateTTLSetting( OperationContext* txn, + const StringData& idxName, + long long newExpireSeconds ) { + int idx = _findIndexNumber( idxName ); + invariant( idx >= 0 ); + + IndexDetails& indexDetails = _details->idx( idx ); + + BSONObj obj = _indexRecordStore->dataFor( indexDetails.info ).toBson(); + const BSONElement oldExpireSecs = 
obj.getField("expireAfterSeconds"); + + // Important that we set the new value in-place. We are writing directly to the + // object here so must be careful not to overwrite with a longer numeric type. + + char* nonConstPtr = const_cast<char*>(oldExpireSecs.value()); + switch( oldExpireSecs.type() ) { + case EOO: + massert( 16631, "index does not have an 'expireAfterSeconds' field", false ); + break; + case NumberInt: + *txn->recoveryUnit()->writing(reinterpret_cast<int*>(nonConstPtr)) = newExpireSeconds; + break; + case NumberDouble: + *txn->recoveryUnit()->writing(reinterpret_cast<double*>(nonConstPtr)) = newExpireSeconds; + break; + case NumberLong: + *txn->recoveryUnit()->writing(reinterpret_cast<long long*>(nonConstPtr)) = newExpireSeconds; + break; + default: + massert( 16632, "current 'expireAfterSeconds' is not a number", false ); + } + } + + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h new file mode 100644 index 00000000000..78a5b96f181 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h @@ -0,0 +1,109 @@ +// namespace_details_collection_entry.h + +#pragma once + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#include "mongo/base/string_data.h" +#include "mongo/bson/bsonobj.h" +#include "mongo/db/catalog/collection_catalog_entry.h" +#include "mongo/db/diskloc.h" + +namespace mongo { + + class NamespaceDetails; + + class MMAPV1DatabaseCatalogEntry;; + class RecordStore; + class OperationContext; + + class NamespaceDetailsCollectionCatalogEntry : public CollectionCatalogEntry { + public: + NamespaceDetailsCollectionCatalogEntry( const StringData& ns, + NamespaceDetails* details, + RecordStore* indexRecordStore, + MMAPV1DatabaseCatalogEntry* db ); + + virtual ~NamespaceDetailsCollectionCatalogEntry(){} + + virtual CollectionOptions getCollectionOptions(OperationContext* txn) const; + + virtual int getTotalIndexCount() const; + + virtual int getCompletedIndexCount() const; + + virtual int getMaxAllowedIndexes() const; + + virtual void getAllIndexes( std::vector<std::string>* names ) const; + + virtual BSONObj getIndexSpec( const StringData& idxName ) const; + + virtual bool isIndexMultikey(const StringData& indexName) const; + virtual bool isIndexMultikey(int idxNo) const; + + virtual bool setIndexIsMultikey(OperationContext* txn, + int idxNo, + bool multikey = true); + virtual bool 
setIndexIsMultikey(OperationContext* txn, + const StringData& indexName, + bool multikey = true); + + virtual DiskLoc getIndexHead( const StringData& indexName ) const; + + virtual void setIndexHead( OperationContext* txn, + const StringData& indexName, + const DiskLoc& newHead ); + + virtual bool isIndexReady( const StringData& indexName ) const; + + virtual Status removeIndex( OperationContext* txn, + const StringData& indexName ); + + virtual Status prepareForIndexBuild( OperationContext* txn, + const IndexDescriptor* spec ); + + virtual void indexBuildSuccess( OperationContext* txn, + const StringData& indexName ); + + virtual void updateTTLSetting( OperationContext* txn, + const StringData& idxName, + long long newExpireSeconds ); + + // not part of interface, but available to my storage engine + + int _findIndexNumber( const StringData& indexName) const; + + private: + NamespaceDetails* _details; + RecordStore* _indexRecordStore; + MMAPV1DatabaseCatalogEntry* _db; + + friend class MMAPV1DatabaseCatalogEntry; + }; +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp new file mode 100644 index 00000000000..2f168bd19a6 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp @@ -0,0 +1,225 @@ +// namespace_details_rsv1_metadata.cpp + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/ops/update.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" + +namespace mongo { + NamespaceDetailsRSV1MetaData::NamespaceDetailsRSV1MetaData( const StringData& ns, + NamespaceDetails* details, + RecordStore* namespaceRecordStore ) + : _ns( ns.toString() ), + _details( details ), + _namespaceRecordStore( namespaceRecordStore ) { + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::capExtent() const { + return _details->capExtent; + } + + void NamespaceDetailsRSV1MetaData::setCapExtent( OperationContext* txn, const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->capExtent ) = loc; + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::capFirstNewRecord() const { + return _details->capFirstNewRecord; + } + + void NamespaceDetailsRSV1MetaData::setCapFirstNewRecord( OperationContext* txn, + const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->capFirstNewRecord ) = loc; + } + + bool NamespaceDetailsRSV1MetaData::capLooped() const { + return 
_details->capFirstNewRecord.isValid(); + } + + long long NamespaceDetailsRSV1MetaData::dataSize() const { + return _details->stats.datasize; + } + long long NamespaceDetailsRSV1MetaData::numRecords() const { + return _details->stats.nrecords; + } + + void NamespaceDetailsRSV1MetaData::incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) { + // durability todo : this could be a bit annoying / slow to record constantly + NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats ); + s->datasize += dataSizeIncrement; + s->nrecords += numRecordsIncrement; + } + + void NamespaceDetailsRSV1MetaData::setStats( OperationContext* txn, + long long dataSize, + long long numRecords ) { + NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats ); + s->datasize = dataSize; + s->nrecords = numRecords; + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::deletedListEntry( int bucket ) const { + return _details->deletedList[ bucket ]; + } + + void NamespaceDetailsRSV1MetaData::setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->deletedList[bucket] ) = loc; + } + + void NamespaceDetailsRSV1MetaData::orphanDeletedList( OperationContext* txn ) { + for( int i = 0; i < Buckets; i++ ) { + setDeletedListEntry( txn, i, DiskLoc() ); + } + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::firstExtent( OperationContext* txn ) const { + return _details->firstExtent; + } + + void NamespaceDetailsRSV1MetaData::setFirstExtent( OperationContext* txn, const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->firstExtent ) = loc; + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::lastExtent( OperationContext* txn ) const { + return _details->lastExtent; + } + + void NamespaceDetailsRSV1MetaData::setLastExtent( OperationContext* txn, const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->lastExtent ) = loc; + } + + 
bool NamespaceDetailsRSV1MetaData::isCapped() const { + return _details->isCapped; + } + + bool NamespaceDetailsRSV1MetaData::isUserFlagSet( int flag ) const { + return _details->userFlags & flag; + } + + int NamespaceDetailsRSV1MetaData::userFlags() const { + return _details->userFlags; + } + + bool NamespaceDetailsRSV1MetaData::setUserFlag( OperationContext* txn, int flag ) { + if ( ( _details->userFlags & flag ) == flag ) + return false; + + txn->recoveryUnit()->writingInt( _details->userFlags) |= flag; + _syncUserFlags( txn ); + return true; + } + + bool NamespaceDetailsRSV1MetaData::clearUserFlag( OperationContext* txn, int flag ) { + if ( ( _details->userFlags & flag ) == 0 ) + return false; + + txn->recoveryUnit()->writingInt(_details->userFlags) &= ~flag; + _syncUserFlags( txn ); + return true; + } + + bool NamespaceDetailsRSV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) { + if ( _details->userFlags == flags ) + return false; + + txn->recoveryUnit()->writingInt(_details->userFlags) = flags; + _syncUserFlags( txn ); + return true; + } + + int NamespaceDetailsRSV1MetaData::lastExtentSize( OperationContext* txn ) const { + return _details->lastExtentSize; + } + + void NamespaceDetailsRSV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) { + if ( _details->lastExtentSize == newMax ) + return; + txn->recoveryUnit()->writingInt(_details->lastExtentSize) = newMax; + } + + long long NamespaceDetailsRSV1MetaData::maxCappedDocs() const { + invariant( _details->isCapped ); + if ( _details->maxDocsInCapped == 0x7fffffff ) + return numeric_limits<long long>::max(); + return _details->maxDocsInCapped; + } + + double NamespaceDetailsRSV1MetaData::paddingFactor() const { + return _details->paddingFactor; + } + + void NamespaceDetailsRSV1MetaData::setPaddingFactor( OperationContext* txn, double paddingFactor ) { + if ( paddingFactor == _details->paddingFactor ) + return; + + if ( _details->isCapped ) + return; + + 
*txn->recoveryUnit()->writing(&_details->paddingFactor) = paddingFactor; + } + + void NamespaceDetailsRSV1MetaData::_syncUserFlags( OperationContext* txn ) { + if ( !_namespaceRecordStore ) + return; + + scoped_ptr<RecordIterator> iterator( _namespaceRecordStore->getIterator( txn, + DiskLoc(), + false, + CollectionScanParams::FORWARD ) ); + while ( !iterator->isEOF() ) { + DiskLoc loc = iterator->getNext(); + + BSONObj oldEntry = iterator->dataFor( loc ).toBson(); + BSONElement e = oldEntry["name"]; + if ( e.type() != String ) + continue; + + if ( e.String() != _ns ) + continue; + + BSONObj newEntry = applyUpdateOperators( oldEntry, + BSON( "$set" << BSON( "options.flags" << userFlags() ) ) ); + + StatusWith<DiskLoc> result = _namespaceRecordStore->updateRecord( txn, + loc, + newEntry.objdata(), + newEntry.objsize(), + -1, + NULL ); + fassert( 17486, result.isOK() ); + return; + } + + fassertFailed( 17488 ); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h new file mode 100644 index 00000000000..9f933d003e5 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h @@ -0,0 +1,111 @@ +// namespace_details_rsv1_metadata.h + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <string> + +#include "mongo/base/string_data.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class RecordStore; + + /* + * NOTE: NamespaceDetails will become a struct + * all dur, etc... 
will move here + */ + class NamespaceDetailsRSV1MetaData : public RecordStoreV1MetaData { + public: + explicit NamespaceDetailsRSV1MetaData( const StringData& ns, + NamespaceDetails* details, + RecordStore* namespaceRecordStore ); + + virtual ~NamespaceDetailsRSV1MetaData(){} + + virtual const DiskLoc& capExtent() const; + virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& capFirstNewRecord() const; + virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ); + + virtual bool capLooped() const; + + virtual long long dataSize() const; + virtual long long numRecords() const; + + virtual void incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ); + + virtual void setStats( OperationContext* txn, + long long dataSize, + long long numRecords ); + + virtual const DiskLoc& deletedListEntry( int bucket ) const; + virtual void setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ); + virtual void orphanDeletedList(OperationContext* txn); + + virtual const DiskLoc& firstExtent( OperationContext* txn ) const; + virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& lastExtent( OperationContext* txn ) const; + virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual bool isCapped() const; + + virtual bool isUserFlagSet( int flag ) const; + virtual int userFlags() const; + virtual bool setUserFlag( OperationContext* txn, int flag ); + virtual bool clearUserFlag( OperationContext* txn, int flag ); + virtual bool replaceUserFlags( OperationContext* txn, int flags ); + + virtual int lastExtentSize( OperationContext* txn ) const; + virtual void setLastExtentSize( OperationContext* txn, int newMax ); + + virtual long long maxCappedDocs() const; + + virtual double paddingFactor() const; + virtual void setPaddingFactor( OperationContext* txn, double paddingFactor ); + + 
private: + + void _syncUserFlags( OperationContext* txn ); + + std::string _ns; + NamespaceDetails* _details; + RecordStore* _namespaceRecordStore; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp new file mode 100644 index 00000000000..9bbf8ef6303 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp @@ -0,0 +1,205 @@ +// namespace_index.cpp + +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" + +#include <boost/filesystem/operations.hpp> + +#include "mongo/db/d_concurrency.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" +#include "mongo/util/exit.h" +#include "mongo/util/log.h" + +namespace mongo { + + MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kIndexing); + + NamespaceDetails* NamespaceIndex::details(const StringData& ns) { + Namespace n(ns); + return details(n); + } + + NamespaceDetails* NamespaceIndex::details(const Namespace& ns) { + if ( !_ht.get() ) + return 0; + return _ht->get(ns); + } + + void NamespaceIndex::add_ns( OperationContext* txn, + const StringData& ns, const DiskLoc& loc, bool capped) { + NamespaceDetails details( loc, capped ); + add_ns( txn, ns, &details ); + } + + void NamespaceIndex::add_ns( OperationContext* txn, + const StringData& ns, const NamespaceDetails* details ) { + Namespace n(ns); + add_ns( txn, n, details ); + } + + void NamespaceIndex::add_ns( OperationContext* txn, + const Namespace& ns, const NamespaceDetails* details ) { + string nsString = ns.toString(); + txn->lockState()->assertWriteLocked( nsString ); + massert( 17315, "no . in ns", nsString.find( '.' ) != string::npos ); + init( txn ); + uassert( 10081, "too many namespaces/collections", _ht->put(txn, ns, *details)); + } + + void NamespaceIndex::kill_ns( OperationContext* txn, const StringData& ns) { + txn->lockState()->assertWriteLocked(ns); + if ( !_ht.get() ) + return; + Namespace n(ns); + _ht->kill(txn, n); + + if (ns.size() <= Namespace::MaxNsColletionLen) { + // Larger namespace names don't have room for $extras so they can't exist. The code + // below would cause an "$extra: ns too large" error and stacktrace to be printed to the + // log even though everything is fine. 
+ for( int i = 0; i<=1; i++ ) { + try { + Namespace extra(n.extraName(i)); + _ht->kill(txn, extra); + } + catch(DBException&) { + LOG(3) << "caught exception in kill_ns" << endl; + } + } + } + } + + bool NamespaceIndex::pathExists() const { + return boost::filesystem::exists(path()); + } + + boost::filesystem::path NamespaceIndex::path() const { + boost::filesystem::path ret( _dir ); + if (storageGlobalParams.directoryperdb) + ret /= _database; + ret /= ( _database + ".ns" ); + return ret; + } + + static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , list<string>* l ) { + if ( ! k.hasDollarSign() || k == "local.oplog.$main" ) { + // we call out local.oplog.$main specifically as its the only "normal" + // collection that has a $, so we make sure it gets added + l->push_back( k.toString() ); + } + } + + void NamespaceIndex::getCollectionNamespaces( list<string>* tofill ) const { + if ( _ht.get() ) + _ht->iterAll( stdx::bind( namespaceGetNamespacesCallback, + stdx::placeholders::_1, stdx::placeholders::_2, tofill) ); + } + + void NamespaceIndex::maybeMkdir() const { + if (!storageGlobalParams.directoryperdb) + return; + boost::filesystem::path dir( _dir ); + dir /= _database; + if ( !boost::filesystem::exists( dir ) ) + MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " ); + } + + NOINLINE_DECL void NamespaceIndex::_init( OperationContext* txn ) { + verify( !_ht.get() ); + + txn->lockState()->assertWriteLocked(_database); + + /* if someone manually deleted the datafiles for a database, + we need to be sure to clear any cached info for the database in + local.*. 
+ */ + /* + if ( "local" != _database ) { + DBInfo i(_database.c_str()); + i.dbDropped(); + } + */ + + unsigned long long len = 0; + boost::filesystem::path nsPath = path(); + string pathString = nsPath.string(); + void *p = 0; + if ( boost::filesystem::exists(nsPath) ) { + if( _f.open(pathString, true) ) { + len = _f.length(); + if ( len % (1024*1024) != 0 ) { + log() << "bad .ns file: " << pathString << endl; + uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 ); + } + p = _f.getView(); + } + } + else { + // use storageGlobalParams.lenForNewNsFiles, we are making a new database + massert(10343, "bad storageGlobalParams.lenForNewNsFiles", + storageGlobalParams.lenForNewNsFiles >= 1024*1024); + maybeMkdir(); + unsigned long long l = storageGlobalParams.lenForNewNsFiles; + if ( _f.create(pathString, l, true) ) { + // The writes done in this function must not be rolled back. If the containing + // UnitOfWork rolls back it should roll back to the state *after* these writes. This + // will leave the file empty, but available for future use. That is why we go + // directly to the global dur dirty list rather than going through the + // OperationContext. + getDur().createdFile(pathString, l); // always a new file + len = l; + verify(len == storageGlobalParams.lenForNewNsFiles); + p = _f.getView(); + + if ( p ) { + // we do this so the durability system isn't mad at us for + // only initiating file and not doing a write + // grep for 17388 + getDur().writingPtr( p, 5 ); // throw away + } + } + } + + if ( p == 0 ) { + /** TODO: this shouldn't terminate? 
*/ + log() << "error couldn't open file " << pathString << " terminating" << endl; + dbexit( EXIT_FS ); + } + + + verify( len <= 0x7fffffff ); + _ht.reset(new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index")); + } + + +} + diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h new file mode 100644 index 00000000000..3ce2c2e0194 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h @@ -0,0 +1,94 @@ +// namespace_index.h + +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#pragma once + +#include <list> +#include <string> + +#include "mongo/base/disallow_copying.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/mmap_v1/catalog/hashtab.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" + +namespace mongo { + + class NamespaceDetails; + class OperationContext; + + /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog" + if you will: at least the core parts. (Additional info in system.* collections.) + */ + class NamespaceIndex { + MONGO_DISALLOW_COPYING(NamespaceIndex); + public: + NamespaceIndex(const std::string &dir, const std::string &database) : + _ht( 0 ), _dir( dir ), _database( database ) {} + + /* returns true if the file represented by this file exists on disk */ + bool pathExists() const; + + void init( OperationContext* txn ) { + if ( !_ht.get() ) + _init( txn ); + } + + void add_ns( OperationContext* txn, + const StringData& ns, const DiskLoc& loc, bool capped); + void add_ns( OperationContext* txn, + const StringData& ns, const NamespaceDetails* details ); + void add_ns( OperationContext* txn, + const Namespace& ns, const NamespaceDetails* details ); + + NamespaceDetails* details(const StringData& ns); + NamespaceDetails* details(const Namespace& ns); + + void kill_ns( OperationContext* txn, + const StringData& ns); + + bool allocated() const { return _ht.get() != 0; } + + void getCollectionNamespaces( std::list<std::string>* tofill ) const; + + boost::filesystem::path path() const; + + unsigned long long fileLength() const { return _f.length(); } + + private: + void _init( OperationContext* txn ); + void maybeMkdir() const; + + DurableMappedFile _f; + scoped_ptr<HashTable<Namespace,NamespaceDetails> > _ht; + std::string _dir; + std::string _database; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp new file mode 100644 index 00000000000..7c50b86a5bf --- /dev/null +++ 
b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp @@ -0,0 +1,67 @@ +// namespace_test.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#include "mongo/unittest/unittest.h" + +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" + +namespace mongo { + + TEST( NamespaceTest, Basics ) { + Namespace foo( "foo.bar" ); + Namespace bar( "bar.foo" ); + + ASSERT_EQUALS( foo.toString(), foo.toString() ); + ASSERT_EQUALS( foo.hash(), foo.hash() ); + + ASSERT_NOT_EQUALS( foo.hash(), bar.hash() ); + + ASSERT( foo == foo ); + ASSERT( !( foo != foo ) ); + ASSERT( foo != bar ); + ASSERT( !( foo == bar ) ); + } + + TEST( NamespaceTest, ExtraName ) { + Namespace foo( "foo.bar" ); + ASSERT_FALSE( foo.isExtra() ); + + string str0 = foo.extraName( 0 ); + ASSERT_EQUALS( "foo.bar$extra", str0 ); + Namespace ex0( str0 ); + ASSERT_TRUE( ex0.isExtra() ); + + string str1 = foo.extraName( 1 ); + ASSERT_EQUALS( "foo.bar$extrb", str1 ); + Namespace ex1( str1 ); + ASSERT_TRUE( ex1.isExtra() ); + + } +} diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.cpp b/src/mongo/db/storage/mmap_v1/dur_recover.cpp index 9d4e679808a..52836e7977f 100644 --- a/src/mongo/db/storage/mmap_v1/dur_recover.cpp +++ b/src/mongo/db/storage/mmap_v1/dur_recover.cpp @@ -40,6 +40,7 @@ #include "mongo/db/catalog/database.h" #include "mongo/db/db.h" #include "mongo/db/storage/storage_engine.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" #include "mongo/db/storage/mmap_v1/dur.h" #include "mongo/db/storage/mmap_v1/dur_commitjob.h" #include "mongo/db/storage/mmap_v1/dur_journal.h" diff --git a/src/mongo/db/storage/mmap_v1/extent.h b/src/mongo/db/storage/mmap_v1/extent.h index 8a27e271c04..f009e283380 100644 --- a/src/mongo/db/storage/mmap_v1/extent.h +++ b/src/mongo/db/storage/mmap_v1/extent.h @@ -34,7 +34,7 @@ #include <vector> #include "mongo/db/diskloc.h" -#include "mongo/db/structure/catalog/namespace.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" namespace mongo { diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp index 
303ac49e507..f8ca6265c5f 100644 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp +++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp @@ -42,11 +42,12 @@ #include "mongo/db/pdfile_version.h" #include "mongo/db/server_parameters.h" #include "mongo/db/storage/mmap_v1/data_file.h" -#include "mongo/db/structure/catalog/namespace_details.h" -#include "mongo/db/structure/catalog/namespace_details_collection_entry.h" -#include "mongo/db/structure/catalog/namespace_details_rsv1_metadata.h" -#include "mongo/db/structure/record_store_v1_capped.h" -#include "mongo/db/structure/record_store_v1_simple.h" +#include "mongo/db/storage/mmap_v1/btree/btree_interface.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" namespace mongo { @@ -444,7 +445,7 @@ namespace mongo { void MMAPV1DatabaseCatalogEntry::_lazyInit( OperationContext* txn ) { // this is sort of insane - // it's because the whole structure is highly recursive + // it's because the whole storage/mmap_v1 is highly recursive _namespaceIndex.init( txn ); @@ -682,13 +683,13 @@ namespace mongo { rs = entry->recordStore.get(); } - std::auto_ptr<BtreeInterface> btree( - BtreeInterface::getInterface(entry->headManager(), - rs, - entry->ordering(), - entry->descriptor()->indexNamespace(), - entry->descriptor()->version(), - &BtreeBasedAccessMethod::invalidateCursors)); + std::auto_ptr<SortedDataInterface> btree( + getMMAPV1Interface(entry->headManager(), + rs, + entry->ordering(), + entry->descriptor()->indexNamespace(), + entry->descriptor()->version(), + &BtreeBasedAccessMethod::invalidateCursors)); if (IndexNames::HASHED == type) return new HashAccessMethod( entry, btree.release() ); diff 
--git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h index 16a88b84ede..fa5a5874061 100644 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h +++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h @@ -33,8 +33,8 @@ #include "mongo/base/status.h" #include "mongo/base/string_data.h" #include "mongo/db/catalog/database_catalog_entry.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" #include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h" -#include "mongo/db/structure/catalog/namespace_index.h" namespace mongo { diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp new file mode 100644 index 00000000000..3a1bed72dd9 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp @@ -0,0 +1,974 @@ +// record_store_v1_base.cpp + +/** + * Copyright (C) 2013-2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/concurrency/lock_mgr.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h" +#include "mongo/util/progress_meter.h" +#include "mongo/util/timer.h" +#include "mongo/util/touch_pages.h" + +namespace mongo { + + const int RecordStoreV1Base::Buckets = 19; + const int RecordStoreV1Base::MaxBucket = 18; + + /* Deleted list buckets are used to quickly locate free space based on size. Each bucket + contains records up to that size. All records >= 4mb are placed into the 16mb bucket. 
+ */ + const int RecordStoreV1Base::bucketSizes[] = { + 0x20, 0x40, 0x80, 0x100, // 32, 64, 128, 256 + 0x200, 0x400, 0x800, 0x1000, // 512, 1K, 2K, 4K + 0x2000, 0x4000, 0x8000, 0x10000, // 8K, 16K, 32K, 64K + 0x20000, 0x40000, 0x80000, 0x100000, // 128K, 256K, 512K, 1M + 0x200000, 0x400000, 0x1000000, // 2M, 4M, 16M (see above) + }; + + + RecordStoreV1Base::RecordStoreV1Base( const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ) + : RecordStore( ns ), + _details( details ), + _extentManager( em ), + _isSystemIndexes( isSystemIndexes ) { + } + + RecordStoreV1Base::~RecordStoreV1Base() { + } + + + int64_t RecordStoreV1Base::storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo, + int level ) const { + BSONArrayBuilder extentInfo; + + int64_t total = 0; + int n = 0; + + DiskLoc cur = _details->firstExtent(txn); + + while ( !cur.isNull() ) { + Extent* e = _extentManager->getExtent( cur ); + + total += e->length; + n++; + + if ( extraInfo && level > 0 ) { + extentInfo.append( BSON( "len" << e->length << "loc: " << e->myLoc.toBSONObj() ) ); + } + cur = e->xnext; + } + + if ( extraInfo ) { + extraInfo->append( "numExtents", n ); + if ( level > 0 ) + extraInfo->append( "extents", extentInfo.arr() ); + } + + return total; + } + + RecordData RecordStoreV1Base::dataFor( const DiskLoc& loc ) const { + return recordFor(loc)->toRecordData(); + } + + Record* RecordStoreV1Base::recordFor( const DiskLoc& loc ) const { + return _extentManager->recordForV1( loc ); + } + + const DeletedRecord* RecordStoreV1Base::deletedRecordFor( const DiskLoc& loc ) const { + invariant( loc.a() != -1 ); + return reinterpret_cast<const DeletedRecord*>( recordFor( loc ) ); + } + + DeletedRecord* RecordStoreV1Base::drec( const DiskLoc& loc ) const { + invariant( loc.a() != -1 ); + return reinterpret_cast<DeletedRecord*>( recordFor( loc ) ); + } + + Extent* RecordStoreV1Base::_getExtent( OperationContext* txn, const DiskLoc& loc ) const { + return 
_extentManager->getExtent( loc ); + } + + DiskLoc RecordStoreV1Base::_getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const { + return _extentManager->extentLocForV1( loc ); + } + + + DiskLoc RecordStoreV1Base::getNextRecord( OperationContext* txn, const DiskLoc& loc ) const { + DiskLoc next = getNextRecordInExtent( txn, loc ); + if ( !next.isNull() ) { + return next; + } + + // now traverse extents + + Extent* e = _getExtent( txn, _getExtentLocForRecord(txn, loc) ); + while ( 1 ) { + if ( e->xnext.isNull() ) + return DiskLoc(); // end of collection + e = _getExtent( txn, e->xnext ); + if ( !e->firstRecord.isNull() ) + break; + // entire extent could be empty, keep looking + } + return e->firstRecord; + } + + DiskLoc RecordStoreV1Base::getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const { + DiskLoc prev = getPrevRecordInExtent( txn, loc ); + if ( !prev.isNull() ) { + return prev; + } + + // now traverse extents + + Extent *e = _getExtent(txn, _getExtentLocForRecord(txn, loc)); + while ( 1 ) { + if ( e->xprev.isNull() ) + return DiskLoc(); // end of collection + e = _getExtent( txn, e->xprev ); + if ( !e->firstRecord.isNull() ) + break; + // entire extent could be empty, keep looking + } + return e->lastRecord; + + } + + DiskLoc RecordStoreV1Base::_findFirstSpot( OperationContext* txn, + const DiskLoc& extDiskLoc, + Extent* e ) { + DiskLoc emptyLoc = extDiskLoc; + emptyLoc.inc( Extent::HeaderSize() ); + int delRecLength = e->length - Extent::HeaderSize(); + if ( delRecLength >= 32*1024 && _ns.find('$') != string::npos && !isCapped() ) { + // probably an index. 
so skip forward to keep its records page aligned + int& ofs = emptyLoc.GETOFS(); + int newOfs = (ofs + 0xfff) & ~0xfff; + delRecLength -= (newOfs-ofs); + dassert( delRecLength > 0 ); + ofs = newOfs; + } + + DeletedRecord* empty = txn->recoveryUnit()->writing(drec(emptyLoc)); + empty->lengthWithHeaders() = delRecLength; + empty->extentOfs() = e->myLoc.getOfs(); + empty->nextDeleted().Null(); + return emptyLoc; + + } + + DiskLoc RecordStoreV1Base::getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const { + int nextOffset = recordFor( loc )->nextOfs(); + + if ( nextOffset == DiskLoc::NullOfs ) + return DiskLoc(); + + fassert( 17441, abs(nextOffset) >= 8 ); // defensive + DiskLoc result( loc.a(), nextOffset ); + return result; + } + + DiskLoc RecordStoreV1Base::getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const { + int prevOffset = recordFor( loc )->prevOfs(); + + if ( prevOffset == DiskLoc::NullOfs ) + return DiskLoc(); + + fassert( 17442, abs(prevOffset) >= 8 ); // defensive + DiskLoc result( loc.a(), prevOffset ); + return result; + } + + + StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ) { + int docSize = doc->documentSize(); + if ( docSize < 4 ) { + return StatusWith<DiskLoc>( ErrorCodes::InvalidLength, + "record has to be >= 4 bytes" ); + } + int lenWHdr = docSize + Record::HeaderSize; + if ( doc->addPadding() ) + lenWHdr = getRecordAllocationSize( lenWHdr ); + + StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota ); + if ( !loc.isOK() ) + return loc; + + Record *r = recordFor( loc.getValue() ); + fassert( 17319, r->lengthWithHeaders() >= lenWHdr ); + + r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) ); + doc->writeDocument( r->data() ); + + _addRecordToRecListInExtent(txn, r, loc.getValue()); + + _details->incrementStats( txn, r->netLength(), 1 ); + + _paddingFits( txn ); + + return loc; + } + + + 
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ) { + if ( len < 4 ) { + return StatusWith<DiskLoc>( ErrorCodes::InvalidLength, + "record has to be >= 4 bytes" ); + } + + StatusWith<DiskLoc> status = _insertRecord( txn, data, len, enforceQuota ); + if ( status.isOK() ) + _paddingFits( txn ); + + return status; + } + + StatusWith<DiskLoc> RecordStoreV1Base::_insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ) { + + int lenWHdr = getRecordAllocationSize( len + Record::HeaderSize ); + fassert( 17208, lenWHdr >= ( len + Record::HeaderSize ) ); + + StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota ); + if ( !loc.isOK() ) + return loc; + + Record *r = recordFor( loc.getValue() ); + fassert( 17210, r->lengthWithHeaders() >= lenWHdr ); + + // copy the data + r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) ); + memcpy( r->data(), data, len ); + + _addRecordToRecListInExtent(txn, r, loc.getValue()); + + _details->incrementStats( txn, r->netLength(), 1 ); + + return loc; + } + + StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int dataSize, + bool enforceQuota, + UpdateMoveNotifier* notifier ) { + Record* oldRecord = recordFor( oldLocation ); + if ( oldRecord->netLength() >= dataSize ) { + // we fit + _paddingFits( txn ); + memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize ); + return StatusWith<DiskLoc>( oldLocation ); + } + + if ( isCapped() ) + return StatusWith<DiskLoc>( ErrorCodes::InternalError, + "failing update: objects in a capped ns cannot grow", + 10003 ); + + // we have to move + + _paddingTooSmall( txn ); + + StatusWith<DiskLoc> newLocation = _insertRecord( txn, data, dataSize, enforceQuota ); + if ( !newLocation.isOK() ) + return newLocation; + + // insert worked, so we delete old record + 
if ( notifier ) { + Status moveStatus = notifier->recordStoreGoingToMove( txn, + oldLocation, + oldRecord->data(), + oldRecord->netLength() ); + if ( !moveStatus.isOK() ) + return StatusWith<DiskLoc>( moveStatus ); + } + + deleteRecord( txn, oldLocation ); + + return newLocation; + } + + + Status RecordStoreV1Base::updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damageSource, + const mutablebson::DamageVector& damages ) { + _paddingFits( txn ); + + Record* rec = recordFor( loc ); + char* root = rec->data(); + + // All updates were in place. Apply them via durability and writing pointer. + mutablebson::DamageVector::const_iterator where = damages.begin(); + const mutablebson::DamageVector::const_iterator end = damages.end(); + for( ; where != end; ++where ) { + const char* sourcePtr = damageSource + where->sourceOffset; + void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size); + std::memcpy(targetPtr, sourcePtr, where->size); + } + + return Status::OK(); + } + + void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) { + + Record* todelete = recordFor( dl ); + invariant( todelete->netLength() >= 4 ); // this is required for defensive code + + /* remove ourself from the record next/prev chain */ + { + if ( todelete->prevOfs() != DiskLoc::NullOfs ) { + DiskLoc prev = getPrevRecordInExtent( txn, dl ); + Record* prevRecord = recordFor( prev ); + txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs(); + } + + if ( todelete->nextOfs() != DiskLoc::NullOfs ) { + DiskLoc next = getNextRecord( txn, dl ); + Record* nextRecord = recordFor( next ); + txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs(); + } + } + + /* remove ourself from extent pointers */ + { + DiskLoc extentLoc = todelete->myExtentLoc(dl); + Extent *e = _getExtent( txn, extentLoc ); + if ( e->firstRecord == dl ) { + txn->recoveryUnit()->writing(&e->firstRecord); + if ( 
todelete->nextOfs() == DiskLoc::NullOfs ) + e->firstRecord.Null(); + else + e->firstRecord.set(dl.a(), todelete->nextOfs() ); + } + if ( e->lastRecord == dl ) { + txn->recoveryUnit()->writing(&e->lastRecord); + if ( todelete->prevOfs() == DiskLoc::NullOfs ) + e->lastRecord.Null(); + else + e->lastRecord.set(dl.a(), todelete->prevOfs() ); + } + } + + /* add to the free list */ + { + _details->incrementStats( txn, -1 * todelete->netLength(), -1 ); + + if ( _isSystemIndexes ) { + /* temp: if in system.indexes, don't reuse, and zero out: we want to be + careful until validated more, as IndexDetails has pointers + to this disk location. so an incorrectly done remove would cause + a lot of problems. + */ + memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ), + 0, todelete->lengthWithHeaders() ); + } + else { + // this is defensive so we can detect if we are still using a location + // that was deleted + memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4); + addDeletedRec(txn, dl); + } + } + + } + + RecordIterator* RecordStoreV1Base::getIteratorForRepair(OperationContext* txn) const { + return new RecordStoreV1RepairIterator(txn, this); + } + + void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn, + Record *r, + DiskLoc loc) { + dassert( recordFor(loc) == r ); + DiskLoc extentLoc = _getExtentLocForRecord( txn, loc ); + Extent *e = _getExtent( txn, extentLoc ); + if ( e->lastRecord.isNull() ) { + *txn->recoveryUnit()->writing(&e->firstRecord) = loc; + *txn->recoveryUnit()->writing(&e->lastRecord) = loc; + r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs; + } + else { + Record *oldlast = recordFor(e->lastRecord); + r->prevOfs() = e->lastRecord.getOfs(); + r->nextOfs() = DiskLoc::NullOfs; + txn->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs(); + *txn->recoveryUnit()->writing(&e->lastRecord) = loc; + } + } + + void RecordStoreV1Base::increaseStorageSize( OperationContext* txn, + int size, + bool 
enforceQuota ) { + DiskLoc eloc = _extentManager->allocateExtent( txn, + isCapped(), + size, + enforceQuota ); + Extent *e = _extentManager->getExtent( eloc ); + invariant( e ); + + *txn->recoveryUnit()->writing( &e->nsDiagnostic ) = _ns; + + txn->recoveryUnit()->writing( &e->xnext )->Null(); + txn->recoveryUnit()->writing( &e->xprev )->Null(); + txn->recoveryUnit()->writing( &e->firstRecord )->Null(); + txn->recoveryUnit()->writing( &e->lastRecord )->Null(); + + DiskLoc emptyLoc = _findFirstSpot( txn, eloc, e ); + + if ( _details->lastExtent(txn).isNull() ) { + invariant( _details->firstExtent(txn).isNull() ); + _details->setFirstExtent( txn, eloc ); + _details->setLastExtent( txn, eloc ); + _details->setCapExtent( txn, eloc ); + invariant( e->xprev.isNull() ); + invariant( e->xnext.isNull() ); + } + else { + invariant( !_details->firstExtent(txn).isNull() ); + *txn->recoveryUnit()->writing(&e->xprev) = _details->lastExtent(txn); + *txn->recoveryUnit()->writing(&_extentManager->getExtent(_details->lastExtent(txn))->xnext) = eloc; + _details->setLastExtent( txn, eloc ); + } + + _details->setLastExtentSize( txn, e->length ); + + addDeletedRec(txn, emptyLoc); + } + + Status RecordStoreV1Base::validate( OperationContext* txn, + bool full, bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const { + + // 1) basic status that require no iteration + // 2) extent level info + // 3) check extent start and end + // 4) check each non-deleted record + // 5) check deleted list + + // ------------- + + // 1111111111111111111 + if ( isCapped() ){ + output->appendBool("capped", true); + output->appendNumber("max", _details->maxCappedDocs()); + } + + output->appendNumber("datasize", _details->dataSize()); + output->appendNumber("nrecords", _details->numRecords()); + output->appendNumber("lastExtentSize", _details->lastExtentSize(txn)); + output->appendNumber("padding", _details->paddingFactor()); + + if ( 
_details->firstExtent(txn).isNull() ) + output->append( "firstExtent", "null" ); + else + output->append( "firstExtent", + str::stream() << _details->firstExtent(txn).toString() + << " ns:" + << _getExtent( txn, _details->firstExtent(txn) )->nsDiagnostic.toString()); + if ( _details->lastExtent(txn).isNull() ) + output->append( "lastExtent", "null" ); + else + output->append( "lastExtent", str::stream() << _details->lastExtent(txn).toString() + << " ns:" + << _getExtent( txn, _details->lastExtent(txn) )->nsDiagnostic.toString()); + + // 22222222222222222222222222 + { // validate extent basics + BSONArrayBuilder extentData; + int extentCount = 0; + DiskLoc extentDiskLoc; + try { + if ( !_details->firstExtent(txn).isNull() ) { + _getExtent( txn, _details->firstExtent(txn) )->assertOk(); + _getExtent( txn, _details->lastExtent(txn) )->assertOk(); + } + + extentDiskLoc = _details->firstExtent(txn); + while (!extentDiskLoc.isNull()) { + Extent* thisExtent = _getExtent( txn, extentDiskLoc ); + if (full) { + extentData << thisExtent->dump(); + } + if (!thisExtent->validates(extentDiskLoc, &results->errors)) { + results->valid = false; + } + DiskLoc nextDiskLoc = thisExtent->xnext; + + if (extentCount > 0 && !nextDiskLoc.isNull() + && _getExtent( txn, nextDiskLoc )->xprev != extentDiskLoc) { + StringBuilder sb; + sb << "'xprev' pointer " << _getExtent( txn, nextDiskLoc )->xprev.toString() + << " in extent " << nextDiskLoc.toString() + << " does not point to extent " << extentDiskLoc.toString(); + results->errors.push_back( sb.str() ); + results->valid = false; + } + if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) { + StringBuilder sb; + sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString() + << " does not point to last extent in list " << extentDiskLoc.toString(); + results->errors.push_back( sb.str() ); + results->valid = false; + } + extentDiskLoc = nextDiskLoc; + extentCount++; + txn->checkForInterrupt(); + } + } + catch (const 
DBException& e) { + StringBuilder sb; + sb << "exception validating extent " << extentCount + << ": " << e.what(); + results->errors.push_back( sb.str() ); + results->valid = false; + return Status::OK(); + } + output->append("extentCount", extentCount); + + if ( full ) + output->appendArray( "extents" , extentData.arr() ); + + } + + try { + // 333333333333333333333333333 + bool testingLastExtent = false; + try { + DiskLoc firstExtentLoc = _details->firstExtent(txn); + if (firstExtentLoc.isNull()) { + // this is ok + } + else { + output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump()); + if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) { + StringBuilder sb; + sb << "'xprev' pointer in 'firstExtent' " << _details->firstExtent(txn).toString() + << " is " << _getExtent(txn, firstExtentLoc)->xprev.toString() + << ", should be null"; + results->errors.push_back( sb.str() ); + results->valid = false; + } + } + testingLastExtent = true; + DiskLoc lastExtentLoc = _details->lastExtent(txn); + if (lastExtentLoc.isNull()) { + // this is ok + } + else { + if (firstExtentLoc != lastExtentLoc) { + output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump()); + if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) { + StringBuilder sb; + sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString() + << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString() + << ", should be null"; + results->errors.push_back( sb.str() ); + results->valid = false; + } + } + } + } + catch (const DBException& e) { + StringBuilder sb; + sb << "exception processing '" + << (testingLastExtent ? 
"lastExtent" : "firstExtent") + << "': " << e.what(); + results->errors.push_back( sb.str() ); + results->valid = false; + } + + // 4444444444444444444444444 + + set<DiskLoc> recs; + if( scanData ) { + int n = 0; + int nInvalid = 0; + long long nQuantizedSize = 0; + long long nPowerOf2QuantizedSize = 0; + long long len = 0; + long long nlen = 0; + long long bsonLen = 0; + int outOfOrder = 0; + DiskLoc cl_last; + + scoped_ptr<RecordIterator> iterator( getIterator( txn, + DiskLoc(), + false, + CollectionScanParams::FORWARD ) ); + DiskLoc cl; + while ( !( cl = iterator->getNext() ).isNull() ) { + n++; + + if ( n < 1000000 ) + recs.insert(cl); + if ( isCapped() ) { + if ( cl < cl_last ) + outOfOrder++; + cl_last = cl; + } + + Record *r = recordFor(cl); + len += r->lengthWithHeaders(); + nlen += r->netLength(); + + if ( r->lengthWithHeaders() == + quantizeAllocationSpace( r->lengthWithHeaders() ) ) { + // Count the number of records having a size consistent with + // the quantizeAllocationSpace quantization implementation. + ++nQuantizedSize; + } + + if ( r->lengthWithHeaders() == + quantizePowerOf2AllocationSpace( r->lengthWithHeaders() ) ) { + // Count the number of records having a size consistent with the + // quantizePowerOf2AllocationSpace quantization implementation. 
+ ++nPowerOf2QuantizedSize; + } + + if (full){ + size_t dataSize = 0; + const Status status = adaptor->validate( r->toRecordData(), &dataSize ); + if (!status.isOK()) { + results->valid = false; + if (nInvalid == 0) // only log once; + results->errors.push_back( "invalid object detected (see logs)" ); + + nInvalid++; + log() << "Invalid object detected in " << _ns + << ": " << status.reason(); + } + else { + bsonLen += dataSize; + } + } + } + + if ( isCapped() && !_details->capLooped() ) { + output->append("cappedOutOfOrder", outOfOrder); + if ( outOfOrder > 1 ) { + results->valid = false; + results->errors.push_back( "too many out of order records" ); + } + } + output->append("objectsFound", n); + + if (full) { + output->append("invalidObjects", nInvalid); + } + + output->appendNumber("nQuantizedSize", nQuantizedSize); + output->appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize); + output->appendNumber("bytesWithHeaders", len); + output->appendNumber("bytesWithoutHeaders", nlen); + + if (full) { + output->appendNumber("bytesBson", bsonLen); + } + } // end scanData + + // 55555555555555555555555555 + BSONArrayBuilder deletedListArray; + for ( int i = 0; i < Buckets; i++ ) { + deletedListArray << _details->deletedListEntry(i).isNull(); + } + + int ndel = 0; + long long delSize = 0; + BSONArrayBuilder delBucketSizes; + int incorrect = 0; + for ( int i = 0; i < Buckets; i++ ) { + DiskLoc loc = _details->deletedListEntry(i); + try { + int k = 0; + while ( !loc.isNull() ) { + if ( recs.count(loc) ) + incorrect++; + ndel++; + + if ( loc.questionable() ) { + if( isCapped() && !loc.isValid() && i == 1 ) { + /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid + see comments in namespace.h + */ + break; + } + + string err( str::stream() << "bad pointer in deleted record list: " + << loc.toString() + << " bucket: " << i + << " k: " << k ); + results->errors.push_back( err ); + results->valid = false; + break; + } + + const 
DeletedRecord* d = deletedRecordFor(loc); + delSize += d->lengthWithHeaders(); + loc = d->nextDeleted(); + k++; + txn->checkForInterrupt(); + } + delBucketSizes << k; + } + catch (...) { + results->errors.push_back( (string)"exception in deleted chain for bucket " + + BSONObjBuilder::numStr(i) ); + results->valid = false; + } + } + output->appendNumber("deletedCount", ndel); + output->appendNumber("deletedSize", delSize); + if ( full ) { + output->append( "delBucketSizes", delBucketSizes.arr() ); + } + + if ( incorrect ) { + results->errors.push_back( BSONObjBuilder::numStr(incorrect) + + " records from datafile are in deleted list" ); + results->valid = false; + } + + } + catch (AssertionException) { + results->errors.push_back( "exception during validate" ); + results->valid = false; + } + + return Status::OK(); + } + + void RecordStoreV1Base::appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const { + result->append( "lastExtentSize", _details->lastExtentSize(txn) / scale ); + result->append( "paddingFactor", _details->paddingFactor() ); + result->append( "userFlags", _details->userFlags() ); + + if ( isCapped() ) { + result->appendBool( "capped", true ); + result->appendNumber( "max", _details->maxCappedDocs() ); + } + } + + + namespace { + struct touch_location { + const char* root; + size_t length; + }; + } + + Status RecordStoreV1Base::touch( OperationContext* txn, BSONObjBuilder* output ) const { + Timer t; + + std::vector<touch_location> ranges; + { + DiskLoc nextLoc = _details->firstExtent(txn); + Extent* ext = _getExtent( txn, nextLoc ); + while ( ext ) { + touch_location tl; + tl.root = reinterpret_cast<const char*>(ext); + tl.length = ext->length; + ranges.push_back(tl); + + nextLoc = ext->xnext; + if ( nextLoc.isNull() ) + ext = NULL; + else + ext = _getExtent( txn, nextLoc ); + } + } + + std::string progress_msg = "touch " + std::string(txn->getNS()) + " extents"; + ProgressMeterHolder 
        pm(*txn->setMessage(progress_msg.c_str(),
                            "Touch Progress",
                            ranges.size()));

        // Pre-fault every extent into memory, one progress tick per extent; honor
        // interrupts between extents so a long touch can be killed.
        for ( std::vector<touch_location>::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
            touch_pages( it->root, it->length );
            pm.hit();
            txn->checkForInterrupt();
        }
        pm.finished();

        if ( output ) {
            output->append( "numRanges", static_cast<int>( ranges.size() ) );
            output->append( "millis", t.millis() );
        }

        return Status::OK();
    }

    /**
     * Returns the on-disk allocation size for a record needing at least
     * 'minRecordSize' bytes, after applying this store's padding/quantization policy.
     * The result is always >= minRecordSize.
     */
    int RecordStoreV1Base::getRecordAllocationSize( int minRecordSize ) const {

        // Capped collections never pad: space is reclaimed by cycling old documents,
        // not by growing documents in place.
        if ( isCapped() )
            return minRecordSize;

        invariant( _details->paddingFactor() >= 1 );

        if ( _details->isUserFlagSet( Flag_UsePowerOf2Sizes ) ) {
            // quantize to the nearest bucketSize (or nearest 1mb boundary for large sizes).
            return quantizePowerOf2AllocationSpace(minRecordSize);
        }

        // adjust for padding factor
        return static_cast<int>(minRecordSize * _details->paddingFactor());
    }

    /**
     * Advance within the current extent by following the record's next/prev offset
     * (direction chosen by _forward). Returns the pre-advance position; a null
     * DiskLoc signals EOF.
     */
    DiskLoc RecordStoreV1Base::IntraExtentIterator::getNext() {
        if (_curr.isNull())
            return DiskLoc();

        const DiskLoc out = _curr; // we always return where we were, not where we will be.
        const Record* rec = recordFor(_curr);
        const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs();
        _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs));
        return out;
    }

    // If the record being invalidated is the one we are positioned on, step past it
    // so the iterator never dereferences a deleted location.
    void RecordStoreV1Base::IntraExtentIterator::invalidate(const DiskLoc& dl) {
        if (dl == _curr) {
            getNext();
        }
    }

    /* @return the size for an allocated record quantized to 1/16th of the BucketSize
       @param allocSize requested size to allocate
    */
    int RecordStoreV1Base::quantizeAllocationSpace(int allocSize) {
        const int bucketIdx = bucket(allocSize);
        int bucketSize = bucketSizes[bucketIdx];
        int quantizeUnit = bucketSize / 16;
        if (allocSize >= (1 << 22)) // 4mb
            // all allocations >= 4mb result in 4mb/16 quantization units, even if >= 8mb.
            // idea is to reduce quantization overhead of large records at the cost of
            // increasing the DeletedRecord size distribution in the largest bucket by
            // factor of 4.
            quantizeUnit = (1 << 18); // 256k
        if (allocSize % quantizeUnit == 0)
            // size is already quantized
            return allocSize;
        // Round up to the next multiple of quantizeUnit (quantizeUnit is a power of 2,
        // so OR-ing in the low bits and adding 1 rounds up).
        const int quantizedSpace = (allocSize | (quantizeUnit - 1)) + 1;
        fassert(16484, quantizedSpace >= allocSize);
        return quantizedSpace;
    }

    /**
     * Quantize 'allocSize' up to the smallest bucket size that can hold it.
     * Requests larger than the biggest non-16MB bucket are rounded up to the next
     * 1MB boundary instead.
     */
    int RecordStoreV1Base::quantizePowerOf2AllocationSpace(int allocSize) {
        for ( int i = 0; i < MaxBucket; i++ ) { // skips the largest (16MB) bucket
            if ( bucketSizes[i] >= allocSize ) {
                // Return the size of the first bucket sized >= the requested size.
                return bucketSizes[i];
            }
        }

        // if we get here, it means we're allocating more than 4mb, so round up
        // to the nearest megabyte >= allocSize
        const int MB = 1024*1024;
        invariant(allocSize > 4*MB);
        return (allocSize + (MB - 1)) & ~(MB - 1); // round up to MB alignment
    }

    /* return which "deleted bucket" for this size object */
    int RecordStoreV1Base::bucket(int size) {
        for ( int i = 0; i < Buckets; i++ ) {
            if ( bucketSizes[i] > size ) {
                // Return the first bucket sized _larger_ than the requested size.
                return i;
            }
        }
        return MaxBucket;
    }

    // Called after an update fit in place: slowly decay the padding factor toward 1.0.
    void RecordStoreV1Base::_paddingFits( OperationContext* txn ) {
        MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
            double x = max(1.0, _details->paddingFactor() - 0.001 );
            _details->setPaddingFactor( txn, x );
        }
    }

    // Called when an update forced a document move: bump the padding factor (capped
    // at 2.0) so future allocations leave more slack.
    void RecordStoreV1Base::_paddingTooSmall( OperationContext* txn ) {
        MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
            /* the more indexes we have, the higher the cost of a move. so we take that into
               account herein. note on a move that insert() calls paddingFits(), thus
               here for example with no inserts and nIndexes = 1 we have
               .001*4-.001 or a 3:1 ratio to non moves -> 75% nonmoves. insert heavy
               can push this down considerably.
               further tweaking will be a good idea but
               this should be an adequate starting point.
            */
            double N = 4; // magic
            double x = min(2.0,_details->paddingFactor() + (0.001 * N));
            _details->setPaddingFactor( txn, x );
        }
    }

    /**
     * Handles collMod-style custom options. Currently only "usePowerOf2Sizes" is
     * recognized; when the flag actually changes, the old and new values are
     * recorded in 'info'. Unknown options return InvalidOptions.
     */
    Status RecordStoreV1Base::setCustomOption( OperationContext* txn,
                                               const BSONElement& option,
                                               BSONObjBuilder* info ) {
        if ( str::equals( "usePowerOf2Sizes", option.fieldName() ) ) {
            bool oldPowerOf2 = _details->isUserFlagSet( Flag_UsePowerOf2Sizes );
            bool newPowerOf2 = option.trueValue();

            if ( oldPowerOf2 != newPowerOf2 ) {
                // change userFlags
                info->appendBool( "usePowerOf2Sizes_old", oldPowerOf2 );

                if ( newPowerOf2 )
                    _details->setUserFlag( txn, Flag_UsePowerOf2Sizes );
                else
                    _details->clearUserFlag( txn, Flag_UsePowerOf2Sizes );

                info->appendBool( "usePowerOf2Sizes_new", newPowerOf2 );
            }

            return Status::OK();
        }

        return Status( ErrorCodes::InvalidOptions,
                       str::stream() << "no such option: " << option.fieldName() );
    }
}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
new file mode 100644
index 00000000000..72466c2b645
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
@@ -0,0 +1,303 @@
+// record_store_v1_base.h
+
+/**
+* Copyright (C) 2013-2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class DeletedRecord; + class DocWriter; + class ExtentManager; + class Record; + class OperationContext; + + struct Extent; + + class RecordStoreV1MetaData { + public: + virtual ~RecordStoreV1MetaData(){} + + virtual const DiskLoc& capExtent() const = 0; + virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ) = 0; + + virtual const DiskLoc& capFirstNewRecord() const = 0; + virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ) = 0; + + bool capLooped() const { return capFirstNewRecord().isValid(); } + + virtual long long dataSize() const = 0; + virtual long long numRecords() const = 0; + + virtual void incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) = 0; + + virtual void setStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) = 0; + + virtual const DiskLoc& deletedListEntry( int bucket ) const = 0; + virtual void setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ) = 0; + virtual void 
orphanDeletedList(OperationContext* txn) = 0; + + virtual const DiskLoc& firstExtent( OperationContext* txn ) const = 0; + virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ) = 0; + + virtual const DiskLoc& lastExtent( OperationContext* txn ) const = 0; + virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ) = 0; + + virtual bool isCapped() const = 0; + + virtual bool isUserFlagSet( int flag ) const = 0; + virtual int userFlags() const = 0; + virtual bool setUserFlag( OperationContext* txn, int flag ) = 0; + virtual bool clearUserFlag( OperationContext* txn, int flag ) = 0; + virtual bool replaceUserFlags( OperationContext* txn, int flags ) = 0; + + virtual int lastExtentSize( OperationContext* txn) const = 0; + virtual void setLastExtentSize( OperationContext* txn, int newMax ) = 0; + + virtual long long maxCappedDocs() const = 0; + + virtual double paddingFactor() const = 0; + + virtual void setPaddingFactor( OperationContext* txn, double paddingFactor ) = 0; + + }; + + class RecordStoreV1Base : public RecordStore { + public: + + static const int Buckets; + static const int MaxBucket; + + static const int bucketSizes[]; + + enum UserFlags { + Flag_UsePowerOf2Sizes = 1 << 0 + }; + + // ------------ + + class IntraExtentIterator; + + /** + * @param details - takes ownership + * @param em - does NOT take ownership + */ + RecordStoreV1Base( const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ); + + virtual ~RecordStoreV1Base(); + + virtual long long dataSize() const { return _details->dataSize(); } + virtual long long numRecords() const { return _details->numRecords(); } + + virtual int64_t storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo = NULL, + int level = 0 ) const; + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + void deleteRecord( OperationContext* txn, + const DiskLoc& dl ); + + StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const 
char* data, + int len, + bool enforceQuota ); + + StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ); + + virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ); + + virtual Status updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ); + + virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const; + + void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota ); + + virtual Status validate( OperationContext* txn, + bool full, bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const; + + virtual void appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const; + + virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const; + + const RecordStoreV1MetaData* details() const { return _details.get(); } + + /** + * @return the actual size to create + * will be >= oldRecordSize + * based on padding and any other flags + */ + int getRecordAllocationSize( int minRecordSize ) const; + + DiskLoc getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc getNextRecord( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + + /* @return the size for an allocated record quantized to 1/16th of the BucketSize. + @param allocSize requested size to allocate + The returned size will be greater than or equal to 'allocSize'. 
+ */ + static int quantizeAllocationSpace(int allocSize); + + /** + * Quantize 'allocSize' to the nearest bucketSize (or nearest 1mb boundary for large sizes). + */ + static int quantizePowerOf2AllocationSpace(int allocSize); + + /* return which "deleted bucket" for this size object */ + static int bucket(int size); + + virtual Status setCustomOption( OperationContext* txn, + const BSONElement& option, + BSONObjBuilder* info = NULL ); + protected: + + virtual Record* recordFor( const DiskLoc& loc ) const; + + const DeletedRecord* deletedRecordFor( const DiskLoc& loc ) const; + + virtual bool isCapped() const = 0; + + virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ) = 0; + + // TODO: document, remove, what have you + virtual void addDeletedRec( OperationContext* txn, const DiskLoc& dloc) = 0; + + // TODO: another sad one + virtual DeletedRecord* drec( const DiskLoc& loc ) const; + + // just a wrapper for _extentManager->getExtent( loc ); + Extent* _getExtent( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc _getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc _getNextRecord( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc _getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc _getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc _getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + + /** + * finds the first suitable DiskLoc for data + * will return the DiskLoc of a newly created DeletedRecord + */ + DiskLoc _findFirstSpot( OperationContext* txn, const DiskLoc& extDiskLoc, Extent* e ); + + /** add a record to the end of the linked list chain within this extent. + require: you must have already declared write intent for the record header. 
+ */ + void _addRecordToRecListInExtent(OperationContext* txn, Record* r, DiskLoc loc); + + void _paddingTooSmall( OperationContext* txn ); + void _paddingFits( OperationContext* txn ); + + /** + * internal + * doesn't check inputs or change padding + */ + StatusWith<DiskLoc> _insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ); + + scoped_ptr<RecordStoreV1MetaData> _details; + ExtentManager* _extentManager; + bool _isSystemIndexes; + + friend class RecordStoreV1RepairIterator; + }; + + /** + * Iterates over all records within a single extent. + * + * EOF at end of extent, even if there are more extents. + */ + class RecordStoreV1Base::IntraExtentIterator : public RecordIterator { + public: + IntraExtentIterator(OperationContext* txn, + DiskLoc start, + const RecordStoreV1Base* rs, + bool forward = true) + : _txn(txn), _curr(start), _rs(rs), _forward(forward) {} + + virtual bool isEOF() { return _curr.isNull(); } + + virtual DiskLoc curr() { return _curr; } + + virtual DiskLoc getNext( ); + + virtual void invalidate(const DiskLoc& dl); + + virtual void prepareToYield() {} + + virtual bool recoverFromYield() { return true; } + + virtual RecordData dataFor( const DiskLoc& loc ) const { return _rs->dataFor(loc); } + + private: + virtual const Record* recordFor( const DiskLoc& loc ) const { return _rs->recordFor(loc); } + OperationContext* _txn; + DiskLoc _curr; + const RecordStoreV1Base* _rs; + bool _forward; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp new file mode 100644 index 00000000000..c8524c76e22 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp @@ -0,0 +1,717 @@ +// record_store_v1_capped.cpp + +/** + * Copyright (C) 2013 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" + +#include "mongo/db/operation_context_impl.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" +#include "mongo/util/mmap.h" +#include "mongo/util/mongoutils/str.h" + +/* + capped collection layout + + d's below won't exist if things align perfectly: + + extent1 -> extent2 -> extent3 + ------------------- ----------------------- --------------------- + d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d + ^ ^ + oldest newest + + ^cappedFirstDeletedInCurExtent() + ^cappedLastDelRecLastExtent() + ^cappedListOfAllDeletedRecords() +*/ + +#define DDD(x) + +namespace mongo { + + CappedRecordStoreV1::CappedRecordStoreV1( OperationContext* txn, + CappedDocumentDeleteCallback* collection, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ) + : RecordStoreV1Base( ns, details, em, isSystemIndexes ), + _deleteCallback( collection ) { + + DiskLoc extentLoc = details->firstExtent(txn); + while ( !extentLoc.isNull() ) { + _extentAdvice.push_back( _extentManager->cacheHint( extentLoc, + ExtentManager::Sequential ) ); + Extent* extent = em->getExtent( extentLoc ); + extentLoc = extent->xnext; + } + + // this is for VERY VERY old versions of capped collections + cappedCheckMigrate(txn); + } + + CappedRecordStoreV1::~CappedRecordStoreV1() { + } + + StatusWith<DiskLoc> CappedRecordStoreV1::allocRecord( OperationContext* txn, + int lenToAlloc, + bool enforceQuota ) { + { + // align very slightly. 
            lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; // round up to a 4-byte boundary
        }

        if ( lenToAlloc > theCapExtent()->length ) {
            // the extent check is a way to try and improve performance
            // since we have to iterate all the extents (for now) to get
            // storage size
            if ( lenToAlloc > storageSize(txn) ) {
                return StatusWith<DiskLoc>( ErrorCodes::BadValue,
                                            mongoutils::str::stream()
                                            << "document is larger than capped size "
                                            << lenToAlloc << " > " << storageSize(txn),
                                            16328 );
            }

        }
        DiskLoc loc;
        { // do allocation

            // signal done allocating new extents.
            if ( !cappedLastDelRecLastExtent().isValid() )
                setLastDelRecLastExtent( txn, DiskLoc() );

            invariant( lenToAlloc < 400000000 );
            int passes = 0;
            // Bound the deletion loop below so a bug cannot spin forever.
            int maxPasses = ( lenToAlloc / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
            if ( maxPasses < 5000 ) {
                // this is for backwards safety since 5000 was the old value
                maxPasses = 5000;
            }

            // delete records until we have room and the max # objects limit achieved.

            /* this fails on a rename -- that is ok but must keep commented out */
            //invariant( theCapExtent()->ns == ns );

            theCapExtent()->assertOk();
            DiskLoc firstEmptyExtent;
            while ( 1 ) {
                // Only try to allocate while under the document-count cap.
                if ( _details->numRecords() < _details->maxCappedDocs() ) {
                    loc = __capAlloc( txn, lenToAlloc );
                    if ( !loc.isNull() )
                        break;
                }

                // If on first iteration through extents, don't delete anything.
                if ( !_details->capFirstNewRecord().isValid() ) {
                    advanceCapExtent( txn, _ns );

                    if ( _details->capExtent() != _details->firstExtent(txn) )
                        _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
                    // else signal done with first iteration through extents.
                    continue;
                }

                if ( !_details->capFirstNewRecord().isNull() &&
                     theCapExtent()->firstRecord == _details->capFirstNewRecord() ) {
                    // We've deleted all records that were allocated on the previous
                    // iteration through this extent.
+ advanceCapExtent( txn, _ns ); + continue; + } + + if ( theCapExtent()->firstRecord.isNull() ) { + if ( firstEmptyExtent.isNull() ) + firstEmptyExtent = _details->capExtent(); + advanceCapExtent( txn, _ns ); + if ( firstEmptyExtent == _details->capExtent() ) { + _maybeComplain( txn, lenToAlloc ); + return StatusWith<DiskLoc>( ErrorCodes::InternalError, + "no space in capped collection" ); + } + continue; + } + + DiskLoc fr = theCapExtent()->firstRecord; + Status status = _deleteCallback->aboutToDeleteCapped( txn, fr ); + if ( !status.isOK() ) + return StatusWith<DiskLoc>( status ); + deleteRecord( txn, fr ); + + compact(txn); + if( ++passes > maxPasses ) { + StringBuilder sb; + sb << "passes >= maxPasses in CappedRecordStoreV1::cappedAlloc: ns: " << _ns + << ", lenToAlloc: " << lenToAlloc + << ", maxPasses: " << maxPasses + << ", _maxDocsInCapped: " << _details->maxCappedDocs() + << ", nrecords: " << _details->numRecords() + << ", datasize: " << _details->dataSize(); + + return StatusWith<DiskLoc>( ErrorCodes::InternalError, sb.str() ); + } + } + + // Remember first record allocated on this iteration through capExtent. + if ( _details->capFirstNewRecord().isValid() && _details->capFirstNewRecord().isNull() ) + _details->setCapFirstNewRecord( txn, loc ); + } + + invariant( !loc.isNull() ); + + // possibly slice up if we've allocated too much space + + DeletedRecord *r = drec( loc ); + + /* note we want to grab from the front so our next pointers on disk tend + to go in a forward direction which is important for performance. */ + int regionlen = r->lengthWithHeaders(); + invariant( r->extentOfs() < loc.getOfs() ); + + int left = regionlen - lenToAlloc; + + /* split off some for further use. 
*/ + txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; + DiskLoc newDelLoc = loc; + newDelLoc.inc(lenToAlloc); + DeletedRecord* newDel = drec( newDelLoc ); + DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel); + newDelW->extentOfs() = r->extentOfs(); + newDelW->lengthWithHeaders() = left; + newDelW->nextDeleted().Null(); + + addDeletedRec(txn, newDelLoc); + + return StatusWith<DiskLoc>( loc ); + } + + Status CappedRecordStoreV1::truncate(OperationContext* txn) { + setLastDelRecLastExtent( txn, DiskLoc() ); + setListOfAllDeletedRecords( txn, DiskLoc() ); + + // preserve firstExtent/lastExtent + _details->setCapExtent( txn, _details->firstExtent(txn) ); + _details->setStats( txn, 0, 0 ); + // preserve lastExtentSize + // nIndexes preserve 0 + // capped preserve true + // max preserve + _details->setPaddingFactor( txn, 1.0 ); + _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() ); + setLastDelRecLastExtent( txn, DiskLoc().setInvalid() ); + // dataFileVersion preserve + // indexFileVersion preserve + + // Reset all existing extents and recreate the deleted list. + Extent* ext; + for( DiskLoc extLoc = _details->firstExtent(txn); + !extLoc.isNull(); + extLoc = ext->xnext ) { + ext = _extentManager->getExtent(extLoc); + + txn->recoveryUnit()->writing( &ext->firstRecord )->Null(); + txn->recoveryUnit()->writing( &ext->lastRecord )->Null(); + + addDeletedRec( txn, _findFirstSpot( txn, extLoc, ext ) ); + } + + return Status::OK(); + } + + void CappedRecordStoreV1::temp_cappedTruncateAfter( OperationContext* txn, + DiskLoc end, + bool inclusive ) { + cappedTruncateAfter( txn, _ns.c_str(), end, inclusive ); + } + + /* combine adjacent deleted records *for the current extent* of the capped collection + + this is O(n^2) but we call it for capped tables where typically n==1 or 2! + (or 3...there will be a little unused sliver at the end of the extent.) 
     */
    // Coalesce adjacent deleted records *in the current cap extent only*. Quadratic,
    // but for capped collections the per-extent deleted-record count is tiny (1-3).
    void CappedRecordStoreV1::compact(OperationContext* txn) {
        DDD( "CappedRecordStoreV1::compact enter" );

        vector<DiskLoc> drecs;

        // Pull out capExtent's DRs from deletedList
        DiskLoc i = cappedFirstDeletedInCurExtent();
        for (; !i.isNull() && inCapExtent( i ); i = deletedRecordFor( i )->nextDeleted() ) {
            DDD( "\t" << i );
            drecs.push_back( i );
        }

        // 'i' is now the first deleted record outside the cap extent (or null);
        // relink the list head past the records we pulled out.
        setFirstDeletedInCurExtent( txn, i );

        // Sort by disk location so physically adjacent records become neighbors.
        std::sort( drecs.begin(), drecs.end() );
        DDD( "\t drecs.size(): " << drecs.size() );

        vector<DiskLoc>::const_iterator j = drecs.begin();
        invariant( j != drecs.end() );
        DiskLoc a = *j;
        while ( 1 ) {
            j++;
            if ( j == drecs.end() ) {
                DDD( "\t compact adddelrec" );
                addDeletedRec(txn, a);
                break;
            }
            DiskLoc b = *j;
            // Merge every run of physically contiguous records (same file, and b
            // starts exactly where a ends) into a single larger deleted record.
            while ( a.a() == b.a() &&
                    a.getOfs() + drec( a )->lengthWithHeaders() == b.getOfs() ) {

                // a & b are adjacent. merge.
                txn->recoveryUnit()->writingInt( drec(a)->lengthWithHeaders() ) += drec(b)->lengthWithHeaders();
                j++;
                if ( j == drecs.end() ) {
                    DDD( "\t compact adddelrec2" );
                    addDeletedRec(txn, a);
                    return;
                }
                b = *j;
            }
            DDD( "\t compact adddelrec3" );
            addDeletedRec(txn, a);
            a = b;
        }

    }

    // First deleted record belonging to the current cap extent: the head of the
    // all-deleted-records list, or the record just past the last-extent marker.
    const DiskLoc &CappedRecordStoreV1::cappedFirstDeletedInCurExtent() const {
        if ( cappedLastDelRecLastExtent().isNull() )
            return cappedListOfAllDeletedRecords();
        else
            return drec(cappedLastDelRecLastExtent())->nextDeleted();
    }

    // Counterpart setter: store 'loc' either as the global list head or as the
    // successor of the last-extent marker, mirroring the getter above.
    void CappedRecordStoreV1::setFirstDeletedInCurExtent( OperationContext* txn,
                                                          const DiskLoc& loc ) {
        if ( cappedLastDelRecLastExtent().isNull() )
            setListOfAllDeletedRecords( txn, loc );
        else
            *txn->recoveryUnit()->writing( &drec(cappedLastDelRecLastExtent())->nextDeleted() ) = loc;
    }

    // One-time migration for VERY old capped-collection metadata layouts (detected
    // by an all-zero capExtent).
    void CappedRecordStoreV1::cappedCheckMigrate(OperationContext* txn) {
        // migrate old RecordStoreV1MetaData format
        if ( _details->capExtent().a() == 0 && _details->capExtent().getOfs() == 0 ) {
            _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
            // put all the
DeletedRecords in cappedListOfAllDeletedRecords() + for ( int i = 1; i < Buckets; ++i ) { + DiskLoc first = _details->deletedListEntry( i ); + if ( first.isNull() ) + continue; + DiskLoc last = first; + for (; !drec(last)->nextDeleted().isNull(); last = drec(last)->nextDeleted() ); + *txn->recoveryUnit()->writing(&drec(last)->nextDeleted()) = cappedListOfAllDeletedRecords(); + setListOfAllDeletedRecords( txn, first ); + _details->setDeletedListEntry(txn, i, DiskLoc()); + } + // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above + + // Last, in case we're killed before getting here + _details->setCapExtent( txn, _details->firstExtent(txn) ); + } + } + + bool CappedRecordStoreV1::inCapExtent( const DiskLoc &dl ) const { + invariant( !dl.isNull() ); + + if ( dl.a() != _details->capExtent().a() ) + return false; + + if ( dl.getOfs() < _details->capExtent().getOfs() ) + return false; + + const Extent* e = theCapExtent(); + int end = _details->capExtent().getOfs() + e->length; + return dl.getOfs() <= end; + } + + bool CappedRecordStoreV1::nextIsInCapExtent( const DiskLoc &dl ) const { + invariant( !dl.isNull() ); + DiskLoc next = drec(dl)->nextDeleted(); + if ( next.isNull() ) + return false; + return inCapExtent( next ); + } + + void CappedRecordStoreV1::advanceCapExtent( OperationContext* txn, const StringData& ns ) { + // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent + // (or DiskLoc() if new capExtent == firstExtent) + if ( _details->capExtent() == _details->lastExtent(txn) ) + setLastDelRecLastExtent( txn, DiskLoc() ); + else { + DiskLoc i = cappedFirstDeletedInCurExtent(); + for (; !i.isNull() && nextIsInCapExtent( i ); i = drec(i)->nextDeleted() ); + setLastDelRecLastExtent( txn, i ); + } + + _details->setCapExtent( txn, + theCapExtent()->xnext.isNull() ? 
_details->firstExtent(txn) + : theCapExtent()->xnext ); + + /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */ + //dassert( theCapExtent()->ns == ns ); + + theCapExtent()->assertOk(); + _details->setCapFirstNewRecord( txn, DiskLoc() ); + } + + DiskLoc CappedRecordStoreV1::__capAlloc( OperationContext* txn, int len ) { + DiskLoc prev = cappedLastDelRecLastExtent(); + DiskLoc i = cappedFirstDeletedInCurExtent(); + DiskLoc ret; + for (; !i.isNull() && inCapExtent( i ); prev = i, i = drec(i)->nextDeleted() ) { + // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(), + // so make sure there's space to create a DR at the end. + if ( drec(i)->lengthWithHeaders() >= len + 24 ) { + ret = i; + break; + } + } + + /* unlink ourself from the deleted list */ + if ( !ret.isNull() ) { + if ( prev.isNull() ) + setListOfAllDeletedRecords( txn, drec(ret)->nextDeleted() ); + else + *txn->recoveryUnit()->writing(&drec(prev)->nextDeleted()) = drec(ret)->nextDeleted(); + *txn->recoveryUnit()->writing(&drec(ret)->nextDeleted()) = DiskLoc().setInvalid(); // defensive. + invariant( drec(ret)->extentOfs() < ret.getOfs() ); + } + + return ret; + } + + void CappedRecordStoreV1::cappedTruncateLastDelUpdate(OperationContext* txn) { + if ( _details->capExtent() == _details->firstExtent(txn) ) { + // Only one extent of the collection is in use, so there + // is no deleted record in a previous extent, so nullify + // cappedLastDelRecLastExtent(). + setLastDelRecLastExtent( txn, DiskLoc() ); + } + else { + // Scan through all deleted records in the collection + // until the last deleted record for the extent prior + // to the new capExtent is found. Then set + // cappedLastDelRecLastExtent() to that deleted record. 
+ DiskLoc i = cappedListOfAllDeletedRecords(); + for( ; + !drec(i)->nextDeleted().isNull() && + !inCapExtent( drec(i)->nextDeleted() ); + i = drec(i)->nextDeleted() ); + // In our capped storage model, every extent must have at least one + // deleted record. Here we check that 'i' is not the last deleted + // record. (We expect that there will be deleted records in the new + // capExtent as well.) + invariant( !drec(i)->nextDeleted().isNull() ); + setLastDelRecLastExtent( txn, i ); + } + } + + void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* txn, + const char* ns, + DiskLoc end, + bool inclusive) { + invariant( cappedLastDelRecLastExtent().isValid() ); + + // We iteratively remove the newest document until the newest document + // is 'end', then we remove 'end' if requested. + bool foundLast = false; + while( 1 ) { + if ( foundLast ) { + // 'end' has been found and removed, so break. + break; + } + txn->recoveryUnit()->commitIfNeeded(); + // 'curr' will point to the newest document in the collection. + DiskLoc curr = theCapExtent()->lastRecord; + invariant( !curr.isNull() ); + if ( curr == end ) { + if ( inclusive ) { + // 'end' has been found, so break next iteration. + foundLast = true; + } + else { + // 'end' has been found, so break. + break; + } + } + + // TODO The algorithm used in this function cannot generate an + // empty collection, but we could call emptyCappedCollection() in + // this case instead of asserting. + uassert( 13415, "emptying the collection is not allowed", _details->numRecords() > 1 ); + + // Delete the newest record, and coalesce the new deleted + // record with existing deleted records. 
+ Status status = _deleteCallback->aboutToDeleteCapped( txn, curr ); + uassertStatusOK( status ); + deleteRecord( txn, curr ); + compact(txn); + + // This is the case where we have not yet had to remove any + // documents to make room for other documents, and we are allocating + // documents from free space in fresh extents instead of reusing + // space from familiar extents. + if ( !_details->capLooped() ) { + + // We just removed the last record from the 'capExtent', and + // the 'capExtent' can't be empty, so we set 'capExtent' to + // capExtent's prev extent. + if ( theCapExtent()->lastRecord.isNull() ) { + invariant( !theCapExtent()->xprev.isNull() ); + // NOTE Because we didn't delete the last document, and + // capLooped() is false, capExtent is not the first extent + // so xprev will be nonnull. + _details->setCapExtent( txn, theCapExtent()->xprev ); + theCapExtent()->assertOk(); + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(txn); + } + continue; + } + + // This is the case where capLooped() is true, and we just deleted + // from capExtent, and we just deleted capFirstNewRecord, which was + // the last record on the fresh side of capExtent. + // NOTE In this comparison, curr and potentially capFirstNewRecord + // may point to invalid data, but we can still compare the + // references themselves. + if ( curr == _details->capFirstNewRecord() ) { + + // Set 'capExtent' to the first nonempty extent prior to the + // initial capExtent. There must be such an extent because we + // have not deleted the last document in the collection. It is + // possible that all extents other than the capExtent are empty. + // In this case we will keep the initial capExtent and specify + // that all records contained within are on the fresh rather than + // stale side of the extent. + DiskLoc newCapExtent = _details->capExtent(); + do { + // Find the previous extent, looping if necessary. + newCapExtent = ( newCapExtent == _details->firstExtent(txn) ) ? 
+ _details->lastExtent(txn) : + _extentManager->getExtent(newCapExtent)->xprev; + _extentManager->getExtent(newCapExtent)->assertOk(); + } + while ( _extentManager->getExtent(newCapExtent)->firstRecord.isNull() ); + _details->setCapExtent( txn, newCapExtent ); + + // Place all documents in the new capExtent on the fresh side + // of the capExtent by setting capFirstNewRecord to the first + // document in the new capExtent. + _details->setCapFirstNewRecord( txn, theCapExtent()->firstRecord ); + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(txn); + } + } + } + + const DiskLoc& CappedRecordStoreV1::cappedListOfAllDeletedRecords() const { + return _details->deletedListEntry(0); + } + + void CappedRecordStoreV1::setListOfAllDeletedRecords( OperationContext* txn, + const DiskLoc& loc ) { + return _details->setDeletedListEntry(txn, 0, loc); + } + + const DiskLoc& CappedRecordStoreV1::cappedLastDelRecLastExtent() const { + return _details->deletedListEntry(1); + } + + void CappedRecordStoreV1::setLastDelRecLastExtent( OperationContext* txn, + const DiskLoc& loc ) { + return _details->setDeletedListEntry(txn, 1, loc); + } + + Extent* CappedRecordStoreV1::theCapExtent() const { + return _extentManager->getExtent(_details->capExtent()); + } + + void CappedRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) { + DeletedRecord* d = txn->recoveryUnit()->writing( drec( dloc ) ); + + DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl; + if ( !cappedLastDelRecLastExtent().isValid() ) { + // Initial extent allocation. Insert at end. 
+ d->nextDeleted() = DiskLoc(); + if ( cappedListOfAllDeletedRecords().isNull() ) + setListOfAllDeletedRecords( txn, dloc ); + else { + DiskLoc i = cappedListOfAllDeletedRecords(); + for (; !drec(i)->nextDeleted().isNull(); i = drec(i)->nextDeleted() ) + ; + *txn->recoveryUnit()->writing(&drec(i)->nextDeleted()) = dloc; + } + } + else { + d->nextDeleted() = cappedFirstDeletedInCurExtent(); + setFirstDeletedInCurExtent( txn, dloc ); + // always compact() after this so order doesn't matter + } + } + + RecordIterator* CappedRecordStoreV1::getIterator( OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const { + return new CappedRecordStoreV1Iterator( txn, this, start, tailable, dir ); + } + + vector<RecordIterator*> CappedRecordStoreV1::getManyIterators( OperationContext* txn ) const { + OwnedPointerVector<RecordIterator> iterators; + + if (!_details->capLooped()) { + // if we haven't looped yet, just spit out all extents (same as non-capped impl) + const Extent* ext; + for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) { + ext = _getExtent(txn, extLoc); + if (ext->firstRecord.isNull()) + continue; + + iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn, + ext->firstRecord, + this)); + } + } + else { + // if we've looped we need to iterate the extents, starting and ending with the + // capExtent + const DiskLoc capExtent = details()->capExtent(); + invariant(!capExtent.isNull()); + invariant(capExtent.isValid()); + + // First do the "old" portion of capExtent if there is any + DiskLoc extLoc = capExtent; + { + const Extent* ext = _getExtent(txn, extLoc); + if (ext->firstRecord != details()->capFirstNewRecord()) { + // this means there is old data in capExtent + iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn, + ext->firstRecord, + this)); + } + + extLoc = ext->xnext.isNull() ? 
details()->firstExtent(txn) : ext->xnext; + } + + // Next handle all the other extents + while (extLoc != capExtent) { + const Extent* ext = _getExtent(txn, extLoc); + iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn, + ext->firstRecord, + this)); + + extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext; + } + + // Finally handle the "new" data in the capExtent + iterators.push_back( + new RecordStoreV1Base::IntraExtentIterator(txn, + details()->capFirstNewRecord(), + this)); + } + + return iterators.release(); + } + + Status CappedRecordStoreV1::compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ) { + invariant(false); + } + + void CappedRecordStoreV1::_maybeComplain( OperationContext* txn, int len ) const { + RARELY { + std::stringstream buf; + buf << "couldn't make room for record len: " << len << " in capped ns " << _ns << '\n'; + buf << "numRecords: " << numRecords() << '\n'; + int i = 0; + for ( DiskLoc e = _details->firstExtent(txn); + !e.isNull(); + e = _extentManager->getExtent( e )->xnext, ++i ) { + buf << " Extent " << i; + if ( e == _details->capExtent() ) + buf << " (capExtent)"; + buf << ' ' << e; + buf << '\n'; + + buf << " magic: " << hex << _extentManager->getExtent( e )->magic << dec + << " extent->ns: " << _extentManager->getExtent( e )->nsDiagnostic.toString() + << '\n'; + buf << " fr: " << _extentManager->getExtent( e )->firstRecord.toString() + << " lr: " << _extentManager->getExtent( e )->lastRecord.toString() + << " extent->len: " << _extentManager->getExtent( e )->length << '\n'; + } + + warning() << buf.str(); + + // assume it is unusually large record; if not, something is broken + fassert( 17438, len * 5 > _details->lastExtentSize(txn) ); + } + } + + DiskLoc CappedRecordStoreV1::firstRecord( OperationContext* txn, + const DiskLoc &startExtent ) const { + for (DiskLoc i = startExtent.isNull() ? 
_details->firstExtent(txn) : startExtent; + !i.isNull(); + i = _extentManager->getExtent( i )->xnext ) { + + Extent* e = _extentManager->getExtent( i ); + + if ( !e->firstRecord.isNull() ) + return e->firstRecord; + } + return DiskLoc(); + } + + DiskLoc CappedRecordStoreV1::lastRecord( OperationContext* txn, + const DiskLoc &startExtent ) const { + for (DiskLoc i = startExtent.isNull() ? _details->lastExtent(txn) : startExtent; + !i.isNull(); + i = _extentManager->getExtent( i )->xprev ) { + + Extent* e = _extentManager->getExtent( i ); + if ( !e->lastRecord.isNull() ) + return e->lastRecord; + } + return DiskLoc(); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h new file mode 100644 index 00000000000..4422b5d451b --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h @@ -0,0 +1,139 @@ +// record_store_v1_capped.h + +/** +* Copyright (C) 2013 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. 
You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/base/owned_pointer_vector.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/capped_callback.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class CappedRecordStoreV1 : public RecordStoreV1Base { + public: + CappedRecordStoreV1( OperationContext* txn, + CappedDocumentDeleteCallback* collection, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ); + + virtual ~CappedRecordStoreV1(); + + const char* name() const { return "CappedRecordStoreV1"; } + + virtual Status truncate(OperationContext* txn); + + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. 
+ * @param inclusive - Truncate 'end' as well iff true + * XXX: this will go away soon, just needed to move for now + */ + virtual void temp_cappedTruncateAfter( OperationContext* txn, DiskLoc end, bool inclusive ); + + virtual RecordIterator* getIterator( OperationContext* txn, + const DiskLoc& start, bool tailable, + const CollectionScanParams::Direction& dir) const; + + virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const; + + virtual bool compactSupported() const { return false; } + + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ); + + // Start from firstExtent by default. + DiskLoc firstRecord( OperationContext* txn, + const DiskLoc &startExtent = DiskLoc() ) const; + // Start from lastExtent by default. + DiskLoc lastRecord( OperationContext* txn, + const DiskLoc &startExtent = DiskLoc() ) const; + + protected: + + virtual bool isCapped() const { return true; } + + virtual void setCappedDeleteCallback( CappedDocumentDeleteCallback* cb ) { + _deleteCallback = cb; + } + + virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ); + + virtual void addDeletedRec(OperationContext* txn, const DiskLoc& dloc); + + private: + // -- start copy from cap.cpp -- + void compact(OperationContext* txn); + const DiskLoc& cappedFirstDeletedInCurExtent() const; + void setFirstDeletedInCurExtent( OperationContext* txn, const DiskLoc& loc ); + void cappedCheckMigrate(OperationContext* txn); + DiskLoc __capAlloc( OperationContext* txn, int len ); + bool inCapExtent( const DiskLoc &dl ) const; + const DiskLoc& cappedListOfAllDeletedRecords() const; + const DiskLoc& cappedLastDelRecLastExtent() const; + void setListOfAllDeletedRecords( OperationContext* txn, const DiskLoc& loc ); + void setLastDelRecLastExtent( OperationContext* txn, const DiskLoc& loc ); + Extent *theCapExtent() const; + bool 
nextIsInCapExtent( const DiskLoc &dl ) const; + void advanceCapExtent( OperationContext* txn, const StringData& ns ); + void cappedTruncateLastDelUpdate(OperationContext* txn); + + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. + * @param inclusive - Truncate 'end' as well iff true + */ + void cappedTruncateAfter(OperationContext* txn, + const char* ns, + DiskLoc end, + bool inclusive); + + void _maybeComplain( OperationContext* txn, int len ) const; + + // -- end copy from cap.cpp -- + + CappedDocumentDeleteCallback* _deleteCallback; + + OwnedPointerVector<ExtentManager::CacheHint> _extentAdvice; + + friend class CappedRecordStoreV1Iterator; + }; + + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp new file mode 100644 index 00000000000..11f7894fe77 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp @@ -0,0 +1,237 @@ +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" + +namespace mongo { + + + // + // Capped collection traversal + // + CappedRecordStoreV1Iterator::CappedRecordStoreV1Iterator( OperationContext* txn, + const CappedRecordStoreV1* collection, + const DiskLoc& start, bool tailable, + const CollectionScanParams::Direction& dir) + : _txn(txn), _recordStore(collection), _curr(start), _tailable(tailable), + _direction(dir), _killedByInvalidate(false) { + + if (_curr.isNull()) { + + const RecordStoreV1MetaData* nsd = _recordStore->details(); + + // If a start position isn't specified, we fill one out from the start of the + // collection. + if (CollectionScanParams::FORWARD == _direction) { + // Going forwards. + if (!nsd->capLooped()) { + // If our capped collection doesn't loop around, the first record is easy. + _curr = collection->firstRecord(_txn); + } + else { + // Our capped collection has "looped' around. 
+ // Copied verbatim from ForwardCappedCursor::init. + // TODO ELABORATE + _curr = _getExtent( nsd->capExtent() )->firstRecord; + if (!_curr.isNull() && _curr == nsd->capFirstNewRecord()) { + _curr = _getExtent( nsd->capExtent() )->lastRecord; + _curr = nextLoop(_curr); + } + } + } + else { + // Going backwards + if (!nsd->capLooped()) { + // Start at the end. + _curr = collection->lastRecord(_txn); + } + else { + _curr = _getExtent( nsd->capExtent() )->lastRecord; + } + } + } + } + + bool CappedRecordStoreV1Iterator::isEOF() { return _curr.isNull(); } + + DiskLoc CappedRecordStoreV1Iterator::curr() { return _curr; } + + DiskLoc CappedRecordStoreV1Iterator::getNext() { + DiskLoc ret = _curr; + + // Move to the next thing. + if (!isEOF()) { + _prev = _curr; + _curr = getNextCapped(_curr); + } + else if (_tailable && !_prev.isNull()) { + // If we're tailable, there COULD have been something inserted even though we were + // previously EOF. Look at the next thing from 'prev' and see. + DiskLoc newCurr = getNextCapped(_prev); + + if (!newCurr.isNull()) { + // There's something new to return. _curr always points to the next thing to + // return. Update it, and move _prev to the thing we just returned. + _prev = ret = newCurr; + _curr = getNextCapped(_prev); + } + } + + return ret; + } + + void CappedRecordStoreV1Iterator::invalidate(const DiskLoc& dl) { + if ((_tailable && _curr.isNull() && dl == _prev) || (dl == _curr)) { + // In the _tailable case, we're about to kill the DiskLoc that we're tailing. Nothing + // that we can possibly do to survive that. + // + // In the _curr case, we *could* move to the next thing, since there is actually a next + // thing, but according to clientcursor.cpp: + // "note we cannot advance here. if this condition occurs, writes to the oplog + // have "caught" the reader. skipping ahead, the reader would miss postentially + // important data." 
+ _curr = _prev = DiskLoc(); + _killedByInvalidate = true; + } + } + + void CappedRecordStoreV1Iterator::prepareToYield() { + } + + bool CappedRecordStoreV1Iterator::recoverFromYield() { + // If invalidate invalidated the DiskLoc we relied on, give up now. + if (_killedByInvalidate) { + _recordStore = NULL; + return false; + } + + return true; + } + + DiskLoc CappedRecordStoreV1Iterator::getNextCapped(const DiskLoc& dl) { + invariant(!dl.isNull()); + const RecordStoreV1MetaData* details = _recordStore->details(); + + if (CollectionScanParams::FORWARD == _direction) { + // If it's not looped, it's easy. + if (!_recordStore->details()->capLooped()) { + return _getNextRecord( dl ); + } + + // TODO ELABORATE + // EOF. + if (dl == _getExtent( details->capExtent() )->lastRecord) { + return DiskLoc(); + } + + DiskLoc ret = nextLoop(dl); + + // If we become capFirstNewRecord from same extent, advance to next extent. + if (ret == details->capFirstNewRecord() && ret != _getExtent( details->capExtent() )->firstRecord) { + ret = nextLoop(_getExtent( details->capExtent() )->lastRecord); + } + + // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord + if (ret == _getExtent( details->capExtent() )->firstRecord) { ret = details->capFirstNewRecord(); } + + return ret; + } + else { + if (!details->capLooped()) { return _getPrevRecord( dl ); } + + // TODO ELABORATE + // Last record + if (details->capFirstNewRecord() == _getExtent( details->capExtent() )->firstRecord) { + if (dl == nextLoop(_getExtent( details->capExtent() )->lastRecord)) { + return DiskLoc(); + } + } + else { + if (dl == _getExtent( details->capExtent() )->firstRecord) { return DiskLoc(); } + } + + DiskLoc ret; + // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev. 
+ if (dl == details->capFirstNewRecord()) { + ret = prevLoop(_getExtent( details->capExtent() )->firstRecord); + } + else { + ret = prevLoop(dl); + } + + // If we just became last in cap extent, advance past capFirstNewRecord + // (We know ext(capExtent)->firstRecord != capFirstNewRecord, since would + // have returned DiskLoc() earlier otherwise.) + if (ret == _getExtent( details->capExtent() )->lastRecord) { + ret = _getPrevRecord( details->capFirstNewRecord() ); + } + + return ret; + } + } + + DiskLoc CappedRecordStoreV1Iterator::nextLoop(const DiskLoc& prev) { + // TODO ELABORATE + DiskLoc next = _getNextRecord( prev ); + if (!next.isNull()) { + return next; + } + return _recordStore->firstRecord(_txn); + } + + DiskLoc CappedRecordStoreV1Iterator::prevLoop(const DiskLoc& curr) { + // TODO ELABORATE + DiskLoc prev = _getPrevRecord( curr ); + if (!prev.isNull()) { + return prev; + } + return _recordStore->lastRecord(_txn); + } + + RecordData CappedRecordStoreV1Iterator::dataFor( const DiskLoc& loc ) const { + return _recordStore->dataFor( loc ); + } + + Extent* CappedRecordStoreV1Iterator::_getExtent( const DiskLoc& loc ) { + return _recordStore->_extentManager->getExtent( loc ); + } + + DiskLoc CappedRecordStoreV1Iterator::_getNextRecord( const DiskLoc& loc ) { + return _recordStore->getNextRecord( _txn, loc ); + } + + DiskLoc CappedRecordStoreV1Iterator::_getPrevRecord( const DiskLoc& loc ) { + return _recordStore->getPrevRecord( _txn, loc ); + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h new file mode 100644 index 00000000000..501986d98fa --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h @@ -0,0 +1,100 @@ +/** + * Copyright (C) 2013 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class CappedRecordStoreV1; + + struct Extent; + + /** + * This class iterates over a capped collection identified by 'ns'. + * The collection must exist when the constructor is called. + * + * If start is not DiskLoc(), the iteration begins at that DiskLoc. + * + * If tailable is true, getNext() can be called after isEOF. It will use the last valid + * returned DiskLoc and try to find the next record from that. 
+ */ + class CappedRecordStoreV1Iterator : public RecordIterator { + public: + CappedRecordStoreV1Iterator( OperationContext* txn, + const CappedRecordStoreV1* collection, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir ); + virtual ~CappedRecordStoreV1Iterator() { } + + // If this is a tailable cursor, isEOF could change its mind after a call to getNext(). + virtual bool isEOF(); + virtual DiskLoc getNext(); + virtual DiskLoc curr(); + + virtual void invalidate(const DiskLoc& dl); + virtual void prepareToYield(); + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + private: + /** + * Internal collection navigation helper methods. + */ + DiskLoc getNextCapped(const DiskLoc& dl); + DiskLoc prevLoop(const DiskLoc& curr); + DiskLoc nextLoop(const DiskLoc& prev); + + // some helpers - these move to RecordStore probably + Extent* _getExtent( const DiskLoc& loc ); + DiskLoc _getNextRecord( const DiskLoc& loc ); + DiskLoc _getPrevRecord( const DiskLoc& loc ); + + // transactional context for read locks. Not owned by us + OperationContext* _txn; + + // The collection we're iterating over. + const CappedRecordStoreV1* _recordStore; + + // The result returned on the next call to getNext(). + DiskLoc _curr; + + // If we're tailable, we try to progress from the last valid result when we hit the end. + DiskLoc _prev; + bool _tailable; + + CollectionScanParams::Direction _direction; + + // If invalidate kills the DiskLoc we need to move forward, we kill the iterator. See the + // comment in the body of invalidate(...). 
+ bool _killedByInvalidate; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp new file mode 100644 index 00000000000..6e423b9e073 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp @@ -0,0 +1,558 @@ +// record_store_v1_capped_test.cpp + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" + +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" +#include "mongo/unittest/unittest.h" + +using namespace mongo; + +namespace { + + // Provides data to be inserted. Must be large enough for largest possible record. + // Should be in BSS so unused portions should be free. + char zeros[20*1024*1024] = {}; + + class DummyCappedDocumentDeleteCallback : public CappedDocumentDeleteCallback { + public: + Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) { + deleted.push_back( loc ); + return Status::OK(); + } + vector<DiskLoc> deleted; + }; + + void simpleInsertTest( const char* buf, int size ) { + + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + + string myns = "test.simple1"; + CappedRecordStoreV1 rs( &txn, &cb, myns, md, &em, false ); + + rs.increaseStorageSize( &txn, 1024, -1 ); + + ASSERT_NOT_OK( rs.insertRecord( &txn, buf, 3, 1000 ).getStatus() ); + + rs.insertRecord( &txn, buf, size, 10000 ); + + { + BSONObjBuilder b; + int64_t storageSize = rs.storageSize( &txn, &b ); + BSONObj obj = b.obj(); + ASSERT_EQUALS( 1, obj["numExtents"].numberInt() ); + ASSERT_EQUALS( storageSize, em.quantizeExtentSize( 1024 ) ); + } + + for ( int i = 0; i < 1000; i++ ) { + ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() ); + } + + long long start = md->numRecords(); + for ( int i = 0; i < 1000; i++ ) { + ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() ); + } + ASSERT_EQUALS( start, md->numRecords() ); + ASSERT_GREATER_THAN( start, 100 ); + ASSERT_LESS_THAN( start, 1000 ); + } + + TEST(CappedRecordStoreV1, SimpleInsertSize4) { + simpleInsertTest("abcd", 4); + } + TEST(CappedRecordStoreV1, SimpleInsertSize8) { + 
simpleInsertTest("abcdefgh", 8); + } + + TEST(CappedRecordStoreV1, EmptySingleExtent) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 900}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped + } + } + + TEST(CappedRecordStoreV1, FirstLoopWithSingleExtentExactSize) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 50}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1200), 100}, // first old record + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, // last old record + {DiskLoc(0, 1000), 100}, // first new record + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 100}, // gap 
after newest record XXX this is probably a bug + {DiskLoc(0, 1500), 50}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + TEST(CappedRecordStoreV1, NonFirstLoopWithSingleExtentExactSize) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 50}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1200), 100}, // first old record + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, // last old record + {DiskLoc(0, 1000), 100}, // first new record + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug + {DiskLoc(0, 1500), 50}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + /** + * Current code always tries to leave 24 bytes to create a DeletedRecord. 
+ */ + TEST(CappedRecordStoreV1, WillLoopWithout24SpareBytes) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 123}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1200), 100}, // first old record + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, // last old record + {DiskLoc(0, 1000), 100}, // first new record + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 100}, // gap after newest record + {DiskLoc(0, 1500), 123}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + TEST(CappedRecordStoreV1, WontLoopWith24SpareBytes) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 124}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - 
Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {DiskLoc(0, 1500), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1600), 24}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + TEST(CappedRecordStoreV1, MoveToSecondExtentUnLooped) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Two extents, each with 1000 bytes. + LocAndSize records[] = { + {DiskLoc(0, 1000), 500}, + {DiskLoc(0, 1500), 300}, + {DiskLoc(0, 1800), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1900), 100}, + {DiskLoc(1, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 500}, + {DiskLoc(0, 1500), 300}, + {DiskLoc(0, 1800), 100}, + + {DiskLoc(1, 1000), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1900), 100}, + {DiskLoc(1, 1100), 900}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped + } + } + + TEST(CappedRecordStoreV1, MoveToSecondExtentLooped) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Two extents, each with 
1000 bytes. + LocAndSize records[] = { + {DiskLoc(0, 1800), 100}, // old + {DiskLoc(0, 1000), 500}, // first new + {DiskLoc(0, 1500), 400}, + + {DiskLoc(1, 1000), 300}, + {DiskLoc(1, 1300), 600}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1900), 100}, + {DiskLoc(1, 1900), 100}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 500}, + {DiskLoc(0, 1500), 400}, + + {DiskLoc(1, 1300), 600}, // old + {DiskLoc(1, 1000), 200}, // first new + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1800), 200}, + {DiskLoc(1, 1200), 100}, + {DiskLoc(1, 1900), 100}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(1, 1000)); + } + } + + // + // XXX The CappedRecordStoreV1Scrambler suite of tests describe existing behavior that is less + // than ideal. Any improved implementation will need to be able to handle a collection that has + // been scrambled like this. + // + + /** + * This is a minimal example that shows the current allocator laying out records out-of-order. + */ + TEST(CappedRecordStoreV1Scrambler, Minimal) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Starting with a single empty 1000 byte extent. 
+ LocAndSize records[] = { + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 500 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 300 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 400 - Record::HeaderSize, false); // won't fit at end so wraps + rs.insertRecord(&txn, zeros, 120 - Record::HeaderSize, false); // fits at end + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); // fits in earlier hole + + { + LocAndSize recs[] = { + {DiskLoc(0, 1500), 300}, // 2nd insert + {DiskLoc(0, 1000), 400}, // 3rd (1st new) + {DiskLoc(0, 1800), 120}, // 4th + {DiskLoc(0, 1400), 60}, // 5th + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1460), 40}, + {DiskLoc(0, 1920), 80}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + /** + * This tests a specially crafted set of inserts that scrambles a capped collection in a way + * that leaves 4 deleted records in a single extent. + */ + TEST(CappedRecordStoreV1Scrambler, FourDeletedRecordsInSingleExtent) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Starting with a single empty 1000 byte extent. + LocAndSize records[] = { + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped + initializeV1RS(&txn, records, drecs, &em, md); + } + + // This list of sizes was empirically generated to achieve this outcome. 
Don't think too + // much about them. + rs.insertRecord(&txn, zeros, 500 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 300 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 304 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 76 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 76 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 56 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 104 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 146 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 146 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 40 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 40 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 36 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 64 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1148), 148}, + {DiskLoc(0, 1936), 40}, + {DiskLoc(0, 1712), 40}, + {DiskLoc(0, 1296), 36}, + {DiskLoc(0, 1752), 
100}, + {DiskLoc(0, 1332), 96}, + {DiskLoc(0, 1428), 200}, + {DiskLoc(0, 1852), 60}, + {DiskLoc(0, 1000), 64}, // (1st new) + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1064), 84}, + {DiskLoc(0, 1976), 24}, + {DiskLoc(0, 1912), 24}, + {DiskLoc(0, 1628), 84}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp new file mode 100644 index 00000000000..a210c0dc0f3 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp @@ -0,0 +1,192 @@ +/** + * Copyright (C) 2014 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. 
If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +namespace mongo { + + RecordStoreV1RepairIterator::RecordStoreV1RepairIterator(OperationContext* txn, + const RecordStoreV1Base* recordStore) + : _txn(txn), _recordStore(recordStore), _stage(FORWARD_SCAN) { + + // Position the iterator at the first record + // + getNext(); + } + + bool RecordStoreV1RepairIterator::isEOF() { + return _currRecord.isNull(); + } + + DiskLoc RecordStoreV1RepairIterator::curr() { return _currRecord; } + + DiskLoc RecordStoreV1RepairIterator::getNext() { + DiskLoc retVal = _currRecord; + + const ExtentManager* em = _recordStore->_extentManager; + + while (true) { + if (_currRecord.isNull()) { + + if (!_advanceToNextValidExtent()) { + return retVal; + } + + _seenInCurrentExtent.clear(); + + // Otherwise _advanceToNextValidExtent would have returned false + // + invariant(!_currExtent.isNull()); + + const Extent* e = em->getExtent(_currExtent, false); + _currRecord = (FORWARD_SCAN == _stage ? 
e->firstRecord : e->lastRecord); + } + else { + switch (_stage) { + case FORWARD_SCAN: + _currRecord = _recordStore->getNextRecordInExtent(_txn, _currRecord); + break; + case BACKWARD_SCAN: + _currRecord = _recordStore->getPrevRecordInExtent(_txn, _currRecord); + break; + default: + invariant(!"This should never be reached."); + break; + } + } + + if (_currRecord.isNull()) { + continue; + } + + // Validate the contents of the record's disk location and deduplicate + // + if (!_seenInCurrentExtent.insert(_currRecord).second) { + error() << "infinite loop in extent, seen: " << _currRecord << " before" << endl; + _currRecord = DiskLoc(); + continue; + } + + if (_currRecord.getOfs() <= 0){ + error() << "offset is 0 for record which should be impossible" << endl; + _currRecord = DiskLoc(); + continue; + } + + return retVal; + } + } + + bool RecordStoreV1RepairIterator::_advanceToNextValidExtent() { + const ExtentManager* em = _recordStore->_extentManager; + + while (true) { + if (_currExtent.isNull()) { + switch (_stage) { + case FORWARD_SCAN: + _currExtent = _recordStore->details()->firstExtent(_txn); + break; + case BACKWARD_SCAN: + _currExtent = _recordStore->details()->lastExtent(_txn); + break; + default: + invariant(DONE == _stage); + return false; + } + } + else { + // If _currExtent is not NULL, then it must point to a valid extent, so no extra + // checks here. + // + const Extent* e = em->getExtent(_currExtent, false); + _currExtent = (FORWARD_SCAN == _stage ? 
e->xnext : e->xprev); + } + + bool hasNextExtent = !_currExtent.isNull(); + + // Sanity checks for the extent's disk location + // + if (hasNextExtent && (!_currExtent.isValid() || (_currExtent.getOfs() <= 0))) { + error() << "Invalid extent location: " << _currExtent << endl; + + // Switch the direction of scan + // + hasNextExtent = false; + } + + if (hasNextExtent) { + break; + } + + // Swap the direction of scan and loop again + // + switch (_stage) { + case FORWARD_SCAN: + _stage = BACKWARD_SCAN; + break; + case BACKWARD_SCAN: + _stage = DONE; + break; + default: + invariant(!"This should never be reached."); + break; + } + + _currExtent = DiskLoc(); + } + + + // Check _currExtent's contents for validity, but do not count is as failure if they + // don't check out. + // + const Extent* e = em->getExtent(_currExtent, false); + if (!e->isOk()){ + warning() << "Extent not ok magic: " << e->magic << " going to try to continue" + << endl; + } + + log() << (FORWARD_SCAN == _stage ? "FORWARD" : "BACKWARD") << " Extent loc: " + << _currExtent << ", length: " << e->length << endl; + + return true; + } + + void RecordStoreV1RepairIterator::invalidate(const DiskLoc& dl) { + verify(!"Invalidate is not supported for RecordStoreV1RepairIterator."); + } + + RecordData RecordStoreV1RepairIterator::dataFor(const DiskLoc& loc) const { + return _recordStore->dataFor( loc ); + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h new file mode 100644 index 00000000000..c75c1c790c1 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h @@ -0,0 +1,96 @@ +/** + * Copyright (C) 2014 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <set> + +#include "mongo/db/storage/record_store.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + /** + * This iterator will go over the collection twice - once going forward (first extent -> last + * extent) and once backwards in an attempt to salvage potentially corrupted or unreachable + * records. It is used by the mongodump --repair option. 
+ */ + class RecordStoreV1RepairIterator : public RecordIterator { + public: + RecordStoreV1RepairIterator(OperationContext* txn, + const RecordStoreV1Base* recordStore); + virtual ~RecordStoreV1RepairIterator() { } + + virtual bool isEOF(); + virtual DiskLoc getNext(); + virtual DiskLoc curr(); + + virtual void invalidate(const DiskLoc& dl); + virtual void prepareToYield() { } + virtual bool recoverFromYield() { + return true; + } + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + + /** + * Based on the direction of scan, finds the next valid (un-corrupted) extent in the chain + * and sets _currExtent to point to that. + * + * @return true if valid extent was found (_currExtent will not be null) + * false otherwise and _currExtent will be null + */ + bool _advanceToNextValidExtent(); + + // transactional context for read locks. Not owned by us + OperationContext* _txn; + + // Reference to the owning RecordStore. The store must not be deleted while there are + // active iterators on it. + // + const RecordStoreV1Base* _recordStore; + + DiskLoc _currExtent; + DiskLoc _currRecord; + + enum Stage { + FORWARD_SCAN = 0, + BACKWARD_SCAN = 1, + DONE = 2 + }; + + Stage _stage; + + // Used to find cycles within an extent. Cleared after each extent has been processed. + // + std::set<DiskLoc> _seenInCurrentExtent; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp new file mode 100644 index 00000000000..7a9d17974eb --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp @@ -0,0 +1,505 @@ +// record_store_v1_simple.cpp + +/** + * Copyright (C) 2013-2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +#include "mongo/base/counter.h" +#include "mongo/db/catalog/collection.h" +#include "mongo/db/curop.h" +#include "mongo/db/commands/server_status_metric.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h" +#include "mongo/util/log.h" +#include "mongo/util/progress_meter.h" +#include "mongo/util/timer.h" +#include "mongo/util/touch_pages.h" + +namespace mongo { + + MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kStorage); + + static Counter64 freelistAllocs; + static Counter64 freelistBucketExhausted; + static Counter64 freelistIterations; + + static ServerStatusMetricField<Counter64> dFreelist1( "storage.freelist.search.requests", + &freelistAllocs ); + + static ServerStatusMetricField<Counter64> dFreelist2( "storage.freelist.search.bucketExhausted", + &freelistBucketExhausted ); + + static ServerStatusMetricField<Counter64> dFreelist3( "storage.freelist.search.scanned", + &freelistIterations ); + + SimpleRecordStoreV1::SimpleRecordStoreV1( OperationContext* txn, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ) + : RecordStoreV1Base( ns, details, em, isSystemIndexes ) { + + invariant( !details->isCapped() ); + _normalCollection = NamespaceString::normal( ns ); + if ( _details->paddingFactor() == 0 ) { + warning() << "implicit updgrade of paddingFactor of very old collection" << endl; + _details->setPaddingFactor(txn, 1.0); + } + + } + + SimpleRecordStoreV1::~SimpleRecordStoreV1() { + } + + DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn, + int lenToAlloc ) { + // align size up to a multiple of 4 + lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1); + + 
freelistAllocs.increment(); + DiskLoc loc; + { + DiskLoc *prev = 0; + DiskLoc *bestprev = 0; + DiskLoc bestmatch; + int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough + int b = bucket(lenToAlloc); + DiskLoc cur = _details->deletedListEntry(b); + + int extra = 5; // look for a better fit, a little. + int chain = 0; + while ( 1 ) { + { // defensive check + int fileNumber = cur.a(); + int fileOffset = cur.getOfs(); + if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) { + StringBuilder sb; + sb << "Deleted record list corrupted in collection " << _ns + << ", bucket " << b + << ", link number " << chain + << ", invalid link is " << cur.toString() + << ", throwing Fatal Assertion"; + log() << sb.str() << endl; + fassertFailed(16469); + } + } + if ( cur.isNull() ) { + // move to next bucket. if we were doing "extra", just break + if ( bestmatchlen < INT_MAX ) + break; + + if ( chain > 0 ) { + // if we looked at things in the right bucket, but they were not suitable + freelistBucketExhausted.increment(); + } + + b++; + if ( b > MaxBucket ) { + // out of space. alloc a new extent. 
+ freelistIterations.increment( 1 + chain ); + return DiskLoc(); + } + cur = _details->deletedListEntry(b); + prev = 0; + continue; + } + DeletedRecord *r = drec(cur); + if ( r->lengthWithHeaders() >= lenToAlloc && + r->lengthWithHeaders() < bestmatchlen ) { + bestmatchlen = r->lengthWithHeaders(); + bestmatch = cur; + bestprev = prev; + if (r->lengthWithHeaders() == lenToAlloc) + // exact match, stop searching + break; + } + if ( bestmatchlen < INT_MAX && --extra <= 0 ) + break; + if ( ++chain > 30 && b <= MaxBucket ) { + // too slow, force move to next bucket to grab a big chunk + //b++; + freelistIterations.increment( chain ); + chain = 0; + cur.Null(); + } + else { + cur = r->nextDeleted(); + prev = &r->nextDeleted(); + } + } + + // unlink ourself from the deleted list + DeletedRecord *bmr = drec(bestmatch); + if ( bestprev ) { + *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted(); + } + else { + // should be the front of a free-list + int myBucket = bucket(bmr->lengthWithHeaders()); + invariant( _details->deletedListEntry(myBucket) == bestmatch ); + _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted()); + } + *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive. + invariant(bmr->extentOfs() < bestmatch.getOfs()); + + freelistIterations.increment( 1 + chain ); + loc = bestmatch; + } + + if ( loc.isNull() ) + return loc; + + // determine if we should chop up + + DeletedRecord *r = drec(loc); + + /* note we want to grab from the front so our next pointers on disk tend + to go in a forward direction which is important for performance. */ + int regionlen = r->lengthWithHeaders(); + invariant( r->extentOfs() < loc.getOfs() ); + + int left = regionlen - lenToAlloc; + if ( left < 24 || left < (lenToAlloc / 8) ) { + // you get the whole thing. 
+ return loc; + } + + // don't quantize: + // - $ collections (indexes) as we already have those aligned the way we want SERVER-8425 + if ( _normalCollection ) { + // we quantize here so that it only impacts newly sized records + // this prevents oddities with older records and space re-use SERVER-8435 + lenToAlloc = std::min( r->lengthWithHeaders(), + quantizeAllocationSpace( lenToAlloc ) ); + left = regionlen - lenToAlloc; + + if ( left < 24 ) { + // you get the whole thing. + return loc; + } + } + + /* split off some for further use. */ + txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; + DiskLoc newDelLoc = loc; + newDelLoc.inc(lenToAlloc); + DeletedRecord* newDel = drec(newDelLoc); + DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel); + newDelW->extentOfs() = r->extentOfs(); + newDelW->lengthWithHeaders() = left; + newDelW->nextDeleted().Null(); + + addDeletedRec( txn, newDelLoc ); + return loc; + } + + StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ) { + DiskLoc loc = _allocFromExistingExtents( txn, lengthWithHeaders ); + if ( !loc.isNull() ) + return StatusWith<DiskLoc>( loc ); + + LOG(1) << "allocating new extent"; + + increaseStorageSize( txn, + _extentManager->followupSize( lengthWithHeaders, + _details->lastExtentSize(txn)), + enforceQuota ); + + loc = _allocFromExistingExtents( txn, lengthWithHeaders ); + if ( !loc.isNull() ) { + // got on first try + return StatusWith<DiskLoc>( loc ); + } + + log() << "warning: alloc() failed after allocating new extent. 
" + << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:" + << _details->lastExtentSize(txn) << "; trying again"; + + for ( int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++ ) { + log() << "try #" << z << endl; + + increaseStorageSize( txn, + _extentManager->followupSize( lengthWithHeaders, + _details->lastExtentSize(txn)), + enforceQuota ); + + loc = _allocFromExistingExtents( txn, lengthWithHeaders ); + if ( ! loc.isNull() ) + return StatusWith<DiskLoc>( loc ); + } + + return StatusWith<DiskLoc>( ErrorCodes::InternalError, "cannot allocate space" ); + } + + Status SimpleRecordStoreV1::truncate(OperationContext* txn) { + return Status( ErrorCodes::InternalError, + "SimpleRecordStoreV1::truncate not implemented" ); + } + + void SimpleRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) { + DeletedRecord* d = drec( dloc ); + + DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl; + + int b = bucket(d->lengthWithHeaders()); + *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b); + _details->setDeletedListEntry(txn, b, dloc); + } + + RecordIterator* SimpleRecordStoreV1::getIterator( OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const { + return new SimpleRecordStoreV1Iterator( txn, this, start, dir ); + } + + vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const { + OwnedPointerVector<RecordIterator> iterators; + const Extent* ext; + for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) { + ext = _getExtent(txn, extLoc); + if (ext->firstRecord.isNull()) + continue; + iterators.push_back( + new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this)); + } + + return iterators.release(); + } + + class CompactDocWriter : public DocWriter { + public: + /** + * param allocationSize - 
allocation size WITH header + */ + CompactDocWriter( const Record* rec, unsigned dataSize, size_t allocationSize ) + : _rec( rec ), + _dataSize( dataSize ), + _allocationSize( allocationSize ) { + } + + virtual ~CompactDocWriter() {} + + virtual void writeDocument( char* buf ) const { + memcpy( buf, _rec->data(), _dataSize ); + } + + virtual size_t documentSize() const { + return _allocationSize - Record::HeaderSize; + } + + virtual bool addPadding() const { + return false; + } + + private: + const Record* _rec; + size_t _dataSize; + size_t _allocationSize; + }; + + void SimpleRecordStoreV1::_compactExtent(OperationContext* txn, + const DiskLoc diskloc, + int extentNumber, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* compactOptions, + CompactStats* stats ) { + + log() << "compact begin extent #" << extentNumber + << " for namespace " << _ns << " " << diskloc; + + unsigned oldObjSize = 0; // we'll report what the old padding was + unsigned oldObjSizeWithPadding = 0; + + Extent *e = _extentManager->getExtent( diskloc ); + e->assertOk(); + fassert( 17437, e->validates(diskloc) ); + + { + // the next/prev pointers within the extent might not be in order so we first + // page the whole thing in sequentially + log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; + Timer t; + size_t length = e->length; + + touch_pages( reinterpret_cast<const char*>(e), length ); + int ms = t.millis(); + if( ms > 1000 ) + log() << "compact end paging in " << ms << "ms " + << e->length/1000000.0/t.seconds() << "MB/sec" << endl; + } + + { + log() << "compact copying records" << endl; + long long datasize = 0; + long long nrecords = 0; + DiskLoc L = e->firstRecord; + if( !L.isNull() ) { + while( 1 ) { + Record *recOld = recordFor(L); + RecordData oldData = recOld->toRecordData(); + L = getNextRecordInExtent(txn, L); + + if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) { + // object is corrupt! 
+ log() << "compact skipping corrupt document!"; + stats->corruptDocuments++; + } + else { + unsigned dataSize = adaptor->dataSize( oldData ); + unsigned docSize = dataSize; + + nrecords++; + oldObjSize += docSize; + oldObjSizeWithPadding += recOld->netLength(); + + unsigned lenWHdr = docSize + Record::HeaderSize; + unsigned lenWPadding = lenWHdr; + + switch( compactOptions->paddingMode ) { + case CompactOptions::NONE: + if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) ) + lenWPadding = quantizePowerOf2AllocationSpace(lenWPadding); + break; + case CompactOptions::PRESERVE: + // if we are preserving the padding, the record should not change size + lenWPadding = recOld->lengthWithHeaders(); + break; + case CompactOptions::MANUAL: + lenWPadding = compactOptions->computeRecordSize(lenWPadding); + if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { + lenWPadding = lenWHdr; + } + break; + } + + CompactDocWriter writer( recOld, dataSize, lenWPadding ); + StatusWith<DiskLoc> status = insertRecord( txn, &writer, false ); + uassertStatusOK( status.getStatus() ); + datasize += recordFor( status.getValue() )->netLength(); + + adaptor->inserted( dataFor( status.getValue() ), status.getValue() ); + } + + if( L.isNull() ) { + // we just did the very last record from the old extent. 
it's still pointed to + // by the old extent ext, but that will be fixed below after this loop + break; + } + + // remove the old records (orphan them) periodically so our commit block doesn't get too large + bool stopping = false; + RARELY stopping = !txn->checkForInterruptNoAssert().isOK(); + if( stopping || txn->recoveryUnit()->isCommitNeeded() ) { + *txn->recoveryUnit()->writing(&e->firstRecord) = L; + Record *r = recordFor(L); + txn->recoveryUnit()->writingInt(r->prevOfs()) = DiskLoc::NullOfs; + txn->recoveryUnit()->commitIfNeeded(); + txn->checkForInterrupt(); + } + } + } // if !L.isNull() + + invariant( _details->firstExtent(txn) == diskloc ); + invariant( _details->lastExtent(txn) != diskloc ); + DiskLoc newFirst = e->xnext; + _details->setFirstExtent( txn, newFirst ); + *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc(); + _extentManager->freeExtent( txn, diskloc ); + + txn->recoveryUnit()->commitIfNeeded(); + + { + double op = 1.0; + if( oldObjSize ) + op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; + log() << "compact finished extent #" << extentNumber << " containing " << nrecords + << " documents (" << datasize/1000000.0 << "MB)" + << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100; + } + } + + } + + Status SimpleRecordStoreV1::compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ) { + + // this is a big job, so might as well make things tidy before we start just to be nice. 
+ txn->recoveryUnit()->commitIfNeeded(); + + list<DiskLoc> extents; + for( DiskLoc extLocation = _details->firstExtent(txn); + !extLocation.isNull(); + extLocation = _extentManager->getExtent( extLocation )->xnext ) { + extents.push_back( extLocation ); + } + log() << "compact " << extents.size() << " extents"; + + log() << "compact orphan deleted lists" << endl; + _details->orphanDeletedList(txn); + + // Start over from scratch with our extent sizing and growth + _details->setLastExtentSize( txn, 0 ); + + // create a new extent so new records go there + increaseStorageSize( txn, _details->lastExtentSize(txn), true ); + + // reset data size and record counts to 0 for this namespace + // as we're about to tally them up again for each new extent + _details->setStats( txn, 0, 0 ); + + ProgressMeterHolder pm(*txn->setMessage("compact extent", + "Extent Compacting Progress", + extents.size())); + + int extentNumber = 0; + for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { + _compactExtent(txn, *i, extentNumber++, adaptor, options, stats ); + pm.hit(); + } + + invariant( _extentManager->getExtent( _details->firstExtent(txn) )->xprev.isNull() ); + invariant( _extentManager->getExtent( _details->lastExtent(txn) )->xnext.isNull() ); + + // indexes will do their own progress meter + pm.finished(); + + return Status::OK(); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h new file mode 100644 index 00000000000..abc6b11b928 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h @@ -0,0 +1,95 @@ +// record_store_v1_simple.h + +/** +* Copyright (C) 2013-2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#pragma once + +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class SimpleRecordStoreV1Iterator; + + // used by index and original collections + class SimpleRecordStoreV1 : public RecordStoreV1Base { + public: + SimpleRecordStoreV1( OperationContext* txn, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ); + + virtual ~SimpleRecordStoreV1(); + + const char* name() const { return "SimpleRecordStoreV1"; } + + virtual RecordIterator* getIterator( OperationContext* txn, const DiskLoc& start, bool tailable, + const CollectionScanParams::Direction& dir) const; + + virtual std::vector<RecordIterator*> getManyIterators(OperationContext* txn) const; + + virtual Status truncate(OperationContext* txn); + + virtual void temp_cappedTruncateAfter(OperationContext* txn, DiskLoc end, bool inclusive) { + invariant(!"cappedTruncateAfter not supported"); + } + + virtual bool compactSupported() const { return true; } + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ); + + protected: + virtual bool isCapped() const { return false; } + + virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ); + + virtual void addDeletedRec(OperationContext* txn, + const DiskLoc& dloc); + private: + DiskLoc _allocFromExistingExtents( OperationContext* txn, + int lengthWithHeaders ); + + void _compactExtent(OperationContext* txn, + const DiskLoc diskloc, + int extentNumber, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* compactOptions, + CompactStats* stats ); + + bool _normalCollection; + + friend class SimpleRecordStoreV1Iterator; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp new file mode 100644 index 
00000000000..803b1494920 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp @@ -0,0 +1,130 @@ +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +namespace mongo { + + // + // Regular / non-capped collection traversal + // + + SimpleRecordStoreV1Iterator::SimpleRecordStoreV1Iterator(OperationContext* txn, + const SimpleRecordStoreV1* collection, + const DiskLoc& start, + const CollectionScanParams::Direction& dir) + : _txn(txn), _curr(start), _recordStore(collection), _direction(dir) { + + if (_curr.isNull()) { + + const ExtentManager* em = _recordStore->_extentManager; + + if ( _recordStore->details()->firstExtent(txn).isNull() ) { + // nothing in the collection + verify( _recordStore->details()->lastExtent(txn).isNull() ); + } + else if (CollectionScanParams::FORWARD == _direction) { + + // Find a non-empty extent and start with the first record in it. + Extent* e = em->getExtent( _recordStore->details()->firstExtent(txn) ); + + while (e->firstRecord.isNull() && !e->xnext.isNull()) { + e = em->getExtent( e->xnext ); + } + + // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no + // valid e->xnext + _curr = e->firstRecord; + } + else { + // Walk backwards, skipping empty extents, and use the last record in the first + // non-empty extent we see. + Extent* e = em->getExtent( _recordStore->details()->lastExtent(txn) ); + + // TODO ELABORATE + // Does one of e->lastRecord.isNull(), e.firstRecord.isNull() imply the other? 
+ while (e->lastRecord.isNull() && !e->xprev.isNull()) { + e = em->getExtent( e->xprev ); + } + + // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no + // valid e->xprev + _curr = e->lastRecord; + } + } + } + + bool SimpleRecordStoreV1Iterator::isEOF() { + return _curr.isNull(); + } + + DiskLoc SimpleRecordStoreV1Iterator::curr() { return _curr; } + + DiskLoc SimpleRecordStoreV1Iterator::getNext() { + DiskLoc ret = _curr; + + // Move to the next thing. + if (!isEOF()) { + if (CollectionScanParams::FORWARD == _direction) { + _curr = _recordStore->getNextRecord( _txn, _curr ); + } + else { + _curr = _recordStore->getPrevRecord( _txn, _curr ); + } + } + + return ret; + } + + void SimpleRecordStoreV1Iterator::invalidate(const DiskLoc& dl) { + // Just move past the thing being deleted. + if (dl == _curr) { + // We don't care about the return of getNext so much as the side effect of moving _curr + // to the 'next' thing. + getNext(); + } + } + + void SimpleRecordStoreV1Iterator::prepareToYield() { + } + + bool SimpleRecordStoreV1Iterator::recoverFromYield() { + // if the collection is dropped, then the cursor should be destroyed + return true; + } + + RecordData SimpleRecordStoreV1Iterator::dataFor( const DiskLoc& loc ) const { + return _recordStore->dataFor( loc ); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h new file mode 100644 index 00000000000..ded30a3ee1d --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h @@ -0,0 +1,73 @@ +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class SimpleRecordStoreV1; + + /** + * This class iterates over a non-capped collection identified by 'ns'. + * The collection must exist when the constructor is called. + * + * If start is not DiskLoc(), the iteration begins at that DiskLoc. 
+ */ + class SimpleRecordStoreV1Iterator : public RecordIterator { + public: + SimpleRecordStoreV1Iterator( OperationContext* txn, + const SimpleRecordStoreV1* records, + const DiskLoc& start, + const CollectionScanParams::Direction& dir ); + virtual ~SimpleRecordStoreV1Iterator() { } + + virtual bool isEOF(); + virtual DiskLoc getNext(); + virtual DiskLoc curr(); + + virtual void invalidate(const DiskLoc& dl); + virtual void prepareToYield(); + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + // for getNext, not owned + OperationContext* _txn; + + // The result returned on the next call to getNext(). + DiskLoc _curr; + + const SimpleRecordStoreV1* _recordStore; + + CollectionScanParams::Direction _direction; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp new file mode 100644 index 00000000000..31f17f42b28 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp @@ -0,0 +1,775 @@ +// record_store_v1_simple_test.cpp + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" +#include "mongo/unittest/unittest.h" + +using namespace mongo; + +namespace { + + // Provides data to be inserted. Must be large enough for largest possible record. + // Should be in BSS so unused portions should be free. 
+ char zeros[20*1024*1024] = {}; + + TEST( SimpleRecordStoreV1, quantizeAllocationSpaceSimple ) { + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(33), 36); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000), 1024); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10001), 10240); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(100000), 106496); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000001), 1048576); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10000000), 10223616); + } + + TEST( SimpleRecordStoreV1, quantizeAllocationMinMaxBound ) { + const int maxSize = 16 * 1024 * 1024; + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1), 2); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(maxSize), maxSize); + } + + /** + * Test Quantize record allocation on every boundary, as well as boundary-1 + * @see NamespaceDetails::quantizeAllocationSpace() + */ + TEST( SimpleRecordStoreV1, quantizeAllocationBoundary ) { + for (int iBucket = 0; iBucket <= RecordStoreV1Base::MaxBucket; ++iBucket) { + // for each bucket in range [min, max) + const int bucketSize = RecordStoreV1Base::bucketSizes[iBucket]; + const int prevBucketSize = + (iBucket - 1 >= 0) ? 
RecordStoreV1Base::bucketSizes[iBucket - 1] : 0; + const int intervalSize = bucketSize / 16; + for (int iBoundary = prevBucketSize; + iBoundary < bucketSize; + iBoundary += intervalSize) { + // for each quantization boundary within the bucket + for (int iSize = iBoundary - 1; iSize <= iBoundary; ++iSize) { + // test the quantization boundary - 1, and the boundary itself + const int quantized = + RecordStoreV1Base::quantizeAllocationSpace(iSize); + // assert quantized size is greater than or equal to requested size + ASSERT(quantized >= iSize); + // assert quantized size is within one quantization interval of + // the requested size + ASSERT(quantized - iSize <= intervalSize); + // assert quantization is an idempotent operation + ASSERT(quantized == + RecordStoreV1Base::quantizeAllocationSpace(quantized)); + } + } + } + } + + /** + * For buckets up to 4MB powerOf2 allocation should round up to next power of 2. It should be + * return the input unmodified if it is already a power of 2. + */ + TEST( SimpleRecordStoreV1, quantizePowerOf2Small ) { + // only tests buckets <= 4MB. Higher buckets quatize to 1MB even with powerOf2 + for (int bucket = 0; bucket < RecordStoreV1Base::MaxBucket; bucket++) { + const int size = RecordStoreV1Base::bucketSizes[bucket]; + const int nextSize = RecordStoreV1Base::bucketSizes[bucket + 1]; + + // size - 1 is quantized to size. + ASSERT_EQUALS( size, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( size - 1 ) ); + + // size is quantized to size. + ASSERT_EQUALS( size, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( size ) ); + + // size + 1 is quantized to nextSize (unless > 4MB which is covered by next test) + if (size < 4*1024*1024) { + ASSERT_EQUALS( nextSize, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( size + 1 ) ); + } + } + } + + /** + * Within the largest bucket, quantizePowerOf2AllocationSpace quantizes to the nearest + * megabyte boundary. 
+ */ + TEST( SimpleRecordStoreV1, SimpleRecordLargePowerOf2ToMegabyteBoundary ) { + // Iterate iSize over all 1mb boundaries from the size of the next to largest bucket + // to the size of the largest bucket + 1mb. + for( int iSize = RecordStoreV1Base::bucketSizes[ RecordStoreV1Base::MaxBucket - 1 ]; + iSize <= RecordStoreV1Base::bucketSizes[ RecordStoreV1Base::MaxBucket ] + 0x100000; + iSize += 0x100000 ) { + + // iSize - 1 is quantized to iSize. + ASSERT_EQUALS( iSize, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize - 1 ) ); + + // iSize is quantized to iSize. + ASSERT_EQUALS( iSize, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize ) ); + + // iSize + 1 is quantized to iSize + 1mb. + ASSERT_EQUALS( iSize + 0x100000, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize + 1 ) ); + } + } + + BSONObj docForRecordSize( int size ) { + BSONObjBuilder b; + b.append( "_id", 5 ); + b.append( "x", string( size - Record::HeaderSize - 22, 'x' ) ); + BSONObj x = b.obj(); + ASSERT_EQUALS( Record::HeaderSize + x.objsize(), size ); + return x; + } + + /** alloc() quantizes the requested size using quantizeAllocationSpace() rules. */ + TEST(SimpleRecordStoreV1, AllocQuantized) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + + string myns = "test.AllocQuantized"; + SimpleRecordStoreV1 rs( &txn, myns, md, &em, false ); + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false); + ASSERT( result.isOK() ); + + // The length of the allocated record is quantized. + ASSERT_EQUALS( 320, rs.dataFor( result.getValue() ).size() + Record::HeaderSize ); + } + + /** + * alloc() does not quantize records in index collections using quantizeAllocationSpace() + * rules. 
+ */ + TEST(SimpleRecordStoreV1, AllocIndexNamespaceNotQuantized) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + + string myns = "test.AllocIndexNamespaceNotQuantized"; + SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false ); + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> result = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT( result.isOK() ); + + // The length of the allocated record is not quantized. + ASSERT_EQUALS( 300, rs.dataFor( result.getValue() ).size() + Record::HeaderSize ); + + } + + /** alloc() quantizes records in index collections to the nearest multiple of 4. */ + TEST(SimpleRecordStoreV1, AllocIndexNamespaceSlightlyQuantized) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + + string myns = "test.AllocIndexNamespaceNotQuantized"; + SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false ); + + BSONObj obj = docForRecordSize( 298 ); + StatusWith<DiskLoc> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false); + ASSERT( result.isOK() ); + + ASSERT_EQUALS( 300, rs.dataFor( result.getValue() ).size() + Record::HeaderSize ); + } + + /** alloc() returns a non quantized record larger than the requested size. 
*/ + TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecord) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 310}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 310}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** alloc() returns a non quantized record equal to the requested size. */ + TEST(SimpleRecordStoreV1, AllocExactSizeNonQuantizedDeletedRecord) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 300}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 300}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * alloc() returns a non quantized record equal to the quantized size plus some extra space + * too small to make a DeletedRecord. 
+ */ + TEST(SimpleRecordStoreV1, AllocQuantizedWithExtra) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 343}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 343}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * alloc() returns a quantized record when the extra space in the reclaimed deleted record + * is large enough to form a new deleted record. + */ + TEST(SimpleRecordStoreV1, AllocQuantizedWithoutExtra) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 344}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + // The returned record is quantized from 300 to 320. + {DiskLoc(0, 1000), 320}, + {} + }; + LocAndSize drecs[] = { + // A new 24 byte deleted record is split off. + {DiskLoc(0, 1320), 24}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * A non quantized deleted record within 1/8 of the requested size is returned as is, even + * if a quantized portion of the deleted record could be used instead. 
+ */ + TEST(SimpleRecordStoreV1, AllocNotQuantizedNearDeletedSize) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 344}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 319 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + // Even though 319 would be quantized to 320 and 344 - 320 == 24 could become a new + // deleted record, the entire deleted record is returned because + // ( 344 - 320 ) < ( 320 / 8 ). + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 344}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** getRecordAllocationSize() returns its argument when the padding factor is 1.0. */ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizeNoPadding) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + ASSERT_EQUALS( 1.0, md->paddingFactor() ); + ASSERT_EQUALS( 300, rs.getRecordAllocationSize( 300 ) ); + } + + /** getRecordAllocationSize() multiplies by a padding factor > 1.0. 
*/ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizeWithPadding) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + double paddingFactor = 1.2; + md->setPaddingFactor( &txn, paddingFactor ); + ASSERT_EQUALS( paddingFactor, md->paddingFactor() ); + ASSERT_EQUALS( int(300 * paddingFactor), rs.getRecordAllocationSize( 300 ) ); + } + + /** + * getRecordAllocationSize() quantizes to the nearest power of 2 when Flag_UsePowerOf2Sizes + * is set. + */ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizePowerOf2) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( + false, + RecordStoreV1Base::Flag_UsePowerOf2Sizes ); + + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + ASSERT_EQUALS( 512, rs.getRecordAllocationSize( 300 ) ); + } + + /** + * getRecordAllocationSize() quantizes to the nearest power of 2 when Flag_UsePowerOf2Sizes + * is set, ignoring the padding factor. 
+ */ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizePowerOf2PaddingIgnored) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( + false, + RecordStoreV1Base::Flag_UsePowerOf2Sizes ); + + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + md->setPaddingFactor( &txn, 2.0 ); + ASSERT_EQUALS( 2.0, md->paddingFactor() ); + ASSERT_EQUALS( 512, rs.getRecordAllocationSize( 300 ) ); + } + + + // ----------------- + + TEST( SimpleRecordStoreV1, FullSimple1 ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, + "test.foo", + md, + &em, + false ); + + + ASSERT_EQUALS( 0, md->numRecords() ); + StatusWith<DiskLoc> result = rs.insertRecord( &txn, "abc", 4, 1000 ); + ASSERT_TRUE( result.isOK() ); + ASSERT_EQUALS( 1, md->numRecords() ); + RecordData recordData = rs.dataFor( result.getValue() ); + ASSERT_EQUALS( string("abc"), string(recordData.data()) ); + } + + // ---------------- + + /** + * Inserts take the first deleted record with the correct size. 
+ */ + TEST( SimpleRecordStoreV1, InsertTakesFirstDeletedWithExactSize ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(2, 1100), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1200), 100}, // this one will be used + {DiskLoc(2, 1000), 100}, + {DiskLoc(1, 1000), 1000}, + {} + }; + + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1200), 100}, // this is the new record + {DiskLoc(2, 1100), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(2, 1000), 100}, + {DiskLoc(1, 1000), 1000}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * Test that we keep looking for better matches for 5 links once we find a non-exact match. + * This "extra" scanning does not proceed into bigger buckets. + * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents. + */ + TEST( SimpleRecordStoreV1, InsertLooksForBetterMatchUpTo5Links ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {} + }; + LocAndSize drecs[] = { + // This intentionally leaves gaps to keep locs readable. 
+ {DiskLoc(0, 1000), 75}, // too small + {DiskLoc(0, 1100), 100}, // 1st big enough: will be first record + {DiskLoc(0, 1200), 100}, // 2nd: will be third record + {DiskLoc(0, 1300), 100}, // 3rd + {DiskLoc(0, 1400), 100}, // 4th + {DiskLoc(0, 1500), 100}, // 5th: first and third will stop once they look here + {DiskLoc(0, 1600), 80}, // 6th: second will make it here and use this + {DiskLoc(0, 1700), 999}, // bigger bucket. Should never look here + {} + }; + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1100), 100}, // 1st insert + {DiskLoc(0, 1600), 80}, // 2nd insert + {DiskLoc(0, 1200), 100}, // 3rd insert + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 75}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {DiskLoc(0, 1500), 100}, + {DiskLoc(0, 1700), 999}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * Test that we stop looking in a bucket once we see 31 too small drecs. + * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents. + */ + TEST( SimpleRecordStoreV1, InsertLooksForMatchUpTo31Links ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {} + }; + LocAndSize drecs[] = { + // This intentionally leaves gaps to keep locs readable. 
+ {DiskLoc(0, 1000), 50}, // different bucket + + {DiskLoc(0, 1100), 75}, // 1st too small in correct bucket + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, // 10th too small + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, // 20th too small + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, + {DiskLoc(0, 4000), 75}, // 30th too small + {DiskLoc(0, 4100), 75}, // 31st too small + + {DiskLoc(0, 8000), 80}, // big enough but wont be seen until we take an earlier one + {DiskLoc(0, 9000), 140}, // bigger bucket. 
jumps here after seeing 31 drecs + {} + }; + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); // takes from bigger bucket + rs.insertRecord(&txn, zeros, 70 - Record::HeaderSize, false); // removes a 75-sized drec + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); // now sees big-enough drec + + { + LocAndSize recs[] = { + {DiskLoc(0, 9000), 80}, // 1st insert went here + {DiskLoc(0, 1100), 75}, // 2nd here + {DiskLoc(0, 8000), 80}, // 3rd here + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 9000 + 80), 140 - 80}, // split off during first insert + {DiskLoc(0, 1000), 50}, + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, + {DiskLoc(0, 4000), 75}, + {DiskLoc(0, 4100), 75}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * Test that we stop looking in a bucket once we see 31 drecs, or look 4-past the first + * too-large match, whichever comes first. This is a combination of + * InsertLooksForBetterMatchUpTo5Links and InsertLooksForMatchUpTo31Links. + * + * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents. 
+ */ + TEST( SimpleRecordStoreV1, InsertLooksForMatchUpTo31LinksEvenIfFoundOversizedFit ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {} + }; + LocAndSize drecs[] = { + // This intentionally leaves gaps to keep locs readable. + {DiskLoc(0, 1000), 50}, // different bucket + + {DiskLoc(0, 1100), 75}, // 1st too small in correct bucket + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, // 10th too small + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, // 20th too small + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, // 27th too small + + {DiskLoc(0, 7000), 95}, // 1st insert takes this + {DiskLoc(0, 7100), 95}, // 3rd insert takes this + + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, // 29th too small (31st overall) + + {DiskLoc(0, 8000), 80}, // exact match. taken by 2nd insert + + {DiskLoc(0, 9000), 140}, // bigger bucket. 
Should never get here + {} + }; + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 7000), 95}, // 1st insert went here + {DiskLoc(0, 8000), 80}, // 2nd here + {DiskLoc(0, 7100), 95}, // 3rd here + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 50}, + {DiskLoc(0, 1100), 75}, + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, + {DiskLoc(0, 9000), 140}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp new file mode 100644 index 00000000000..3ea4298332f --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp @@ -0,0 +1,608 @@ +// record_store_v1_test_help.cpp + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" + +#include <algorithm> +#include <map> +#include <set> +#include <vector> + +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + DummyRecordStoreV1MetaData::DummyRecordStoreV1MetaData( bool capped, int userFlags ) { + _dataSize = 0; + _numRecords = 0; + _capped = capped; + _userFlags = userFlags; + _lastExtentSize = 0; + _paddingFactor = 1; + _maxCappedDocs = numeric_limits<long long>::max(); + _capFirstNewRecord.setInvalid(); + if ( _capped ) { + // copied from NamespaceDetails::NamespaceDetails() + setDeletedListEntry( NULL, 1, DiskLoc().setInvalid() ); + } + } + + const DiskLoc& DummyRecordStoreV1MetaData::capExtent() const { + return _capExtent; + } + + void DummyRecordStoreV1MetaData::setCapExtent( OperationContext* txn, + const DiskLoc& loc ) { + _capExtent = loc; + } + + const DiskLoc& DummyRecordStoreV1MetaData::capFirstNewRecord() const { + return _capFirstNewRecord; + } + + void DummyRecordStoreV1MetaData::setCapFirstNewRecord( OperationContext* txn, + const DiskLoc& loc ) { + _capFirstNewRecord = loc; + } + + long long DummyRecordStoreV1MetaData::dataSize() const { + return _dataSize; + } + + long long DummyRecordStoreV1MetaData::numRecords() const { + return _numRecords; + } + + void DummyRecordStoreV1MetaData::incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) { + _dataSize += dataSizeIncrement; + _numRecords += numRecordsIncrement; + } + + void DummyRecordStoreV1MetaData::setStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) { + _dataSize = dataSizeIncrement; + _numRecords = numRecordsIncrement; + } + + namespace { + DiskLoc myNull; + } + + const DiskLoc& DummyRecordStoreV1MetaData::deletedListEntry( int bucket ) const { + invariant( bucket >= 0 ); + if ( static_cast<size_t>( bucket ) 
>= _deletedLists.size() ) + return myNull; + return _deletedLists[bucket]; + } + + void DummyRecordStoreV1MetaData::setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ) { + invariant( bucket >= 0 ); + invariant( bucket < 1000 ); + while ( static_cast<size_t>( bucket ) >= _deletedLists.size() ) + _deletedLists.push_back( DiskLoc() ); + _deletedLists[bucket] = loc; + } + + void DummyRecordStoreV1MetaData::orphanDeletedList(OperationContext* txn) { + invariant( false ); + } + + const DiskLoc& DummyRecordStoreV1MetaData::firstExtent(OperationContext* txn) const { + return _firstExtent; + } + + void DummyRecordStoreV1MetaData::setFirstExtent( OperationContext* txn, + const DiskLoc& loc ) { + _firstExtent = loc; + } + + const DiskLoc& DummyRecordStoreV1MetaData::lastExtent(OperationContext* txn) const { + return _lastExtent; + } + + void DummyRecordStoreV1MetaData::setLastExtent( OperationContext* txn, + const DiskLoc& loc ) { + _lastExtent = loc; + } + + bool DummyRecordStoreV1MetaData::isCapped() const { + return _capped; + } + + bool DummyRecordStoreV1MetaData::isUserFlagSet( int flag ) const { + return _userFlags & flag; + } + + bool DummyRecordStoreV1MetaData::setUserFlag( OperationContext* txn, int flag ) { + if ( ( _userFlags & flag ) == flag ) + return false; + + _userFlags |= flag; + return true; + + } + bool DummyRecordStoreV1MetaData::clearUserFlag( OperationContext* txn, int flag ) { + if ( ( _userFlags & flag ) == 0 ) + return false; + + _userFlags &= ~flag; + return true; + + } + bool DummyRecordStoreV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) { + if ( _userFlags == flags ) + return false; + _userFlags = flags; + return true; + } + + + int DummyRecordStoreV1MetaData::lastExtentSize(OperationContext* txn) const { + return _lastExtentSize; + } + + void DummyRecordStoreV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) { + _lastExtentSize = newMax; + } + + long long 
DummyRecordStoreV1MetaData::maxCappedDocs() const {
+        return _maxCappedDocs;
+    }
+
+    double DummyRecordStoreV1MetaData::paddingFactor() const {
+        return _paddingFactor;
+    }
+
+    void DummyRecordStoreV1MetaData::setPaddingFactor( OperationContext* txn,
+                                                       double paddingFactor ) {
+        _paddingFactor = paddingFactor;
+    }
+
+    // -----------------------------------------
+
+    // Frees the malloc'd buffer backing every extent this manager handed out via
+    // allocateExtent().
+    DummyExtentManager::~DummyExtentManager() {
+        for ( size_t i = 0; i < _extents.size(); i++ ) {
+            if ( _extents[i].data )
+                free( _extents[i].data );
+        }
+    }
+
+    Status DummyExtentManager::init(OperationContext* txn) {
+        return Status::OK();
+    }
+
+    int DummyExtentManager::numFiles() const {
+        return static_cast<int>( _extents.size() );
+    }
+
+    // Not supported by the dummy; tests must not call this.
+    long long DummyExtentManager::fileSize() const {
+        invariant( false );
+        return -1;
+    }
+
+    // Allocates a malloc-backed in-memory extent and initializes its header.
+    // Each extent gets its own "file" number: loc.a() is its index in _extents and
+    // the offset is always 0. initializeV1RS() depends on both details.
+    DiskLoc DummyExtentManager::allocateExtent( OperationContext* txn,
+                                                bool capped,
+                                                int size,
+                                                bool enforceQuota ) {
+        size = quantizeExtentSize( size );
+
+        ExtentInfo info;
+        info.data = static_cast<char*>( malloc( size ) );
+        info.length = size;
+
+        DiskLoc loc( _extents.size(), 0 );
+        _extents.push_back( info );
+
+        Extent* e = getExtent( loc, false );
+        e->magic = Extent::extentSignature;
+        e->myLoc = loc;
+        e->xnext.Null();
+        e->xprev.Null();
+        e->length = size;
+        e->firstRecord.Null();
+        e->lastRecord.Null();
+
+        return loc;
+
+    }
+
+    // No-op: the dummy never recycles extents; memory is reclaimed in the destructor.
+    void DummyExtentManager::freeExtents( OperationContext* txn,
+                                          DiskLoc firstExt, DiskLoc lastExt ) {
+        // XXX
+    }
+
+    // No-op: see freeExtents().
+    void DummyExtentManager::freeExtent( OperationContext* txn, DiskLoc extent ) {
+        // XXX
+    }
+    // Not supported by the dummy; tests must not call this.
+    void DummyExtentManager::freeListStats( int* numExtents, int64_t* totalFreeSize ) const {
+        invariant( false );
+    }
+
+    // Translates a DiskLoc into a raw pointer inside the owning extent's buffer,
+    // asserting that both the extent number and the offset are in range.
+    Record* DummyExtentManager::recordForV1( const DiskLoc& loc ) const {
+        invariant( static_cast<size_t>( loc.a() ) < _extents.size() );
+        invariant( static_cast<size_t>( loc.getOfs() ) < _extents[loc.a()].length );
+        char* root = _extents[loc.a()].data;
+        return reinterpret_cast<Record*>( root + loc.getOfs() );
+ } + + Extent* DummyExtentManager::extentForV1( const DiskLoc& loc ) const { + invariant( false ); + } + + DiskLoc DummyExtentManager::extentLocForV1( const DiskLoc& loc ) const { + return DiskLoc( loc.a(), 0 ); + } + + Extent* DummyExtentManager::getExtent( const DiskLoc& loc, bool doSanityCheck ) const { + invariant( !loc.isNull() ); + invariant( static_cast<size_t>( loc.a() ) < _extents.size() ); + invariant( loc.getOfs() == 0 ); + Extent* ext = reinterpret_cast<Extent*>( _extents[loc.a()].data ); + if (doSanityCheck) + ext->assertOk(); + return ext; + } + + int DummyExtentManager::maxSize() const { + return 1024 * 1024 * 64; + } + + DummyExtentManager::CacheHint* DummyExtentManager::cacheHint( const DiskLoc& extentLoc, const HintType& hint ) { + return new CacheHint(); + } + +namespace { + void accumulateExtentSizeRequirements(const LocAndSize* las, std::map<int, size_t>* sizes) { + if (!las) + return; + + while (!las->loc.isNull()) { + // We require passed in offsets to be > 1000 to leave room for Extent headers. + invariant(Extent::HeaderSize() < 1000); + invariant(las->loc.getOfs() >= 1000); + + const size_t end = las->loc.getOfs() + las->size; + size_t& sizeNeeded = (*sizes)[las->loc.a()]; + sizeNeeded = std::max(sizeNeeded, end); + las++; + } + } + + void printRecList(OperationContext* txn, + const ExtentManager* em, + const RecordStoreV1MetaData* md) { + log() << " *** BEGIN ACTUAL RECORD LIST *** "; + DiskLoc extLoc = md->firstExtent(txn); + std::set<DiskLoc> seenLocs; + while (!extLoc.isNull()) { + Extent* ext = em->getExtent(extLoc, true); + DiskLoc actualLoc = ext->firstRecord; + while (!actualLoc.isNull()) { + const Record* actualRec = em->recordForV1(actualLoc); + const int actualSize = actualRec->lengthWithHeaders(); + + log() << "loc: " << actualLoc // <--hex + << " (" << actualLoc.getOfs() << ")" + << " size: " << actualSize + << " prev: " << actualRec->prevOfs() + << " next: " << actualRec->nextOfs() + << (actualLoc == md->capFirstNewRecord() ? 
" (CAP_FIRST_NEW)" : "") + ; + + const bool foundCycle = !seenLocs.insert(actualLoc).second; + invariant(!foundCycle); + + const int nextOfs = actualRec->nextOfs(); + actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc() + : DiskLoc(actualLoc.a(), nextOfs)); + } + extLoc = ext->xnext; + } + log() << " *** END ACTUAL RECORD LIST *** "; + } + + void printDRecList(const ExtentManager* em, const RecordStoreV1MetaData* md) { + log() << " *** BEGIN ACTUAL DELETED RECORD LIST *** "; + std::set<DiskLoc> seenLocs; + for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) { + DiskLoc actualLoc = md->deletedListEntry(bucketIdx); + while (!actualLoc.isNull()) { + const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); + const int actualSize = actualDrec->lengthWithHeaders(); + + log() << "loc: " << actualLoc // <--hex + << " (" << actualLoc.getOfs() << ")" + << " size: " << actualSize + << " bucket: " << bucketIdx + << " next: " << actualDrec->nextDeleted(); + + const bool foundCycle = !seenLocs.insert(actualLoc).second; + invariant(!foundCycle); + + actualLoc = actualDrec->nextDeleted(); + } + + // Only print bucket 0 in capped collections since it contains all deleted records + if (md->isCapped()) + break; + } + log() << " *** END ACTUAL DELETED RECORD LIST *** "; + } +} + + void initializeV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + DummyExtentManager* em, + DummyRecordStoreV1MetaData* md) { + invariant(records || drecs); // if both are NULL nothing is being created... 
+ + // Need to start with a blank slate + invariant(em->numFiles() == 0); + invariant(md->firstExtent(txn).isNull()); + + // pre-allocate extents (even extents that aren't part of this RS) + { + typedef std::map<int, size_t> ExtentSizes; + ExtentSizes extentSizes; + accumulateExtentSizeRequirements(records, &extentSizes); + accumulateExtentSizeRequirements(drecs, &extentSizes); + invariant(!extentSizes.empty()); + + const int maxExtent = extentSizes.rbegin()->first; + for (int i = 0; i <= maxExtent; i++) { + const size_t size = extentSizes.count(i) ? extentSizes[i] : 0; + const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, 0); + + // This function and assertState depend on these details of DummyExtentManager + invariant(loc.a() == i); + invariant(loc.getOfs() == 0); + } + + // link together extents that should be part of this RS + md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0)); + md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0)); + for (ExtentSizes::iterator it = extentSizes.begin(); + boost::next(it) != extentSizes.end(); /* ++it */ ) { + const int a = it->first; + ++it; + const int b = it->first; + em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0); + em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0); + } + + // This signals "done allocating new extents". 
+            if (md->isCapped())
+                md->setDeletedListEntry(txn, 1, DiskLoc());
+        }
+
+        // Build the record chains: walk each extent and thread the records that
+        // belong to it into a doubly-linked list via prevOfs/nextOfs.
+        if (records && !records[0].loc.isNull()) {
+            int recIdx = 0;
+            DiskLoc extLoc = md->firstExtent(txn);
+            while (!extLoc.isNull()) {
+                Extent* ext = em->getExtent(extLoc);
+                int prevOfs = DiskLoc::NullOfs;
+                while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
+                    const DiskLoc loc = records[recIdx].loc;
+                    const int size = records[recIdx].size;
+                    invariant(size >= Record::HeaderSize);
+
+                    md->incrementStats(txn, size - Record::HeaderSize, 1);
+
+                    if (ext->firstRecord.isNull())
+                        ext->firstRecord = loc;
+
+                    Record* rec = em->recordForV1(loc);
+                    rec->lengthWithHeaders() = size;
+                    rec->extentOfs() = 0;
+
+                    rec->prevOfs() = prevOfs;
+                    prevOfs = loc.getOfs();
+
+                    // Safe: the array is terminated by a Null-loc sentinel, so
+                    // recIdx + 1 is always a valid index here.
+                    const DiskLoc nextLoc = records[recIdx + 1].loc;
+                    if (nextLoc.a() == loc.a()) { // if next is in same extent
+                        rec->nextOfs() = nextLoc.getOfs();
+                    }
+                    else {
+                        rec->nextOfs() = DiskLoc::NullOfs;
+                        ext->lastRecord = loc;
+                    }
+
+                    recIdx++;
+                }
+                extLoc = ext->xnext;
+            }
+            invariant(records[recIdx].loc.isNull());
+        }
+
+        // Build the deleted-record chains. Simple collections use one list per size
+        // bucket; capped collections thread everything into bucket 0.
+        if (drecs && !drecs[0].loc.isNull()) {
+            int drecIdx = 0;
+            DiskLoc* prevNextPtr = NULL;
+            int lastBucket = -1;
+            while (!drecs[drecIdx].loc.isNull()) {
+                const DiskLoc loc = drecs[drecIdx].loc;
+                const int size = drecs[drecIdx].size;
+                invariant(size >= Record::HeaderSize);
+                const int bucket = RecordStoreV1Base::bucket(size);
+
+                if (md->isCapped()) {
+                    // All drecs form a single list in bucket 0
+                    if (prevNextPtr == NULL) {
+                        md->setDeletedListEntry(txn, 0, loc);
+                    }
+                    else {
+                        *prevNextPtr = loc;
+                    }
+
+                    if (loc.a() < md->capExtent().a()
+                            && drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
+                        // Bucket 1 is known as cappedLastDelRecLastExtent
+                        md->setDeletedListEntry(txn, 1, loc);
+                    }
+                }
+                else if (bucket != lastBucket) {
+                    invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
+                    md->setDeletedListEntry(txn, bucket, loc);
+                    lastBucket = bucket;
+                }
+                else {
*prevNextPtr = loc; + } + + DeletedRecord* drec = &em->recordForV1(loc)->asDeleted(); + drec->lengthWithHeaders() = size; + drec->extentOfs() = 0; + drec->nextDeleted() = DiskLoc(); + prevNextPtr = &drec->nextDeleted(); + + drecIdx++; + } + } + + // Make sure we set everything up as requested. + assertStateV1RS(txn, records, drecs, em, md); + } + + void assertStateV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + const ExtentManager* em, + const DummyRecordStoreV1MetaData* md) { + invariant(records || drecs); // if both are NULL nothing is being asserted... + + try { + if (records) { + long long dataSize = 0; + long long numRecs = 0; + + int recIdx = 0; + + DiskLoc extLoc = md->firstExtent(txn); + while (!extLoc.isNull()) { // for each Extent + Extent* ext = em->getExtent(extLoc, true); + int expectedPrevOfs = DiskLoc::NullOfs; + DiskLoc actualLoc = ext->firstRecord; + while (!actualLoc.isNull()) { // for each Record in this Extent + const Record* actualRec = em->recordForV1(actualLoc); + const int actualSize = actualRec->lengthWithHeaders(); + + dataSize += actualSize - Record::HeaderSize; + numRecs += 1; + + ASSERT_EQUALS(actualLoc, records[recIdx].loc); + ASSERT_EQUALS(actualSize, records[recIdx].size); + + ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs()); + ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs); + expectedPrevOfs = actualLoc.getOfs(); + + recIdx++; + const int nextOfs = actualRec->nextOfs(); + actualLoc = (nextOfs == DiskLoc::NullOfs ? 
DiskLoc() + : DiskLoc(actualLoc.a(), nextOfs)); + } + + if (ext->xnext.isNull()) { + ASSERT_EQUALS(md->lastExtent(txn), extLoc); + } + + extLoc = ext->xnext; + } + + // both the expected and actual record lists must be done at this point + ASSERT_EQUALS(records[recIdx].loc, DiskLoc()); + + ASSERT_EQUALS(dataSize, md->dataSize()); + ASSERT_EQUALS(numRecs, md->numRecords()); + } + + if (drecs) { + int drecIdx = 0; + for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) { + DiskLoc actualLoc = md->deletedListEntry(bucketIdx); + + if (md->isCapped() && bucketIdx == 1) { + // In capped collections, the 2nd bucket (index 1) points to the drec before + // the first drec in the capExtent. If the capExtent is the first Extent, + // it should be Null. + + if (md->capExtent() == md->firstExtent(txn)) { + ASSERT_EQUALS(actualLoc, DiskLoc()); + } + else { + ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a()); + const DeletedRecord* actualDrec = + &em->recordForV1(actualLoc)->asDeleted(); + ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a()); + } + + // Don't do normal checking of bucket 1 in capped collections. Checking + // other buckets to verify that they are Null. + continue; + } + + while (!actualLoc.isNull()) { + const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); + const int actualSize = actualDrec->lengthWithHeaders(); + + ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc); + ASSERT_EQUALS(actualSize, drecs[drecIdx].size); + + // Make sure the drec is correct + ASSERT_EQUALS(actualDrec->extentOfs(), 0); + + // in capped collections all drecs are linked into a single list in bucket 0 + ASSERT_EQUALS(bucketIdx, md->isCapped() + ? 0 + : RecordStoreV1Base::bucket(actualSize)); + + drecIdx++; + actualLoc = actualDrec->nextDeleted(); + } + } + // both the expected and actual deleted lists must be done at this point + ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc()); + } + } + catch (...) 
{ + // If a test fails, provide extra info to make debugging easier + printRecList(txn, em, md); + printDRecList(em, md); + throw; + } + } +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h new file mode 100644 index 00000000000..87ddc078b6d --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h @@ -0,0 +1,198 @@ +// record_store_v1_test_help.h + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#pragma once + +#include <vector> + +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class DummyRecordStoreV1MetaData : public RecordStoreV1MetaData { + public: + DummyRecordStoreV1MetaData( bool capped, int userFlags ); + virtual ~DummyRecordStoreV1MetaData(){} + + virtual const DiskLoc& capExtent() const; + virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& capFirstNewRecord() const; + virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ); + + virtual long long dataSize() const; + virtual long long numRecords() const; + + virtual void incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ); + + virtual void setStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ); + + virtual const DiskLoc& deletedListEntry( int bucket ) const; + virtual void setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ); + virtual void orphanDeletedList(OperationContext* txn); + + virtual const DiskLoc& firstExtent( OperationContext* txn ) const; + virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& lastExtent( OperationContext* txn ) const; + virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual bool isCapped() const; + + virtual bool isUserFlagSet( int flag ) const; + virtual int userFlags() const { return _userFlags; } + virtual bool setUserFlag( OperationContext* txn, int flag ); + virtual bool clearUserFlag( OperationContext* txn, int flag ); + virtual bool replaceUserFlags( OperationContext* txn, int flags ); + + + virtual int lastExtentSize( OperationContext* txn ) const; + virtual void setLastExtentSize( OperationContext* txn, int newMax ); + + virtual long long maxCappedDocs() const; + + virtual double paddingFactor() 
const;
+
+        virtual void setPaddingFactor( OperationContext* txn, double paddingFactor );
+
+    protected:
+
+        DiskLoc _capExtent;
+        DiskLoc _capFirstNewRecord;
+
+        long long _dataSize;
+        long long _numRecords;
+
+        DiskLoc _firstExtent;
+        DiskLoc _lastExtent;
+
+        bool _capped;
+        int _userFlags;
+        long long _maxCappedDocs;
+
+        int _lastExtentSize;
+        double _paddingFactor;
+
+        std::vector<DiskLoc> _deletedLists;
+    };
+
+    // In-memory ExtentManager for unit tests: each extent is a malloc'd buffer and
+    // gets its own "file" number. Operations the tests never need hit invariant(false).
+    class DummyExtentManager : public ExtentManager {
+    public:
+        virtual ~DummyExtentManager();
+
+        virtual Status init(OperationContext* txn);
+
+        virtual int numFiles() const;
+        virtual long long fileSize() const;
+
+        virtual DiskLoc allocateExtent( OperationContext* txn,
+                                        bool capped,
+                                        int size,
+                                        bool enforceQuota );
+
+        virtual void freeExtents( OperationContext* txn,
+                                  DiskLoc firstExt, DiskLoc lastExt );
+
+        virtual void freeExtent( OperationContext* txn, DiskLoc extent );
+
+        virtual void freeListStats( int* numExtents, int64_t* totalFreeSize ) const;
+
+        virtual Record* recordForV1( const DiskLoc& loc ) const;
+
+        virtual Extent* extentForV1( const DiskLoc& loc ) const;
+
+        virtual DiskLoc extentLocForV1( const DiskLoc& loc ) const;
+
+        virtual Extent* getExtent( const DiskLoc& loc, bool doSanityCheck = true ) const;
+
+        virtual int maxSize() const;
+
+        virtual CacheHint* cacheHint( const DiskLoc& extentLoc, const HintType& hint );
+
+    protected:
+        // One malloc'd extent buffer and its length, indexed by "file" number.
+        struct ExtentInfo {
+            char* data;
+            size_t length;
+        };
+
+        std::vector<ExtentInfo> _extents;
+    };
+
+    // A (location, total size including headers) pair used to describe the desired
+    // layout of records and deleted records in the test helpers below.
+    struct LocAndSize {
+        DiskLoc loc;
+        int size; // with headers
+    };
+
+    /**
+     * Creates a V1 storage/mmap_v1 with the passed in records and DeletedRecords (drecs).
+     *
+     * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer is shorthand for
+     * an empty list. Each extent gets its own DiskLoc file number. DiskLoc Offsets must be > 1000.
+     *
+     * records must be sorted by extent/file. offsets within an extent can be in any order.
+ * + * In a simple RS, drecs must be grouped into size-buckets, but the ordering within the size + * buckets is up to you. + * + * In a capped collection, all drecs form a single list and must be grouped by extent, with each + * extent having at least one drec. capFirstNewRecord() and capExtent() *must* be correctly set + * on md before calling. + * + * You are responsible for ensuring the records and drecs don't overlap. + * + * ExtentManager and MetaData must both be empty. + */ + void initializeV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + DummyExtentManager* em, + DummyRecordStoreV1MetaData* md); + + /** + * Asserts that the V1RecordStore defined by md has the passed in records and drecs in the + * correct order. + * + * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer means don't check + * that list. + */ + void assertStateV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + const ExtentManager* em, + const DummyRecordStoreV1MetaData* md); + +} // namespace mongo diff --git a/src/mongo/db/storage/record_store.h b/src/mongo/db/storage/record_store.h new file mode 100644 index 00000000000..8437046e5d6 --- /dev/null +++ b/src/mongo/db/storage/record_store.h @@ -0,0 +1,291 @@ +// record_store.h + +/** +* Copyright (C) 2013 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/base/owned_pointer_vector.h" +#include "mongo/bson/mutable/damage_vector.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/exec/collection_scan_common.h" +#include "mongo/db/storage/record_data.h" + +namespace mongo { + + class CappedDocumentDeleteCallback; + class Collection; + struct CompactOptions; + struct CompactStats; + class DocWriter; + class MAdvise; + class NamespaceDetails; + class OperationContext; + class Record; + + class RecordStoreCompactAdaptor; + class RecordStore; + + struct ValidateResults; + class ValidateAdaptor; + + /** + * Allows inserting a Record "in-place" without creating a copy ahead of time. 
+ */ + class DocWriter { + public: + virtual ~DocWriter() {} + virtual void writeDocument( char* buf ) const = 0; + virtual size_t documentSize() const = 0; + virtual bool addPadding() const { return true; } + }; + + /** + * @see RecordStore::updateRecord + */ + class UpdateMoveNotifier { + public: + virtual ~UpdateMoveNotifier(){} + virtual Status recordStoreGoingToMove( OperationContext* txn, + const DiskLoc& oldLocation, + const char* oldBuffer, + size_t oldSize ) = 0; + }; + + /** + * A RecordIterator provides an interface for walking over a RecordStore. + * The details of navigating the collection's structure are below this interface. + */ + class RecordIterator { + public: + virtual ~RecordIterator() { } + + // True if getNext will produce no more data, false otherwise. + virtual bool isEOF() = 0; + + // Return the DiskLoc that the iterator points at. Returns DiskLoc() if isEOF. + virtual DiskLoc curr() = 0; + + // Return the DiskLoc that the iterator points at and move the iterator to the next item + // from the collection. Returns DiskLoc() if isEOF. + virtual DiskLoc getNext() = 0; + + // Can only be called after prepareToYield and before recoverFromYield. + virtual void invalidate(const DiskLoc& dl) = 0; + + // Save any state required to resume operation (without crashing) after DiskLoc deletion or + // a collection drop. + virtual void prepareToYield() = 0; + + // Returns true if collection still exists, false otherwise. 
+ virtual bool recoverFromYield() = 0; + + // normally this will just go back to the RecordStore and convert + // but this gives the iterator an oppurtnity to optimize + virtual RecordData dataFor( const DiskLoc& loc ) const = 0; + }; + + + class RecordStore { + MONGO_DISALLOW_COPYING(RecordStore); + public: + RecordStore( const StringData& ns ) : _ns(ns.toString()) { } + + virtual ~RecordStore() { } + + // META + + // name of the RecordStore implementation + virtual const char* name() const = 0; + + virtual long long dataSize() const = 0; + + virtual long long numRecords() const = 0; + + virtual bool isCapped() const = 0; + + virtual void setCappedDeleteCallback(CappedDocumentDeleteCallback*) {invariant( false );} + + /** + * @param extraInfo - optional more debug info + * @param level - optional, level of debug info to put in (higher is more) + */ + virtual int64_t storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo = NULL, + int infoLevel = 0 ) const = 0; + + // CRUD related + + virtual RecordData dataFor( const DiskLoc& loc) const = 0; + + virtual void deleteRecord( OperationContext* txn, const DiskLoc& dl ) = 0; + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ) = 0; + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ) = 0; + + /** + * @param notifier - this is called if the document is moved + * it is to be called after the document has been written to new + * location, before deleted from old. 
+ * @return Status or DiskLoc, DiskLoc might be different + */ + virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ) = 0; + + virtual Status updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ) = 0; + /** + * returned iterator owned by caller + * canonical to get all would be + * getIterator( txn, DiskLoc(), false, CollectionScanParams::FORWARD ) + */ + virtual RecordIterator* getIterator( OperationContext* txn, + const DiskLoc& start = DiskLoc(), + bool tailable = false, + const CollectionScanParams::Direction& dir = + CollectionScanParams::FORWARD + ) const = 0; + + /** + * Constructs an iterator over a potentially corrupted store, which can be used to salvage + * damaged records. The iterator might return every record in the store if all of them + * are reachable and not corrupted. + */ + virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const = 0; + + /** + * Returns many iterators that partition the RecordStore into many disjoint sets. Iterating + * all returned iterators is equivalent to Iterating the full store. + */ + virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const = 0; + + // higher level + + + /** + * removes all Records + */ + virtual Status truncate( OperationContext* txn ) = 0; + + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. 
+ * @param inclusive - Truncate 'end' as well iff true + * XXX: this will go away soon, just needed to move for now + */ + virtual void temp_cappedTruncateAfter(OperationContext* txn, + DiskLoc end, + bool inclusive) = 0; + + // does this RecordStore support the compact operation + virtual bool compactSupported() const = 0; + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ) = 0; + + /** + * @param full - does more checks + * @param scanData - scans each document + * @return OK if the validate run successfully + * OK will be returned even if corruption is found + * deatils will be in result + */ + virtual Status validate( OperationContext* txn, + bool full, bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const = 0; + + /** + * @param scaleSize - amount by which to scale size metrics + * appends any custom stats from the RecordStore or other unique stats + */ + virtual void appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const = 0; + + /** + * Load all data into cache. + * What cache depends on implementation. 
+ * @param output (optional) - where to put detailed stats + */ + virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const = 0; + + /** + * @return Status::OK() if option hanlded + * InvalidOptions is option not supported + * other errors indicate option supported, but error setting + */ + virtual Status setCustomOption( OperationContext* txn, + const BSONElement& option, + BSONObjBuilder* info = NULL ) = 0; + protected: + std::string _ns; + }; + + class RecordStoreCompactAdaptor { + public: + virtual ~RecordStoreCompactAdaptor(){} + virtual bool isDataValid( const RecordData& recData ) = 0; + virtual size_t dataSize( const RecordData& recData ) = 0; + virtual void inserted( const RecordData& recData, const DiskLoc& newLocation ) = 0; + }; + + struct ValidateResults { + ValidateResults() { + valid = true; + } + bool valid; + std::vector<std::string> errors; + }; + + /** + * This is so when a RecordStore is validating all records + * it can call back to someone to check if a record is valid. + * The actual data contained in a Record is totally opaque to the implementation. 
+ */ + class ValidateAdaptor { + public: + virtual ~ValidateAdaptor(){} + + virtual Status validate( const RecordData& recordData, size_t* dataSize ) = 0; + }; +} diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl.cpp b/src/mongo/db/storage/rocks/rocks_btree_impl.cpp index 00cbbf1c580..8bd3f2734cf 100644 --- a/src/mongo/db/storage/rocks/rocks_btree_impl.cpp +++ b/src/mongo/db/storage/rocks/rocks_btree_impl.cpp @@ -87,7 +87,7 @@ namespace mongo { rocksdb::Slice sliced[2]; }; - class RocksCursor : public BtreeInterface::Cursor { + class RocksCursor : public SortedDataInterface::Cursor { public: RocksCursor( rocksdb::Iterator* iterator, bool direction ) : _iterator( iterator ), _direction( direction ), _cached( false ) { @@ -285,8 +285,8 @@ namespace mongo { return Status::OK(); } - BtreeInterface::Cursor* RocksBtreeImpl::newCursor(OperationContext* txn, - int direction) const { + SortedDataInterface::Cursor* RocksBtreeImpl::newCursor(OperationContext* txn, + int direction) const { return new RocksCursor( _db->NewIterator( rocksdb::ReadOptions(), _columnFamily ), txn, diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl.h b/src/mongo/db/storage/rocks/rocks_btree_impl.h index 2a15e46aad5..4e75ad50e11 100644 --- a/src/mongo/db/storage/rocks/rocks_btree_impl.h +++ b/src/mongo/db/storage/rocks/rocks_btree_impl.h @@ -28,7 +28,7 @@ * it in the license file. 
*/ -#include "mongo/db/structure/btree/btree_interface.h" +#include "mongo/db/storage/sorted_data_interface.h" #pragma once @@ -47,7 +47,7 @@ namespace mongo { virtual unsigned long long commit(bool mayInterrupt) = 0; }; - class RocksBtreeImpl : public BtreeInterface { + class RocksBtreeImpl : public SortedDataInterface { public: RocksBtreeImpl( rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf ); diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp b/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp index f7102163352..e080fb08faf 100644 --- a/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp +++ b/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp @@ -126,7 +126,7 @@ namespace mongo { DiskLoc loc( 5, 16 ); { - scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) ); + scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) ); ASSERT( !cursor->locate( key, loc ) ); } @@ -140,7 +140,7 @@ namespace mongo { } { - scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) ); + scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) ); ASSERT( cursor->locate( key, loc ) ); ASSERT_EQUALS( key, cursor->getKey() ); ASSERT_EQUALS( loc, cursor->getDiskLoc() ); @@ -166,7 +166,7 @@ namespace mongo { } { - scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) ); + scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) ); ASSERT( cursor->locate( BSON( "a" << 2 ), DiskLoc(0,0) ) ); ASSERT( !cursor->isEOF() ); ASSERT_EQUALS( BSON( "" << 2 ), cursor->getKey() ); diff --git a/src/mongo/db/storage/sorted_data_interface.h b/src/mongo/db/storage/sorted_data_interface.h new file mode 100644 index 00000000000..52f20a6288d --- /dev/null +++ b/src/mongo/db/storage/sorted_data_interface.h @@ -0,0 +1,200 @@ +/** + * Copyright (C) 2014 MongoDB Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/bson/ordering.h" +#include "mongo/db/catalog/head_manager.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/record_store.h" + +#pragma once + +namespace mongo { + + class BucketDeletionNotification; + class SortedDataBuilderInterface; + + /** + * This interface is a work in progress. Notes below: + * + * This interface began as the SortedDataInterface, a way to hide the fact that there were two + * on-disk formats for the btree. 
With the introduction of other storage engines, this + * interface was generalized to provide access to sorted data. Specifically: + * + * 1. Many other storage engines provide different Btree(-ish) implementations. This interface + * could allow those interfaces to avoid storing btree buckets in an already sorted structure. + * + * TODO: See if there is actually a performance gain. + * + * 2. The existing btree implementation is written to assume that if it modifies a record it is + * modifying the underlying record. This interface is an attempt to work around that. + * + * TODO: See if this actually works. + */ + class SortedDataInterface { + public: + virtual ~SortedDataInterface() { } + + // + // Data changes + // + + /** + * Caller owns returned pointer. + * 'this' must outlive the returned pointer. + */ + virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, + bool dupsAllowed) = 0; + + virtual Status insert(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc, + bool dupsAllowed) = 0; + + virtual bool unindex(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) = 0; + + // TODO: Hide this by exposing an update method? + virtual Status dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) = 0; + + // + // Information about the tree + // + + // TODO: expose full set of args for testing? + virtual void fullValidate(OperationContext* txn, long long* numKeysOut) = 0; + + virtual bool isEmpty() = 0; + + /** + * Attempt to bring whole index into memory. No-op is ok if not supported. + */ + virtual Status touch(OperationContext* txn) const = 0; + + // + // Navigation + // + + class Cursor { + public: + virtual ~Cursor() {} + + virtual int getDirection() const = 0; + + virtual bool isEOF() const = 0; + + /** + * Will only be called with other from same index as this. + * All EOF locs should be considered equal. 
+ */ + virtual bool pointsToSamePlaceAs(const Cursor& other) const = 0; + + /** + * If the SortedDataInterface impl calls the BucketNotificationCallback, the argument must + * be forwarded to all Cursors over that SortedData. + * TODO something better. + */ + virtual void aboutToDeleteBucket(const DiskLoc& bucket) = 0; + + virtual bool locate(const BSONObj& key, const DiskLoc& loc) = 0; + + virtual void advanceTo(const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) = 0; + + /** + * Locate a key with fields comprised of a combination of keyBegin fields and keyEnd + * fields. + */ + virtual void customLocate(const BSONObj& keyBegin, + int keyBeginLen, + bool afterVersion, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) = 0; + + /** + * Return OK if it's not + * Otherwise return a status that can be displayed + */ + virtual BSONObj getKey() const = 0; + + virtual DiskLoc getDiskLoc() const = 0; + + virtual void advance() = 0; + + // + // Saving and restoring state + // + virtual void savePosition() = 0; + + virtual void restorePosition() = 0; + }; + + /** + * Caller takes ownership. SortedDataInterface must outlive all Cursors it produces. + */ + virtual Cursor* newCursor(OperationContext* txn, int direction) const = 0; + + // + // Index creation + // + + virtual Status initAsEmpty(OperationContext* txn) = 0; + }; + + /** + * A version-hiding wrapper around the bulk builder for the Btree. + */ + class SortedDataBuilderInterface { + public: + virtual ~SortedDataBuilderInterface() { } + + /** + * Adds 'key' to intermediate storage. + * + * 'key' must be > or >= the last key passed to this function (depends on _dupsAllowed). If + * this is violated an error Status (ErrorCodes::InternalError) will be returned. + */ + virtual Status addKey(const BSONObj& key, const DiskLoc& loc) = 0; + + /** + * commit work. 
if not called, destructor will clean up partially completed work + * (in case exception has happened). + * + * Returns number of keys added. + */ + virtual unsigned long long commit(bool mayInterrupt) = 0; + }; + +} // namespace mongo |