diff options
Diffstat (limited to 'src/mongo/db/storage')
60 files changed, 16723 insertions, 67 deletions
diff --git a/src/mongo/db/storage/capped_callback.h b/src/mongo/db/storage/capped_callback.h new file mode 100644 index 00000000000..59c23f9dab9 --- /dev/null +++ b/src/mongo/db/storage/capped_callback.h @@ -0,0 +1,54 @@ +// record_store_v1_capped.h + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/db/diskloc.h" + +namespace mongo { + + class OperationContext; + + /** + * When a capped collection has to delete a document, it needs a way to tell the caller + * what its deleting so it can unindex or do any other cleanup. + * This is that way. 
+ */ + class CappedDocumentDeleteCallback { + public: + virtual ~CappedDocumentDeleteCallback(){} + + /** + * This will be called right before loc is deleted when wrapping. + */ + virtual Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) = 0; + }; + +} diff --git a/src/mongo/db/storage/heap1/SConscript b/src/mongo/db/storage/heap1/SConscript index 0b1a6db0383..feb9fcbb2d1 100644 --- a/src/mongo/db/storage/heap1/SConscript +++ b/src/mongo/db/storage/heap1/SConscript @@ -8,9 +8,20 @@ env.Library( 'heap1_engine.cpp', ], LIBDEPS= [ + 'heap_record_store', '$BUILD_DIR/mongo/bson', '$BUILD_DIR/mongo/db/catalog/collection_options', - '$BUILD_DIR/mongo/db/structure/record_store', + '$BUILD_DIR/mongo/foundation', + ] + ) + +env.Library( + target= 'heap_record_store', + source= [ + 'record_store_heap.cpp' + ], + LIBDEPS= [ + '$BUILD_DIR/mongo/bson', '$BUILD_DIR/mongo/foundation', ] ) diff --git a/src/mongo/db/storage/heap1/heap1_btree_impl.cpp b/src/mongo/db/storage/heap1/heap1_btree_impl.cpp index 2d5ae2fc63b..9a2ec04417a 100644 --- a/src/mongo/db/storage/heap1/heap1_btree_impl.cpp +++ b/src/mongo/db/storage/heap1/heap1_btree_impl.cpp @@ -200,7 +200,7 @@ namespace { return it->loc != loc; } - class Heap1BtreeBuilderImpl : public BtreeBuilderInterface { + class Heap1BtreeBuilderImpl : public SortedDataBuilderInterface { public: Heap1BtreeBuilderImpl(IndexSet* data, bool dupsAllowed) : _data(data), @@ -241,14 +241,14 @@ namespace { bool _committed; }; - class Heap1BtreeImpl : public BtreeInterface { + class Heap1BtreeImpl : public SortedDataInterface { public: Heap1BtreeImpl(const IndexCatalogEntry& info, IndexSet* data) : _info(info), _data(data) {} - virtual BtreeBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) { + virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, bool dupsAllowed) { return new Heap1BtreeBuilderImpl(_data, dupsAllowed); } @@ -300,7 +300,7 @@ namespace { return Status::OK(); } - class 
ForwardCursor : public BtreeInterface::Cursor { + class ForwardCursor : public SortedDataInterface::Cursor { public: ForwardCursor(const IndexSet& data, OperationContext* txn) : _txn(txn), @@ -314,7 +314,7 @@ namespace { return _it == _data.end(); } - virtual bool pointsToSamePlaceAs(const BtreeInterface::Cursor& otherBase) const { + virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const { const ForwardCursor& other = static_cast<const ForwardCursor&>(otherBase); invariant(&_data == &other._data); // iterators over same index return _it == other._it; @@ -399,7 +399,7 @@ namespace { }; // TODO see if this can share any code with ForwardIterator - class ReverseCursor : public BtreeInterface::Cursor { + class ReverseCursor : public SortedDataInterface::Cursor { public: ReverseCursor(const IndexSet& data, OperationContext* txn) : _txn(txn), @@ -413,7 +413,7 @@ namespace { return _it == _data.rend(); } - virtual bool pointsToSamePlaceAs(const BtreeInterface::Cursor& otherBase) const { + virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const { const ReverseCursor& other = static_cast<const ReverseCursor&>(otherBase); invariant(&_data == &other._data); // iterators over same index return _it == other._it; @@ -512,7 +512,7 @@ namespace { DiskLoc _savedLoc; }; - virtual BtreeInterface::Cursor* newCursor(OperationContext* txn, int direction) const { + virtual SortedDataInterface::Cursor* newCursor(OperationContext* txn, int direction) const { if (direction == 1) return new ForwardCursor(*_data, txn); @@ -533,7 +533,7 @@ namespace { // IndexCatalogEntry argument taken by non-const pointer for consistency with other Btree // factories. We don't actually modify it. 
- BtreeInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut) { + SortedDataInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut) { invariant(info); invariant(dataInOut); if (!*dataInOut) { diff --git a/src/mongo/db/storage/heap1/heap1_btree_impl.h b/src/mongo/db/storage/heap1/heap1_btree_impl.h index 72b38ce3696..7187dc589dc 100644 --- a/src/mongo/db/storage/heap1/heap1_btree_impl.h +++ b/src/mongo/db/storage/heap1/heap1_btree_impl.h @@ -28,17 +28,18 @@ #include <boost/shared_ptr.hpp> -#include "mongo/db/structure/btree/btree_interface.h" +#include "mongo/db/storage/sorted_data_interface.h" #pragma once namespace mongo { + class IndexCatalogEntry; /** * Caller takes ownership. * All permanent data will be stored and fetch from dataInOut. */ - BtreeInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut); + SortedDataInterface* getHeap1BtreeImpl(IndexCatalogEntry* info, boost::shared_ptr<void>* dataInOut); } // namespace mongo diff --git a/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp b/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp index 53dea7f10c7..58e069d9863 100644 --- a/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp +++ b/src/mongo/db/storage/heap1/heap1_database_catalog_entry.cpp @@ -1,32 +1,30 @@ -// heap1_database_catalog_entry.cpp - /** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. 
-* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ #include "mongo/db/storage/heap1/heap1_database_catalog_entry.h" @@ -43,7 +41,7 @@ #include "mongo/db/operation_context.h" #include "mongo/db/storage/heap1/heap1_btree_impl.h" #include "mongo/db/storage/heap1/heap1_recovery_unit.h" -#include "mongo/db/structure/record_store_heap.h" +#include "mongo/db/storage/heap1/record_store_heap.h" namespace mongo { @@ -159,14 +157,14 @@ namespace mongo { index->headManager()->setHead(txn, DiskLoc(0xDEAD, 0xBEAF)); // When is a btree not a Btree? When it is a Heap1BtreeImpl! - std::auto_ptr<BtreeInterface> btree(getHeap1BtreeImpl(index, &i->second->data)); + std::auto_ptr<SortedDataInterface> btree(getHeap1BtreeImpl(index, &i->second->data)); #else if (!i->second->rs) i->second->rs.reset(new HeapRecordStore( index->descriptor()->indexName() )); - std::auto_ptr<BtreeInterface> btree( - BtreeInterface::getInterface(index->headManager(), + std::auto_ptr<SortedDataInterface> btree( + SortedDataInterface::getInterface(index->headManager(), i->second->rs, index->ordering(), index->descriptor()->indexNamespace(), diff --git a/src/mongo/db/storage/heap1/record_store_heap.cpp b/src/mongo/db/storage/heap1/record_store_heap.cpp new file mode 100644 index 00000000000..e0578dc5c71 --- /dev/null +++ b/src/mongo/db/storage/heap1/record_store_heap.cpp @@ -0,0 +1,494 @@ +/** + * Copyright (C) 2014 MongoDB Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/heap1/record_store_heap.h" + +namespace mongo { + + // + // RecordStore + // + + HeapRecordStore::HeapRecordStore(const StringData& ns, + bool isCapped, + int64_t cappedMaxSize, + int64_t cappedMaxDocs, + CappedDocumentDeleteCallback* cappedDeleteCallback) + : RecordStore(ns), + _isCapped(isCapped), + _cappedMaxSize(cappedMaxSize), + _cappedMaxDocs(cappedMaxDocs), + _cappedDeleteCallback(cappedDeleteCallback), + _dataSize(0), + _nextId(1) { // DiskLoc(0,0) isn't valid for records. 
+ + if (_isCapped) { + invariant(_cappedMaxSize > 0); + invariant(_cappedMaxDocs == -1 || _cappedMaxDocs > 0); + } + else { + invariant(_cappedMaxSize == -1); + invariant(_cappedMaxDocs == -1); + } + } + + const char* HeapRecordStore::name() const { return "heap"; } + + RecordData HeapRecordStore::dataFor( const DiskLoc& loc ) const { + return recordFor(loc)->toRecordData(); + } + + HeapRecordStore::HeapRecord* HeapRecordStore::recordFor(const DiskLoc& loc) const { + Records::const_iterator it = _records.find(loc); + invariant(it != _records.end()); + return reinterpret_cast<HeapRecord*>(it->second.get()); + } + + void HeapRecordStore::deleteRecord(OperationContext* txn, const DiskLoc& loc) { + HeapRecord* rec = recordFor(loc); + _dataSize -= rec->netLength(); + invariant(_records.erase(loc) == 1); + } + + bool HeapRecordStore::cappedAndNeedDelete() const { + if (!_isCapped) + return false; + + if (_dataSize > _cappedMaxSize) + return true; + + if ((_cappedMaxDocs != -1) && (numRecords() > _cappedMaxDocs)) + return true; + + return false; + } + + void HeapRecordStore::cappedDeleteAsNeeded(OperationContext* txn) { + while (cappedAndNeedDelete()) { + invariant(!_records.empty()); + + DiskLoc oldest = _records.begin()->first; + + if (_cappedDeleteCallback) + uassertStatusOK(_cappedDeleteCallback->aboutToDeleteCapped(txn, oldest)); + + deleteRecord(txn, oldest); + } + } + + StatusWith<DiskLoc> HeapRecordStore::insertRecord(OperationContext* txn, + const char* data, + int len, + bool enforceQuota) { + if (_isCapped && len > _cappedMaxSize) { + // We use dataSize for capped rollover and we don't want to delete everything if we know + // this won't fit. + return StatusWith<DiskLoc>(ErrorCodes::BadValue, + "object to insert exceeds cappedMaxSize"); + } + + // TODO padding? 
+ const int lengthWithHeaders = len + HeapRecord::HeaderSize; + boost::shared_array<char> buf(new char[lengthWithHeaders]); + HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get()); + rec->lengthWithHeaders() = lengthWithHeaders; + memcpy(rec->data(), data, len); + + const DiskLoc loc = allocateLoc(); + _records[loc] = buf; + _dataSize += len; + + cappedDeleteAsNeeded(txn); + + return StatusWith<DiskLoc>(loc); + } + + StatusWith<DiskLoc> HeapRecordStore::insertRecord(OperationContext* txn, + const DocWriter* doc, + bool enforceQuota) { + const int len = doc->documentSize(); + if (_isCapped && len > _cappedMaxSize) { + // We use dataSize for capped rollover and we don't want to delete everything if we know + // this won't fit. + return StatusWith<DiskLoc>(ErrorCodes::BadValue, + "object to insert exceeds cappedMaxSize"); + } + + // TODO padding? + const int lengthWithHeaders = len + HeapRecord::HeaderSize; + boost::shared_array<char> buf(new char[lengthWithHeaders]); + HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get()); + rec->lengthWithHeaders() = lengthWithHeaders; + doc->writeDocument(rec->data()); + + const DiskLoc loc = allocateLoc(); + _records[loc] = buf; + _dataSize += len; + + cappedDeleteAsNeeded(txn); + + return StatusWith<DiskLoc>(loc); + } + + StatusWith<DiskLoc> HeapRecordStore::updateRecord(OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ) { + HeapRecord* oldRecord = recordFor( oldLocation ); + int oldLen = oldRecord->netLength(); + + // If the length of the new data is <= the length of the old data then just + // memcopy into the old space + if ( len <= oldLen) { + memcpy(oldRecord->data(), data, len); + _dataSize += len - oldLen; + return StatusWith<DiskLoc>(oldLocation); + } + + if ( _isCapped ) { + return StatusWith<DiskLoc>( ErrorCodes::InternalError, + "failing update: objects in a capped ns cannot grow", + 10003 ); + } + + // If the length of the 
new data exceeds the size of the old Record, we need to allocate + // a new Record, and delete the old one + + const int lengthWithHeaders = len + HeapRecord::HeaderSize; + boost::shared_array<char> buf(new char[lengthWithHeaders]); + HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get()); + rec->lengthWithHeaders() = lengthWithHeaders; + memcpy(rec->data(), data, len); + + _records[oldLocation] = buf; + _dataSize += len - oldLen; + + cappedDeleteAsNeeded(txn); + + return StatusWith<DiskLoc>(oldLocation); + } + + Status HeapRecordStore::updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ) { + HeapRecord* rec = recordFor( loc ); + char* root = rec->data(); + + // All updates were in place. Apply them via durability and writing pointer. + mutablebson::DamageVector::const_iterator where = damages.begin(); + const mutablebson::DamageVector::const_iterator end = damages.end(); + for( ; where != end; ++where ) { + const char* sourcePtr = damangeSource + where->sourceOffset; + char* targetPtr = root + where->targetOffset; + std::memcpy(targetPtr, sourcePtr, where->size); + } + + return Status::OK(); + } + + RecordIterator* HeapRecordStore::getIterator(OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const { + if (tailable) + invariant(_isCapped && dir == CollectionScanParams::FORWARD); + + if (dir == CollectionScanParams::FORWARD) { + return new HeapRecordIterator(txn, _records, *this, start, tailable); + } + else { + return new HeapRecordIterator(txn, _records, *this, start); + } + } + + RecordIterator* HeapRecordStore::getIteratorForRepair(OperationContext* txn) const { + // TODO maybe make different from HeapRecordIterator + return new HeapRecordIterator(txn, _records, *this); + } + + std::vector<RecordIterator*> HeapRecordStore::getManyIterators(OperationContext* txn) const { + std::vector<RecordIterator*> out; + // TODO 
maybe find a way to return multiple iterators. + out.push_back(new HeapRecordIterator(txn, _records, *this)); + return out; + } + + Status HeapRecordStore::truncate(OperationContext* txn) { + _records.clear(); + _dataSize = 0; + return Status::OK(); + } + + void HeapRecordStore::temp_cappedTruncateAfter(OperationContext* txn, + DiskLoc end, + bool inclusive) { + Records::iterator it = inclusive ? _records.lower_bound(end) + : _records.upper_bound(end); + while(it != _records.end()) { + _dataSize -= reinterpret_cast<HeapRecord*>(it->second.get())->netLength(); + _records.erase(it++); + } + } + + bool HeapRecordStore::compactSupported() const { + return false; + } + Status HeapRecordStore::compact(OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats) { + // TODO might be possible to do something here + invariant(!"compact not yet implemented"); + } + + Status HeapRecordStore::validate(OperationContext* txn, + bool full, + bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, + BSONObjBuilder* output) const { + results->valid = true; + if (scanData && full) { + for (Records::const_iterator it = _records.begin(); it != _records.end(); ++it) { + HeapRecord* rec = reinterpret_cast<HeapRecord*>(it->second.get()); + size_t dataSize; + const Status status = adaptor->validate(rec->toRecordData(), &dataSize); + if (!status.isOK()) { + results->valid = false; + results->errors.push_back("invalid object detected (see logs)"); + log() << "Invalid object detected in " << _ns << ": " << status.reason(); + } + } + } + + output->appendNumber( "nrecords", _records.size() ); + + return Status::OK(); + + } + + void HeapRecordStore::appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const { + result->append( "note", "HeapRecordStore has no cusom stats yet" ); + } + + Status HeapRecordStore::touch(OperationContext* txn, BSONObjBuilder* output) const { + if (output) { + 
output->append("numRanges", 1); + output->append("millis", 0); + } + return Status::OK(); + } + + Status HeapRecordStore::setCustomOption( + OperationContext* txn, const BSONElement& option, BSONObjBuilder* info) { + invariant(!"setCustomOption not yet implemented"); + } + + void HeapRecordStore::increaseStorageSize(OperationContext* txn, int size, bool enforceQuota) { + // unclear what this would mean for this class. For now, just error if called. + invariant(!"increaseStorageSize not yet implemented"); + } + + int64_t HeapRecordStore::storageSize(OperationContext* txn, + BSONObjBuilder* extraInfo, + int infoLevel) const { + // Note: not making use of extraInfo or infoLevel since we don't have extents + const int64_t recordOverhead = numRecords() * HeapRecord::HeaderSize; + return _dataSize + recordOverhead; + } + + DiskLoc HeapRecordStore::allocateLoc() { + const int64_t id = _nextId++; + // This is a hack, but both the high and low order bits of DiskLoc offset must be 0, and the + // file must fit in 23 bits. This gives us a total of 30 + 23 == 53 bits. 
+ invariant(id < (1LL << 53)); + return DiskLoc(int(id >> 30), int((id << 1) & ~(1<<31))); + } + + // + // Forward Iterator + // + + HeapRecordIterator::HeapRecordIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start, + bool tailable) + : _txn(txn), + _tailable(tailable), + _lastLoc(minDiskLoc), + _killedByInvalidate(false), + _records(records), + _rs(rs) { + if (start.isNull()) { + _it = _records.begin(); + } + else { + _it = _records.find(start); + invariant(_it != _records.end()); + } + } + + bool HeapRecordIterator::isEOF() { + return _it == _records.end(); + } + + DiskLoc HeapRecordIterator::curr() { + if (isEOF()) + return DiskLoc(); + return _it->first; + } + + DiskLoc HeapRecordIterator::getNext() { + if (isEOF()) { + if (!_tailable) + return DiskLoc(); + + if (_records.empty()) + return DiskLoc(); + + invariant(!_killedByInvalidate); + + // recover to last returned record + invariant(!_lastLoc.isNull()); + _it = _records.find(_lastLoc); + invariant(_it != _records.end()); + + if (++_it == _records.end()) + return DiskLoc(); + } + + const DiskLoc out = _it->first; + ++_it; + if (_tailable && _it == _records.end()) + _lastLoc = out; + return out; + } + + void HeapRecordIterator::invalidate(const DiskLoc& loc) { + if (_rs.isCapped()) { + // Capped iterators die on invalidation rather than advancing. 
+ if (isEOF()) { + if (_lastLoc == loc) { + _killedByInvalidate = true; + } + } + else if (_it->first == loc) { + _killedByInvalidate = true; + } + + return; + } + + if (_it != _records.end() && _it->first == loc) + ++_it; + } + + void HeapRecordIterator::prepareToYield() { + } + + bool HeapRecordIterator::recoverFromYield() { + return !_killedByInvalidate; + } + + RecordData HeapRecordIterator::dataFor(const DiskLoc& loc) const { + return _rs.dataFor(loc); + } + + // + // Reverse Iterator + // + + HeapRecordReverseIterator::HeapRecordReverseIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start) + : _txn(txn), + _killedByInvalidate(false), + _records(records), + _rs(rs) { + if (start.isNull()) { + _it = _records.rbegin(); + } + else { + _it = HeapRecordStore::Records::const_reverse_iterator(_records.find(start)); + invariant(_it != _records.rend()); + } + } + + bool HeapRecordReverseIterator::isEOF() { + return _it == _records.rend(); + } + + DiskLoc HeapRecordReverseIterator::curr() { + if (isEOF()) + return DiskLoc(); + return _it->first; + } + + DiskLoc HeapRecordReverseIterator::getNext() { + if (isEOF()) + return DiskLoc(); + + const DiskLoc out = _it->first; + ++_it; + return out; + } + + void HeapRecordReverseIterator::invalidate(const DiskLoc& loc) { + if (isEOF()) + return; + + if (_it->first == loc) { + if (_rs.isCapped()) { + // Capped iterators die on invalidation rather than advancing. 
+ _killedByInvalidate = true; + return; + } + ++_it; + } + } + + void HeapRecordReverseIterator::prepareToYield() { + } + + bool HeapRecordReverseIterator::recoverFromYield() { + return !_killedByInvalidate; + } + + RecordData HeapRecordReverseIterator::dataFor(const DiskLoc& loc) const { + return _rs.dataFor(loc); + } + +} // namespace mongo diff --git a/src/mongo/db/storage/heap1/record_store_heap.h b/src/mongo/db/storage/heap1/record_store_heap.h new file mode 100644 index 00000000000..f4810b04972 --- /dev/null +++ b/src/mongo/db/storage/heap1/record_store_heap.h @@ -0,0 +1,241 @@ +// record_store_heap.h + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. 
If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include <boost/shared_array.hpp> +#include <map> + +#include "mongo/db/storage/capped_callback.h" +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class HeapRecordIterator; + + /** + * A RecordStore that stores all data on the heap. + * + * @param cappedMaxSize - required if isCapped. limit uses dataSize() in this impl. + */ + class HeapRecordStore : public RecordStore { + public: + explicit HeapRecordStore(const StringData& ns, + bool isCapped = false, + int64_t cappedMaxSize = -1, + int64_t cappedMaxDocs = -1, + CappedDocumentDeleteCallback* cappedDeleteCallback = NULL); + + virtual const char* name() const; + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + virtual void deleteRecord( OperationContext* txn, const DiskLoc& dl ); + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ); + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ); + + virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ); + + virtual Status updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ); + + virtual RecordIterator* getIterator( OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const; + + virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const; + + virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const; + + virtual Status truncate( OperationContext* txn ); + + virtual void temp_cappedTruncateAfter( OperationContext* txn, DiskLoc end, bool inclusive ); + + virtual bool 
compactSupported() const; + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ); + + virtual Status validate( OperationContext* txn, + bool full, + bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const; + + virtual void appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const; + + virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const; + + virtual Status setCustomOption( OperationContext* txn, + const BSONElement& option, + BSONObjBuilder* info = NULL ); + + virtual void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota ); + + virtual int64_t storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo = NULL, + int infoLevel = 0) const; + + virtual long long dataSize() const { return _dataSize; } + + virtual long long numRecords() const { return _records.size(); } + + protected: + class HeapRecord { + public: + enum HeaderSizeValue { HeaderSize = 16 }; + + int lengthWithHeaders() const { return _lengthWithHeaders; } + int& lengthWithHeaders() { return _lengthWithHeaders; } + + const char* data() const { return _data; } + char* data() { return _data; } + + int netLength() const { return _lengthWithHeaders - HeaderSize; } + + RecordData toRecordData() const { return RecordData(_data, netLength()); } + + private: + int _lengthWithHeaders; + char _data[4]; + }; + + virtual HeapRecord* recordFor( const DiskLoc& loc ) const; + + public: + // + // Not in RecordStore interface + // + + typedef std::map<DiskLoc, boost::shared_array<char> > Records; + + bool isCapped() const { return _isCapped; } + void setCappedDeleteCallback(CappedDocumentDeleteCallback* cb) { _cappedDeleteCallback = cb; } + bool cappedMaxDocs() const { invariant(_isCapped); return _cappedMaxDocs; } + bool cappedMaxSize() const { invariant(_isCapped); return _cappedMaxSize; } + + private: + 
DiskLoc allocateLoc(); + bool cappedAndNeedDelete() const; + void cappedDeleteAsNeeded(OperationContext* txn); + + // TODO figure out a proper solution to metadata + const bool _isCapped; + const int64_t _cappedMaxSize; + const int64_t _cappedMaxDocs; + CappedDocumentDeleteCallback* _cappedDeleteCallback; + int64_t _dataSize; + + Records _records; + int64_t _nextId; + }; + + class HeapRecordIterator : public RecordIterator { + public: + HeapRecordIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start = DiskLoc(), + bool tailable = false); + + virtual bool isEOF(); + + virtual DiskLoc curr(); + + virtual DiskLoc getNext(); + + virtual void invalidate(const DiskLoc& dl); + + virtual void prepareToYield(); + + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + OperationContext* _txn; // not owned + HeapRecordStore::Records::const_iterator _it; + bool _tailable; + DiskLoc _lastLoc; // only for restarting tailable + bool _killedByInvalidate; + + const HeapRecordStore::Records& _records; + const HeapRecordStore& _rs; + }; + + class HeapRecordReverseIterator : public RecordIterator { + public: + HeapRecordReverseIterator(OperationContext* txn, + const HeapRecordStore::Records& records, + const HeapRecordStore& rs, + DiskLoc start = DiskLoc()); + + virtual bool isEOF(); + + virtual DiskLoc curr(); + + virtual DiskLoc getNext(); + + virtual void invalidate(const DiskLoc& dl); + + virtual void prepareToYield(); + + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + OperationContext* _txn; // not owned + HeapRecordStore::Records::const_reverse_iterator _it; + bool _killedByInvalidate; + + const HeapRecordStore::Records& _records; + const HeapRecordStore& _rs; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/SConscript b/src/mongo/db/storage/mmap_v1/SConscript index 
5f7ac5eabd2..11b6b06b3e7 100644 --- a/src/mongo/db/storage/mmap_v1/SConscript +++ b/src/mongo/db/storage/mmap_v1/SConscript @@ -1,6 +1,34 @@ Import("env") env.Library( + target = 'storage_mmapv1', + source = [ "catalog/index_details.cpp", + "catalog/namespace.cpp", + "catalog/namespace_details.cpp", + "catalog/namespace_details_collection_entry.cpp", + "catalog/namespace_details_rsv1_metadata.cpp", + "catalog/namespace_index.cpp", + "data_file.cpp", + "durable_mapped_file.cpp", + "dur.cpp", + "durop.cpp", + "dur_writetodatafiles.cpp", + "dur_preplogbuffer.cpp", + "dur_commitjob.cpp", + "dur_recover.cpp", + "dur_journal.cpp", + "dur_recovery_unit.cpp", + "mmap_v1_database_catalog_entry.cpp", + "mmap_v1_engine.cpp", + "mmap_v1_extent_manager.cpp", + "repair_database.cpp", + ], + LIBDEPS = [ + 'record_store_v1', + 'btree'] + ) + +env.Library( target= 'extent', source= [ 'extent.cpp', @@ -11,3 +39,94 @@ env.Library( '$BUILD_DIR/mongo/foundation', ] ) + +env.Library( + target= 'record_store_v1', + source= [ + 'record_store_v1_base.cpp', + 'record_store_v1_capped.cpp', + 'record_store_v1_capped_iterator.cpp', + 'record_store_v1_repair_iterator.cpp', + 'record_store_v1_simple.cpp', + 'record_store_v1_simple_iterator.cpp', + ], + LIBDEPS= [ + 'extent', + '$BUILD_DIR/mongo/mongocommon', # for ProgressMeter + '$BUILD_DIR/mongo/db/commands/server_status_core', + ] + ) + +env.Library( + target='record_store_v1_test_help', + source=['record_store_v1_test_help.cpp', + ], + LIBDEPS=[ + 'record_store_v1' + ] + ) + +env.CppUnitTest(target = 'namespace_test', + source = ['catalog/namespace_test.cpp'], + LIBDEPS = ['$BUILD_DIR/mongo/foundation']) + +env.CppUnitTest( + target='record_store_v1_simple_test', + source=['record_store_v1_simple_test.cpp', + ], + LIBDEPS=[ + 'record_store_v1_test_help' + ] + ) + +env.CppUnitTest( + target='record_store_v1_capped_test', + source=['record_store_v1_capped_test.cpp', + ], + LIBDEPS=[ + 'record_store_v1_test_help' + ] + ) + +env.Library( + 
target= 'btree', + source= [ + 'btree/btree_logic.cpp', + 'btree/btree_interface.cpp', + 'btree/key.cpp' + ], + LIBDEPS= [ + '$BUILD_DIR/mongo/bson' + ] + ) + +env.Library( + target= 'btree_test_help', + source= [ + 'btree/btree_test_help.cpp' + ], + LIBDEPS= [ + 'btree', + '$BUILD_DIR/mongo/mongocommon', # for ProgressMeter + '$BUILD_DIR/mongo/db/storage/mmap_v1/record_store_v1_test_help', + '$BUILD_DIR/mongo/db/storage/heap1/heap_record_store' # XXX? + ] + ) + +env.CppUnitTest( + target='btree_logic_test', + source=['btree/btree_logic_test.cpp' + ], + LIBDEPS=[ + 'btree_test_help' + ] + ) + +env.CppUnitTest( + target='btree_builder_test', + source=['btree/btree_builder_test.cpp' + ], + LIBDEPS=[ + 'btree_test_help' + ] + ) diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp new file mode 100644 index 00000000000..89d2ffc4d98 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_builder_test.cpp @@ -0,0 +1,133 @@ +// btree_builder_test.cpp : Btree builder unit test + +/** + * Copyright (C) 2014 MongoDB + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +// This file contains simple tests to check the Btree builder logic, +// including handling of interruptions. + +#include "mongo/db/instance.h" +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + class MockOperationContextKillable : public OperationContextNoop { + public: + MockOperationContextKillable() + : _killPending(false) { + } + + virtual void checkForInterrupt(bool heedMutex = true) const { + if (_killPending) { + throw UserException(ErrorCodes::Interrupted, "interrupted"); + } + } + + virtual void kill() { + _killPending = true; + } + + private: + bool _killPending; + }; + + /** + * Builder::commit() is interrupted if there is a request to kill the current operation. + */ + template<class OnDiskFormat> + class InterruptCommit { + public: + typedef typename BtreeLogic<OnDiskFormat>::Builder Builder; + + InterruptCommit( bool mayInterrupt ) : + _mayInterrupt( mayInterrupt ), + _helper(BSON( "a" << 1 )) { + } + + void run() { + // Create a btree builder. 
+ MockOperationContextKillable txn; + Builder* builder = _helper.btree.newBuilder(&txn, false); + + // Add some keys to the builder, in order. We need enough keys to build an internal + // node in order to check for an interrupt. + int32_t nKeys = 1000; + for( int32_t i = 0; i < nKeys; ++i ) { + BSONObj key = BSON( "a" << i ); + builder->addKey( key, /* dummy location */ DiskLoc() ); + } + + // The root of the index has not yet been set. + ASSERT( _helper.headManager.getHead().isNull() ); + // Register a request to kill the current operation. + txn.kill(); + if ( _mayInterrupt ) { + // Call commit on the builder, which will be aborted due to the kill request. + ASSERT_THROWS( builder->commit( _mayInterrupt ), UserException ); + // The root of the index is not set because commit() did not complete. + ASSERT( _helper.headManager.getHead().isNull() ); + } + else { + // Call commit on the builder, which will not be aborted because mayInterrupt is + // false. + builder->commit( _mayInterrupt ); + // The root of the index is set because commit() completed. 
+ ASSERT( !_helper.headManager.getHead().isNull() ); + } + } + + private: + bool _mayInterrupt; + BtreeLogicTestHelper<OnDiskFormat> _helper; + }; + + + // + // TEST SUITE DEFINITION + // + + template<class OnDiskFormat> + class BtreeBuilderTestSuite : public unittest::Suite { + public: + BtreeBuilderTestSuite(const std::string& name) : Suite(name) { + + } + + void setupTests() { + + add< InterruptCommit<OnDiskFormat> >( false ); + add< InterruptCommit<OnDiskFormat> >( true ); + } + }; + + // Test suite for both V0 and V1 + static BtreeBuilderTestSuite<BtreeLayoutV0> SUITE_V0("BtreeBuilderTests V0"); + static BtreeBuilderTestSuite<BtreeLayoutV1> SUITE_V1("BtreeBuilderTests V1"); +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp new file mode 100644 index 00000000000..6d2fae7bffa --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp @@ -0,0 +1,266 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/sorted_data_interface.h" + +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" + + +namespace mongo { + + template <class OnDiskFormat> + class BtreeBuilderInterfaceImpl : public SortedDataBuilderInterface { + public: + BtreeBuilderInterfaceImpl(OperationContext* trans, + typename BtreeLogic<OnDiskFormat>::Builder* builder) + : _builder(builder), _trans(trans) { } + + virtual ~BtreeBuilderInterfaceImpl() { } + + Status addKey(const BSONObj& key, const DiskLoc& loc) { + return _builder->addKey(key, loc); + } + + unsigned long long commit(bool mayInterrupt) { + return _builder->commit(mayInterrupt); + } + + private: + typename BtreeLogic<OnDiskFormat>::Builder* _builder; + + // Not owned here. 
+ OperationContext* _trans; + }; + + template <class OnDiskFormat> + class BtreeInterfaceImpl : public SortedDataInterface { + public: + BtreeInterfaceImpl(HeadManager* headManager, + RecordStore* recordStore, + const Ordering& ordering, + const string& indexName, + BucketDeletionNotification* bucketDeletionNotification) { + + _btree.reset(new BtreeLogic<OnDiskFormat>(headManager, + recordStore, + ordering, + indexName, + bucketDeletionNotification)); + } + + virtual ~BtreeInterfaceImpl() { } + + virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, + bool dupsAllowed) { + + return new BtreeBuilderInterfaceImpl<OnDiskFormat>( + txn, _btree->newBuilder(txn, dupsAllowed)); + } + + virtual Status insert(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc, + bool dupsAllowed) { + + return _btree->insert(txn, key, loc, dupsAllowed); + } + + virtual bool unindex(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) { + + return _btree->unindex(txn, key, loc); + } + + virtual void fullValidate(OperationContext* txn, long long *numKeysOut) { + *numKeysOut = _btree->fullValidate(txn, NULL, false, false, 0); + } + + virtual Status dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) { + return _btree->dupKeyCheck(txn, key, loc); + } + + virtual bool isEmpty() { + return _btree->isEmpty(); + } + + virtual Status touch(OperationContext* txn) const{ + return _btree->touch(txn); + } + + class Cursor : public SortedDataInterface::Cursor { + public: + Cursor(OperationContext* txn, + const BtreeLogic<OnDiskFormat>* btree, + int direction) + : _txn(txn), + _btree(btree), + _direction(direction), + _bucket(btree->getHead()), // XXX this shouldn't be nessisary, but is. 
+ _ofs(0) { + } + + virtual int getDirection() const { return _direction; } + + virtual bool isEOF() const { return _bucket.isNull(); } + + virtual bool pointsToSamePlaceAs(const SortedDataInterface::Cursor& otherBase) const { + const Cursor& other = static_cast<const Cursor&>(otherBase); + if (isEOF()) + return other.isEOF(); + + return _bucket == other._bucket && _ofs == other._ofs; + + } + + virtual void aboutToDeleteBucket(const DiskLoc& bucket) { + if (_bucket == bucket) + _ofs = -1; + } + + virtual bool locate(const BSONObj& key, const DiskLoc& loc) { + return _btree->locate(_txn, key, loc, _direction, &_ofs, &_bucket); + } + + virtual void customLocate(const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) { + + _btree->customLocate(_txn, + &_bucket, + &_ofs, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _direction); + } + + void advanceTo(const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) { + + _btree->advanceTo(_txn, + &_bucket, + &_ofs, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _direction); + } + + virtual BSONObj getKey() const { + return _btree->getKey(_bucket, _ofs); + } + + virtual DiskLoc getDiskLoc() const { + return _btree->getDiskLoc(_bucket, _ofs); + } + + virtual void advance() { + _btree->advance(_txn, &_bucket, &_ofs, _direction); + } + + virtual void savePosition() { + if (!_bucket.isNull()) { + _savedKey = getKey().getOwned(); + _savedLoc = getDiskLoc(); + } + } + + virtual void restorePosition() { + if (!_bucket.isNull()) { + _btree->restorePosition(_txn, + _savedKey, + _savedLoc, + _direction, + &_bucket, + &_ofs); + } + } + + private: + OperationContext* _txn; // not owned + const BtreeLogic<OnDiskFormat>* const _btree; + const int _direction; + + DiskLoc _bucket; + int _ofs; + + // Only used by 
save/restorePosition() if _bucket is non-Null. + BSONObj _savedKey; + DiskLoc _savedLoc; + }; + + virtual Cursor* newCursor(OperationContext* txn, int direction) const { + return new Cursor(txn, _btree.get(), direction); + } + + virtual Status initAsEmpty(OperationContext* txn) { + return _btree->initAsEmpty(txn); + } + + private: + scoped_ptr<BtreeLogic<OnDiskFormat> > _btree; + }; + + SortedDataInterface* getMMAPV1Interface(HeadManager* headManager, + RecordStore* recordStore, + const Ordering& ordering, + const string& indexName, + int version, + BucketDeletionNotification* bucketDeletion) { + + if (0 == version) { + return new BtreeInterfaceImpl<BtreeLayoutV0>(headManager, + recordStore, + ordering, + indexName, + bucketDeletion); + } + else { + invariant(1 == version); + return new BtreeInterfaceImpl<BtreeLayoutV1>(headManager, + recordStore, + ordering, + indexName, + bucketDeletion); + } + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h new file mode 100644 index 00000000000..ad0d07b7ece --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h @@ -0,0 +1,50 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/bson/ordering.h" +#include "mongo/db/catalog/head_manager.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/record_store.h" +#include "mongo/db/storage/sorted_data_interface.h" + +#pragma once + +namespace mongo { + + class BucketDeletionNotification; + + SortedDataInterface* getMMAPV1Interface(HeadManager* headManager, + RecordStore* recordStore, + const Ordering& ordering, + const string& indexName, + int version, + BucketDeletionNotification* bucketDeletion); + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp new file mode 100644 index 00000000000..93f802bc4a5 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp @@ -0,0 +1,2519 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" +#include "mongo/db/storage/mmap_v1/btree/key.h" +#include "mongo/db/storage/record_store.h" +#include "mongo/util/log.h" +#include "mongo/util/mongoutils/str.h" + +namespace mongo { + + MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kIndexing); + + // + // Public Builder logic + // + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::Builder* + BtreeLogic<BtreeLayout>::newBuilder(OperationContext* txn, bool dupsAllowed) { + return new Builder(this, txn, dupsAllowed); + } + + template <class BtreeLayout> + BtreeLogic<BtreeLayout>::Builder::Builder(BtreeLogic* logic, + OperationContext* txn, + bool dupsAllowed) + : _logic(logic), + _dupsAllowed(dupsAllowed), + _numAdded(0), + _txn(txn) { + + // XXX: Due to the way bulk building works, we may already have an empty root bucket that we + // must now dispose of. This isn't the case in some unit tests that use the Builder directly + // rather than going through an IndexAccessMethod. + DiskLoc oldHead = _logic->_headManager->getHead(); + if (!oldHead.isNull()) { + _logic->_headManager->setHead(_txn, DiskLoc()); + _logic->_recordStore->deleteRecord(_txn, oldHead); + } + + _first = _cur = _logic->_addBucket(txn); + _b = _getModifiableBucket(_cur); + _committed = false; + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::Builder::addKey(const BSONObj& keyObj, const DiskLoc& loc) { + auto_ptr<KeyDataOwnedType> key(new KeyDataOwnedType(keyObj)); + + if (key->dataSize() > BtreeLayout::KeyMax) { + string msg = str::stream() << "Btree::insert: key too large to index, failing " + << _logic->_indexName + << ' ' << key->dataSize() << ' ' << key->toString(); + log() << msg << endl; + return Status(ErrorCodes::KeyTooLong, msg); + } + + // If we have a previous key to compare to... 
+ if (_numAdded > 0) { + int cmp = _keyLast->woCompare(*key, _logic->_ordering); + + // This shouldn't happen ever. We expect keys in sorted order. + if (cmp > 0) { + return Status(ErrorCodes::InternalError, "Bad key order in btree builder"); + } + + // This could easily happen.. + if (!_dupsAllowed && (cmp == 0)) { + return Status(ErrorCodes::DuplicateKey, _logic->dupKeyError(*_keyLast)); + } + } + + if (!_logic->_pushBack(_b, loc, *key, DiskLoc())) { + // bucket was full + newBucket(); + _logic->pushBack(_b, loc, *key, DiskLoc()); + } + + _keyLast = key; + _numAdded++; + mayCommitProgressDurably(); + return Status::OK(); + } + + template <class BtreeLayout> + unsigned long long BtreeLogic<BtreeLayout>::Builder::commit(bool mayInterrupt) { + buildNextLevel(_first, mayInterrupt); + _committed = true; + return _numAdded; + } + + // + // Private Builder logic + // + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::Builder::newBucket() { + DiskLoc newBucketLoc = _logic->_addBucket(_txn); + _b->parent = newBucketLoc; + _cur = newBucketLoc; + _b = _getModifiableBucket(_cur); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::Builder::buildNextLevel(DiskLoc loc, bool mayInterrupt) { + for (;;) { + if (_getBucket(loc)->parent.isNull()) { + // only 1 bucket at this level. we are done. + _logic->_headManager->setHead(_txn, loc); + break; + } + + DiskLoc upLoc = _logic->_addBucket(_txn); + DiskLoc upStart = upLoc; + BucketType* up = _getModifiableBucket(upLoc); + + DiskLoc xloc = loc; + while (!xloc.isNull()) { + if (_txn->recoveryUnit()->commitIfNeeded()) { + _b = _getModifiableBucket(_cur); + up = _getModifiableBucket(upLoc); + } + + if (mayInterrupt) { + _txn->checkForInterrupt(); + } + + BucketType* x = _getModifiableBucket(xloc); + KeyDataType k; + DiskLoc r; + _logic->popBack(x, &r, &k); + bool keepX = (x->n != 0); + DiskLoc keepLoc = keepX ? 
xloc : x->nextChild; + + if (!_logic->_pushBack(up, r, k, keepLoc)) { + // current bucket full + DiskLoc n = _logic->_addBucket(_txn); + up->parent = n; + upLoc = n; + up = _getModifiableBucket(upLoc); + _logic->pushBack(up, r, k, keepLoc); + } + + DiskLoc nextLoc = x->parent; + if (keepX) { + x->parent = upLoc; + } + else { + if (!x->nextChild.isNull()) { + DiskLoc ll = x->nextChild; + _getModifiableBucket(ll)->parent = upLoc; + } + _logic->deallocBucket(_txn, x, xloc); + } + xloc = nextLoc; + } + + loc = upStart; + mayCommitProgressDurably(); + } + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::Builder::mayCommitProgressDurably() { + if (_txn->recoveryUnit()->commitIfNeeded()) { + _b = _getModifiableBucket(_cur); + } + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::Builder::_getModifiableBucket(DiskLoc loc) { + return _logic->btreemod(_txn, _logic->getBucket(loc)); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::Builder::_getBucket(DiskLoc loc) { + return _logic->getBucket(loc); + } + + // + // BtreeLogic logic + // + + // static + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::FullKey + BtreeLogic<BtreeLayout>::getFullKey(const BucketType* bucket, int i) { + if (i >= bucket->n) { + int code = 13000; + massert(code, + (string)"invalid keyNode: " + BSON( "i" << i << "n" << bucket->n ).jsonString(), + i < bucket->n ); + } + return FullKey(bucket, i); + } + + // static + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::KeyHeaderType& + BtreeLogic<BtreeLayout>::getKeyHeader(BucketType* bucket, int i) { + return ((KeyHeaderType*)bucket->data)[i]; + } + + // static + template <class BtreeLayout> + const typename BtreeLogic<BtreeLayout>::KeyHeaderType& + BtreeLogic<BtreeLayout>::getKeyHeader(const BucketType* bucket, int i) { + return ((const KeyHeaderType*)bucket->data)[i]; + } + + template 
<class BtreeLayout> + void BtreeLogic<BtreeLayout>::markUnused(BucketType* bucket, int keyPos) { + invariant(keyPos >= 0 && keyPos < bucket->n); + getKeyHeader(bucket, keyPos).setUnused(); + } + + template <class BtreeLayout> + char* BtreeLogic<BtreeLayout>::dataAt(BucketType* bucket, short ofs) { + return bucket->data + ofs; + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::btreemod(OperationContext* txn, BucketType* bucket) { + txn->recoveryUnit()->writingPtr(bucket, BtreeLayout::BucketSize); + return bucket; + } + + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::totalDataSize(BucketType* bucket) { + return (int) (BtreeLayout::BucketSize - (bucket->data - (char*)bucket)); + } + + // We define this value as the maximum number of bytes such that, if we have + // fewer than this many bytes, we must be able to either merge with or receive + // keys from any neighboring node. If our utilization goes below this value we + // know we can bring up the utilization with a simple operation. Ignoring the + // 90/10 split policy which is sometimes employed and our 'unused' nodes, this + // is a lower bound on bucket utilization for non root buckets. + // + // Note that the exact value here depends on the implementation of + // _rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as + // follows: We know we cannot merge with the neighbor, so the total data size + // for us, the neighbor, and the separator must be at least + // BucketType::bodySize() + 1. We must be able to accept one key of any + // allowed size, so our size plus storage for that additional key must be + // <= BucketType::bodySize() / 2. This way, with the extra key we'll have a + // new bucket data size < half the total data size and by the implementation + // of _rebalancedSeparatorPos() the key must be added. 
+ template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::lowWaterMark() { + return BtreeLayout::BucketBodySize / 2 - BtreeLayout::KeyMax - sizeof(KeyHeaderType) + 1; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::init(BucketType* bucket) { + BtreeLayout::initBucket(bucket); + bucket->parent.Null(); + bucket->nextChild.Null(); + bucket->flags = Packed; + bucket->n = 0; + bucket->emptySize = totalDataSize(bucket); + bucket->topSize = 0; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::_unalloc(BucketType* bucket, int bytes) { + bucket->topSize -= bytes; + bucket->emptySize += bytes; + } + + /** + * We allocate space from the end of the buffer for data. The keynodes grow from the front. + */ + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::_alloc(BucketType* bucket, int bytes) { + invariant(bucket->emptySize >= bytes); + bucket->topSize += bytes; + bucket->emptySize -= bytes; + int ofs = totalDataSize(bucket) - bucket->topSize; + invariant(ofs > 0); + return ofs; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::setNotPacked(BucketType* bucket) { + bucket->flags &= ~Packed; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::setPacked(BucketType* bucket) { + bucket->flags |= Packed; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::_delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty) { + invariant(keypos >= 0 && keypos <= bucket->n); + invariant(childLocForPos(bucket, keypos).isNull()); + invariant((mayEmpty && bucket->n > 0) || bucket->n > 1 || bucket->nextChild.isNull()); + + bucket->emptySize += sizeof(KeyHeaderType); + bucket->n--; + + for (int j = keypos; j < bucket->n; j++) { + getKeyHeader(bucket, j) = getKeyHeader(bucket, j + 1); + } + + setNotPacked(bucket); + } + + /** + * Pull rightmost key from the bucket. This version requires its right child to be null so it + * does not bother returning that value. 
+ */ + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::popBack(BucketType* bucket, + DiskLoc* recordLocOut, + KeyDataType *keyDataOut) { + + massert(17435, "n==0 in btree popBack()", bucket->n > 0 ); + + invariant(getKeyHeader(bucket, bucket->n - 1).isUsed()); + + FullKey kn = getFullKey(bucket, bucket->n - 1); + *recordLocOut = kn.recordLoc; + keyDataOut->assign(kn.data); + int keysize = kn.data.dataSize(); + + massert(17436, "rchild not null in btree popBack()", bucket->nextChild.isNull()); + + // Weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't + // full. + bucket->nextChild = kn.prevChildBucket; + bucket->n--; + // This is risky because the key we are returning points to this unalloc'ed memory, + // and we are assuming that the last key points to the last allocated + // bson region. + bucket->emptySize += sizeof(KeyHeaderType); + _unalloc(bucket, keysize); + } + + /** + * Add a key. Must be > all existing. Be careful to set next ptr right. + */ + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::_pushBack(BucketType* bucket, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc prevChild) { + + int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType); + if (bytesNeeded > bucket->emptySize) { + return false; + } + invariant(bytesNeeded <= bucket->emptySize); + + if (bucket->n) { + const FullKey klast = getFullKey(bucket, bucket->n - 1); + if (klast.data.woCompare(key, _ordering) > 0) { + log() << "btree bucket corrupt? 
" + "consider reindexing or running validate command" << endl; + log() << " klast: " << klast.data.toString() << endl; + log() << " key: " << key.toString() << endl; + invariant(false); + } + } + + bucket->emptySize -= sizeof(KeyHeaderType); + KeyHeaderType& kn = getKeyHeader(bucket, bucket->n++); + kn.prevChildBucket = prevChild; + kn.recordLoc = recordLoc; + kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize())); + short ofs = kn.keyDataOfs(); + char *p = dataAt(bucket, ofs); + memcpy(p, key.data(), key.dataSize()); + return true; + } + + /** + * Durability note: + * + * We do separate intent declarations herein. Arguably one could just declare the whole bucket + * given we do group commits. This is something we could investigate later as to what is + * faster. + **/ + + /** + * Insert a key in a bucket with no complexity -- no splits required + * Returns false if a split is required. + */ + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::basicInsert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int& keypos, + const KeyDataType& key, + const DiskLoc recordLoc) { + invariant(bucket->n < 1024); + invariant(keypos >= 0 && keypos <= bucket->n); + + int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType); + if (bytesNeeded > bucket->emptySize) { + _pack(txn, bucket, bucketLoc, keypos); + if (bytesNeeded > bucket->emptySize) { + return false; + } + } + + invariant(getBucket(bucketLoc) == bucket); + + { + // declare that we will write to [k(keypos),k(n)] + char* start = reinterpret_cast<char*>(&getKeyHeader(bucket, keypos)); + char* end = reinterpret_cast<char*>(&getKeyHeader(bucket, bucket->n + 1)); + + // Declare that we will write to [k(keypos),k(n)] + txn->recoveryUnit()->writingPtr(start, end - start); + } + + // e.g. 
for n==3, keypos==2 + // 1 4 9 -> 1 4 _ 9 + for (int j = bucket->n; j > keypos; j--) { + getKeyHeader(bucket, j) = getKeyHeader(bucket, j - 1); + } + + size_t writeLen = sizeof(bucket->emptySize) + sizeof(bucket->topSize) + sizeof(bucket->n); + txn->recoveryUnit()->writingPtr(&bucket->emptySize, writeLen); + bucket->emptySize -= sizeof(KeyHeaderType); + bucket->n++; + + // This _KeyNode was marked for writing above. + KeyHeaderType& kn = getKeyHeader(bucket, keypos); + kn.prevChildBucket.Null(); + kn.recordLoc = recordLoc; + kn.setKeyDataOfs((short) _alloc(bucket, key.dataSize())); + char *p = dataAt(bucket, kn.keyDataOfs()); + txn->recoveryUnit()->writingPtr(p, key.dataSize()); + memcpy(p, key.data(), key.dataSize()); + return true; + } + + /** + * With this implementation, refPos == 0 disregards effect of refPos. index > 0 prevents + * creation of an empty bucket. + */ + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::mayDropKey(BucketType* bucket, int index, int refPos) { + return index > 0 + && (index != refPos) + && getKeyHeader(bucket, index).isUnused() + && getKeyHeader(bucket, index).prevChildBucket.isNull(); + } + + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::_packedDataSize(BucketType* bucket, int refPos) { + if (bucket->flags & Packed) { + return BtreeLayout::BucketSize - bucket->emptySize - BucketType::HeaderSize; + } + + int size = 0; + for (int j = 0; j < bucket->n; ++j) { + if (mayDropKey(bucket, j, refPos)) { + continue; + } + size += getFullKey(bucket, j).data.dataSize() + sizeof(KeyHeaderType); + } + + return size; + } + + /** + * When we delete things, we just leave empty space until the node is full and then we repack + * it. 
     */
    // Packs 'bucket' if it is not already packed.  Declares write intent on the whole
    // bucket (via btreemod()) before mutating it; no-op when the Packed flag is set.
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::_pack(OperationContext* txn,
                                        BucketType* bucket,
                                        const DiskLoc thisLoc,
                                        int &refPos) {

        invariant(getBucket(thisLoc) == bucket);

        // Already compact; nothing to do.
        if (bucket->flags & Packed) {
            return;
        }

        // btreemod() declares write intent for the bucket before we shuffle its contents.
        _packReadyForMod(btreemod(txn, bucket), refPos);
    }

    /**
     * Version when write intent already declared.
     *
     * Compacts the bucket in place: live key headers are slid left over droppable entries
     * (unused keys with no left child, per mayDropKey()), and each surviving key's bytes
     * are re-laid-out contiguously toward offset totalDataSize() using the 'temp' staging
     * buffer.  'refPos' is remapped so the caller's index keeps tracking the same logical
     * position across the compaction.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::_packReadyForMod(BucketType* bucket, int &refPos) {
        if (bucket->flags & Packed) {
            return;
        }

        int tdz = totalDataSize(bucket);
        char temp[BtreeLayout::BucketSize];
        int ofs = tdz;
        bucket->topSize = 0;

        // 'i' is the write cursor (next packed slot), 'j' the read cursor over all slots.
        int i = 0;
        for (int j = 0; j < bucket->n; j++) {
            if (mayDropKey(bucket, j, refPos)) {
                // key is unused and has no children - drop it
                continue;
            }

            if (i != j) {
                if (refPos == j) {
                    // i < j so j will never be refPos again
                    refPos = i;
                }
                getKeyHeader(bucket, i) = getKeyHeader(bucket, j);
            }

            // Copy this key's data bytes into the staging buffer at its new (packed) offset.
            short ofsold = getKeyHeader(bucket, i).keyDataOfs();
            int sz = getFullKey(bucket, i).data.dataSize();
            ofs -= sz;
            bucket->topSize += sz;
            memcpy(temp + ofs, dataAt(bucket, ofsold), sz);
            getKeyHeader(bucket, i).setKeyDataOfsSavingUse(ofs);
            ++i;
        }

        // refPos pointing one-past-the-end stays one-past-the-(new)-end.
        if (refPos == bucket->n) {
            refPos = i;
        }

        bucket->n = i;
        int dataUsed = tdz - ofs;
        memcpy(bucket->data + ofs, temp + ofs, dataUsed);

        bucket->emptySize = tdz - dataUsed - bucket->n * sizeof(KeyHeaderType);
        // NOTE(review): emptySize is read into a signed temp before the check -- presumably
        // so the invariant catches wraparound if the field is unsigned; confirm field type.
        int foo = bucket->emptySize;
        invariant( foo >= 0 );
        setPacked(bucket);
        assertValid(_indexName, bucket, _ordering);
    }

    // Keeps only the first N keys of 'bucket'.  Marks the bucket unpacked and immediately
    // repacks so the truncated keys' data bytes are reclaimed; 'refPos' is remapped by the
    // repack.
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::truncateTo(BucketType* bucket,
                                             int N,
                                             int &refPos) {
        bucket->n = N;
        setNotPacked(bucket);
        _packReadyForMod(bucket, refPos);
    }

    /**
     * In the standard btree algorithm, we would split based on the
     * existing keys _and_ the new key.  But that's more work to
     * implement, so we split the existing keys and then add the new key.
     *
     * There are several published heuristic algorithms for doing splits, but basically what you
     * want are (1) even balancing between the two sides and (2) a small split key so the parent can
     * have a larger branching factor.
     *
     * We just have a simple algorithm right now: if a key includes the halfway point (or 10% way
     * point) in terms of bytes, split on that key; otherwise split on the key immediately to the
     * left of the halfway point (or 10% point).
     *
     * This function is expected to be called on a packed bucket.
     */
    template <class BtreeLayout>
    int BtreeLogic<BtreeLayout>::splitPos(BucketType* bucket, int keypos) {
        invariant(bucket->n > 2);
        int split = 0;
        int rightSize = 0;

        // When splitting a btree node, if the new key is greater than all the other keys, we should
        // not do an even split, but a 90/10 split.  see SERVER-983.  TODO I think we only want to
        // do the 90% split on the rhs node of the tree.
        int rightSizeLimit = (bucket->topSize + sizeof(KeyHeaderType) * bucket->n)
                           / (keypos == bucket->n ? 10 : 2);

        // Walk from the rightmost key, accumulating byte sizes, until the right-hand side
        // would exceed its budget; split at that key.
        for (int i = bucket->n - 1; i > -1; --i) {
            rightSize += getFullKey(bucket, i).data.dataSize() + sizeof(KeyHeaderType);
            if (rightSize > rightSizeLimit) {
                split = i;
                break;
            }
        }

        // safeguards - we must not create an empty bucket
        if (split < 1) {
            split = 1;
        }
        else if (split > bucket->n - 2) {
            split = bucket->n - 2;
        }

        return split;
    }

    /**
     * Opens 'nAdd' key-header slots at the front of the bucket by shifting every existing
     * header right.  The new slots are uninitialized; the caller must fill them (e.g. via
     * setKey()).  Requires enough emptySize for the new headers.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::reserveKeysFront(BucketType* bucket, int nAdd) {
        invariant(bucket->emptySize >= int(sizeof(KeyHeaderType) * nAdd));
        bucket->emptySize -= sizeof(KeyHeaderType) * nAdd;
        for (int i = bucket->n - 1; i > -1; --i) {
            getKeyHeader(bucket, i + nAdd) = getKeyHeader(bucket, i);
        }
        bucket->n += nAdd;
    }

    /**
     * Overwrites slot 'i' of 'bucket' with the given record/child locs and copies the key
     * bytes into space obtained from _alloc().  Does not adjust bucket->n.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::setKey(BucketType* bucket,
                                         int i,
                                         const DiskLoc recordLoc,
                                         const KeyDataType& key,
                                         const DiskLoc prevChildBucket) {
        KeyHeaderType &kn = getKeyHeader(bucket, i);
        kn.recordLoc = recordLoc;
        kn.prevChildBucket = prevChildBucket;
        short ofs = (short) _alloc(bucket, key.dataSize());
        kn.setKeyDataOfs(ofs);
        char *p = dataAt(bucket, ofs);
        memcpy(p, key.data(), key.dataSize());
    }

    /**
     * Removes the first 'nDrop' keys by shifting the remaining headers left, then repacks
     * so the dropped keys' data bytes are reclaimed.  'refpos' is remapped by the repack.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::dropFront(BucketType* bucket,
                                            int nDrop,
                                            int &refpos) {
        for (int i = nDrop; i < bucket->n; ++i) {
            getKeyHeader(bucket, i - nDrop) = getKeyHeader(bucket, i);
        }
        bucket->n -= nDrop;
        setNotPacked(bucket);
        _packReadyForMod(bucket, refpos);
    }

    // Convenience overload for callers that don't need the 'bestParent' out-parameter of
    // the full customLocate(); delegates with a throwaway pair.
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn,
                                               DiskLoc* locInOut,
                                               int* keyOfsInOut,
                                               const BSONObj& keyBegin,
                                               int keyBeginLen,
                                               bool afterKey,
                                               const vector<const BSONElement*>& keyEnd,
                                               const vector<bool>& keyEndInclusive,
                                               int direction) const {
        pair<DiskLoc, int> unused;

        customLocate(txn,
                     locInOut,
                     keyOfsInOut,
                     keyBegin,
                     keyBeginLen,
                     afterKey,
                     keyEnd,
                     keyEndInclusive,
direction, + unused); + + skipUnusedKeys(txn, locInOut, keyOfsInOut, direction); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::advance(OperationContext* txn, + DiskLoc* bucketLocInOut, + int* posInOut, + int direction) const { + + *bucketLocInOut = advance(txn, *bucketLocInOut, posInOut, direction); + skipUnusedKeys(txn, bucketLocInOut, posInOut, direction); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::skipUnusedKeys(OperationContext* txn, + DiskLoc* loc, + int* pos, + int direction) const { + while (!loc->isNull() && !keyIsUsed(*loc, *pos)) { + *loc = advance(txn, *loc, pos, direction); + } + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::advanceTo(OperationContext* txn, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const { + + advanceToImpl(txn, + thisLocInOut, + keyOfsInOut, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + direction); + + skipUnusedKeys(txn, thisLocInOut, keyOfsInOut, direction); + } + + /** + * find smallest/biggest value greater-equal/less-equal than specified + * + * starting thisLoc + keyOfs will be strictly less than/strictly greater than + * keyBegin/keyBeginLen/keyEnd + * + * All the direction checks below allowed me to refactor the code, but possibly separate forward + * and reverse implementations would be more efficient + */ + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::advanceToImpl(OperationContext* txn, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const { + + BucketType* bucket = getBucket(*thisLocInOut); + + int l, h; + bool dontGoUp; + + if (direction > 0) { + l = *keyOfsInOut; + h = bucket->n - 1; + int 
cmpResult = customBSONCmp(getFullKey(bucket, h).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + dontGoUp = (cmpResult >= 0); + } + else { + l = 0; + h = *keyOfsInOut; + int cmpResult = customBSONCmp(getFullKey(bucket, l).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + dontGoUp = (cmpResult <= 0); + } + + pair<DiskLoc, int> bestParent; + + if (dontGoUp) { + // this comparison result assures h > l + if (!customFind(l, + h, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction, + thisLocInOut, + keyOfsInOut, + bestParent)) { + return; + } + } + else { + // go up parents until rightmost/leftmost node is >=/<= target or at top + while (!bucket->parent.isNull()) { + *thisLocInOut = bucket->parent; + bucket = getBucket(*thisLocInOut); + + if (direction > 0) { + if (customBSONCmp(getFullKey(bucket, bucket->n - 1).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction) >= 0 ) { + break; + } + } + else { + if (customBSONCmp(getFullKey(bucket, 0).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction) <= 0) { + break; + } + } + } + } + + customLocate(txn, + thisLocInOut, + keyOfsInOut, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + direction, + bestParent); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::customLocate(OperationContext* txn, + DiskLoc* locInOut, + int* keyOfsInOut, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction, + pair<DiskLoc, int>& bestParent) const { + + BucketType* bucket = getBucket(*locInOut); + + if (0 == bucket->n) { + *locInOut = DiskLoc(); + return; + } + + // go down until find smallest/biggest >=/<= target + for (;;) { + int 
l = 0; + int h = bucket->n - 1; + + // +direction: 0, -direction: h + int z = (direction > 0) ? 0 : h; + + // leftmost/rightmost key may possibly be >=/<= search key + int res = customBSONCmp(getFullKey(bucket, z).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + + + if (direction * res >= 0) { + DiskLoc next; + *keyOfsInOut = z; + + if (direction > 0) { + dassert(z == 0); + next = getKeyHeader(bucket, 0).prevChildBucket; + } + else { + next = bucket->nextChild; + } + + if (!next.isNull()) { + bestParent = pair<DiskLoc, int>(*locInOut, *keyOfsInOut); + *locInOut = next; + bucket = getBucket(*locInOut); + continue; + } + else { + return; + } + } + + res = customBSONCmp(getFullKey(bucket, h - z).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction); + + if (direction * res < 0) { + DiskLoc next; + if (direction > 0) { + next = bucket->nextChild; + } + else { + next = getKeyHeader(bucket, 0).prevChildBucket; + } + + if (next.isNull()) { + // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc() + *locInOut = bestParent.first; + *keyOfsInOut = bestParent.second; + return; + } + else { + *locInOut = next; + bucket = getBucket(*locInOut); + continue; + } + } + + if (!customFind(l, + h, + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + _ordering, + direction, + locInOut, + keyOfsInOut, + bestParent)) { + return; + } + + bucket = getBucket(*locInOut); + } + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::customFind(int low, + int high, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + const Ordering& order, + int direction, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + pair<DiskLoc, int>& bestParent) const { + + const BucketType* bucket = getBucket(*thisLocInOut); + + for (;;) { + if (low + 1 == high) 
{ + *keyOfsInOut = (direction > 0) ? high : low; + DiskLoc next = getKeyHeader(bucket, high).prevChildBucket; + if (!next.isNull()) { + bestParent = make_pair(*thisLocInOut, *keyOfsInOut); + *thisLocInOut = next; + return true; + } + else { + return false; + } + } + + int middle = low + (high - low) / 2; + + int cmp = customBSONCmp(getFullKey(bucket, middle).data.toBson(), + keyBegin, + keyBeginLen, + afterKey, + keyEnd, + keyEndInclusive, + order, + direction); + + if (cmp < 0) { + low = middle; + } + else if (cmp > 0) { + high = middle; + } + else { + if (direction < 0) { + low = middle; + } + else { + high = middle; + } + } + } + } + + /** + * NOTE: Currently the Ordering implementation assumes a compound index will not have more keys + * than an unsigned variable has bits. The same assumption is used in the implementation below + * with respect to the 'mask' variable. + * + * 'l' is a regular bsonobj + * + * 'rBegin' is composed partly of an existing bsonobj, and the remaining keys are taken from a + * vector of elements that frequently changes + * + * see https://jira.mongodb.org/browse/SERVER-371 + */ + // static + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::customBSONCmp(const BSONObj& l, + const BSONObj& rBegin, + int rBeginLen, + bool rSup, + const vector<const BSONElement*>& rEnd, + const vector<bool>& rEndInclusive, + const Ordering& o, + int direction) const { + // XXX: make this readable + BSONObjIterator ll( l ); + BSONObjIterator rr( rBegin ); + vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); + vector< bool >::const_iterator inc = rEndInclusive.begin(); + unsigned mask = 1; + for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { + BSONElement lll = ll.next(); + BSONElement rrr = rr.next(); + ++rr2; + ++inc; + + int x = lll.woCompare( rrr, false ); + if ( o.descending( mask ) ) + x = -x; + if ( x != 0 ) + return x; + } + if ( rSup ) { + return -direction; + } + for( ; ll.more(); mask <<= 1 ) { + BSONElement lll = 
ll.next(); + BSONElement rrr = **rr2; + ++rr2; + int x = lll.woCompare( rrr, false ); + if ( o.descending( mask ) ) + x = -x; + if ( x != 0 ) + return x; + if ( !*inc ) { + return -direction; + } + ++inc; + } + return 0; + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::exists(OperationContext* txn, const KeyDataType& key) const { + int position = 0; + + // Find the DiskLoc + bool found; + + DiskLoc bucket = _locate(txn, getRootLoc(), key, &position, &found, minDiskLoc, 1); + + while (!bucket.isNull()) { + FullKey fullKey = getFullKey(getBucket(bucket), position); + if (fullKey.header.isUsed()) { + return fullKey.data.woEqual(key); + } + bucket = advance(txn, bucket, &position, 1); + } + + return false; + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) const { + KeyDataOwnedType theKey(key); + if (!wouldCreateDup(txn, theKey, loc)) { + return Status::OK(); + } + + return Status(ErrorCodes::DuplicateKey, dupKeyError(theKey)); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::wouldCreateDup(OperationContext* txn, + const KeyDataType& key, + const DiskLoc self) const { + int position; + bool found; + + DiskLoc posLoc = _locate(txn, getRootLoc(), key, &position, &found, minDiskLoc, 1); + + while (!posLoc.isNull()) { + FullKey fullKey = getFullKey(getBucket(posLoc), position); + if (fullKey.header.isUsed()) { + // TODO: we may not need fullKey.data until we know fullKey.header.isUsed() here + // and elsewhere. 
+ if (fullKey.data.woEqual(key)) { + return fullKey.recordLoc != self; + } + break; + } + + posLoc = advance(txn, posLoc, &position, 1); + } + return false; + } + + template <class BtreeLayout> + string BtreeLogic<BtreeLayout>::dupKeyError(const KeyDataType& key) const { + stringstream ss; + ss << "E11000 duplicate key error "; + ss << "index: " << _indexName << " "; + ss << "dup key: " << key.toString(); + return ss.str(); + } + + /** + * Find a key within this btree bucket. + * + * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the + * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, our + * performance is still good. + * + * assertIfDup: if the key exists (ignoring the recordLoc), uassert + * + * pos: for existing keys k0...kn-1. + * returns # it goes BEFORE. so key[pos-1] < key < key[pos] + * returns n if it goes after the last existing key. + * note result might be an Unused location! + */ + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::_find(OperationContext* txn, + BucketType* bucket, + const KeyDataType& key, + const DiskLoc& recordLoc, + bool errorIfDup, + int* keyPositionOut, + bool* foundOut) const { + + // XXX: fix the ctor for DiskLoc56bit so we can just convert w/o assignment operator + LocType genericRecordLoc; + genericRecordLoc = recordLoc; + + bool dupsChecked = false; + + int low = 0; + int high = bucket->n - 1; + int middle = (low + high) / 2; + + while (low <= high) { + FullKey fullKey = getFullKey(bucket, middle); + int cmp = key.woCompare(fullKey.data, _ordering); + + // The key data is the same. + if (0 == cmp) { + // Found the key in this bucket. If we're checking for dups... + if (errorIfDup) { + if (fullKey.header.isUnused()) { + // It's ok that the key is there if it is unused. We need to check that + // there aren't other entries for the key then. 
as it is very rare that + // we get here, we don't put any coding effort in here to make this + // particularly fast + if (!dupsChecked) { + // This is expensive and we only want to do it once(? -- when would + // it happen twice). + dupsChecked = true; + if (exists(txn, key)) { + if (wouldCreateDup(txn, key, genericRecordLoc)) { + return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000); + } + else { + return Status(ErrorCodes::UniqueIndexViolation, "FIXME"); + } + } + } + } + else { + if (fullKey.recordLoc == recordLoc) { + return Status(ErrorCodes::UniqueIndexViolation, "FIXME"); + } + else { + return Status(ErrorCodes::DuplicateKey, dupKeyError(key), 11000); + } + } + } + + // If we're here dup keys are allowed, or the key is a dup but unused. + LocType recordLocCopy = fullKey.recordLoc; + + // We clear this bit so we can test equality without the used bit messing us up. + // XXX: document this + // XXX: kill this GETOFS stuff + recordLocCopy.GETOFS() &= ~1; + + // Set 'cmp' to the comparison w/the DiskLoc and fall through below. + cmp = recordLoc.compare(recordLocCopy); + } + + if (cmp < 0) { + high = middle - 1; + } + else if (cmp > 0) { + low = middle + 1; + } + else { + // Found it! + *keyPositionOut = middle; + *foundOut = true; + return Status::OK(); + } + + middle = (low + high) / 2; + } + + // Not found. + *keyPositionOut = low; + + // Some debugging checks. 
+ if (low != bucket->n) { + wassert(key.woCompare(getFullKey(bucket, low).data, _ordering) <= 0); + + if (low > 0) { + if (getFullKey(bucket, low - 1).data.woCompare(key, _ordering) > 0) { + DEV { + log() << key.toString() << endl; + log() << getFullKey(bucket, low - 1).data.toString() << endl; + } + wassert(false); + } + } + } + + *foundOut = false; + return Status::OK(); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::delBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) { + invariant(bucketLoc != getRootLoc()); + + _bucketDeletion->aboutToDeleteBucket(bucketLoc); + + BucketType* p = getBucket(bucket->parent); + int parentIdx = indexInParent(txn, bucket, bucketLoc); + *txn->recoveryUnit()->writing(&childLocForPos(p, parentIdx)) = DiskLoc(); + deallocBucket(txn, bucket, bucketLoc); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::deallocBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) { + bucket->n = BtreeLayout::INVALID_N_SENTINEL; + bucket->parent.Null(); + _recordStore->deleteRecord(txn, bucketLoc); + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::restorePosition(OperationContext* txn, + const BSONObj& savedKey, + const DiskLoc& savedLoc, + int direction, + DiskLoc* bucketLocInOut, + int* keyOffsetInOut) const { + + // _keyOffset is -1 if the bucket was deleted. When buckets are deleted the Btree calls + // a clientcursor function that calls down to all BTree buckets. Really, this deletion + // thing should be kept BTree-internal. This'll go away with finer grained locking: we + // can hold on to a bucket for as long as we need it. 
        // Offset of -1 means the bucket was deleted while we were away (see comment above);
        // fall back to a full locate by the saved key/loc.
        if (-1 == *keyOffsetInOut) {
            locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
            return;
        }

        invariant(*keyOffsetInOut >= 0);

        BucketType* bucket = getBucket(*bucketLocInOut);
        invariant(bucket);
        invariant(BtreeLayout::INVALID_N_SENTINEL != bucket->n);

        // Fast path: the saved entry is still in the same slot.
        if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
            skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
            return;
        }

        // The saved key may have moved one slot left (presumably after a deletion before
        // it); check the previous slot before resorting to a full relocate.
        if (*keyOffsetInOut > 0) {
            (*keyOffsetInOut)--;
            if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
                skipUnusedKeys(txn, bucketLocInOut, keyOffsetInOut, direction);
                return;
            }
        }

        // Slow path: relocate the saved key/loc from scratch.
        locate(txn, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
    }

    /**
     * Returns true if slot 'keyPos' of 'bucket' still holds exactly the saved
     * (key, recordLoc) pair; key bytes are compared with binaryEqual().
     */
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::_keyIsAt(const BSONObj& savedKey,
                                           const DiskLoc& savedLoc,
                                           BucketType* bucket,
                                           int keyPos) const {
        if (keyPos >= bucket->n) {
            return false;
        }

        FullKey key = getFullKey(bucket, keyPos);
        if (!key.data.toBson().binaryEqual(savedKey)) {
            return false;
        }
        return key.header.recordLoc == savedLoc;
    }

    /**
     * Deletes the key at index 'p' from 'bucket'.
     *
     * May delete the bucket 'bucket' rendering 'bucketLoc' invalid.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::delKeyAtPos(OperationContext* txn,
                                              BucketType* bucket,
                                              const DiskLoc bucketLoc,
                                              int p) {
        invariant(bucket->n > 0);
        DiskLoc left = childLocForPos(bucket, p);
        if (bucket->n == 1) {
            if (left.isNull() && bucket->nextChild.isNull()) {
                _delKeyAtPos(bucket, p);
                if (isHead(bucket)) {
                    // we don't delete the top bucket ever
                }
                else {
                    if (!mayBalanceWithNeighbors(txn, bucket, bucketLoc)) {
                        // An empty bucket is only allowed as a transient state.  If
                        // there are no neighbors to balance with, we delete ourselves.
                        // This condition is only expected in legacy btrees.
+ delBucket(txn, bucket, bucketLoc); + } + } + return; + } + deleteInternalKey(txn, bucket, bucketLoc, p); + return; + } + + if (left.isNull()) { + _delKeyAtPos(bucket, p); + mayBalanceWithNeighbors(txn, bucket, bucketLoc); + } + else { + deleteInternalKey(txn, bucket, bucketLoc, p); + } + } + + /** + * This function replaces the specified key (k) by either the prev or next key in the btree + * (k'). We require that k have either a left or right child. If k has a left child, we set k' + * to the prev key of k, which must be a leaf present in the left child. If k does not have a + * left child, we set k' to the next key of k, which must be a leaf present in the right child. + * When we replace k with k', we copy k' over k (which may cause a split) and then remove k' + * from its original location. Because k' is stored in a descendent of k, replacing k by k' + * will not modify the storage location of the original k', and we can easily remove k' from its + * original location. + * + * This function is only needed in cases where k has a left or right child; in other cases a + * simpler key removal implementation is possible. + * + * NOTE on noncompliant BtreeBuilder btrees: It is possible (though likely rare) for btrees + * created by BtreeBuilder to have k' that is not a leaf, see SERVER-2732. These cases are + * handled in the same manner as described in the "legacy btree structures" note below. + * + * NOTE on legacy btree structures: In legacy btrees, k' can be a nonleaf. In such a case we + * 'delete' k by marking it as an unused node rather than replacing it with k'. Also, k' may be + * a leaf but marked as an unused node. In such a case we replace k by k', preserving the key's + * unused marking. This function is only expected to mark a key as unused when handling a + * legacy btree. 
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::deleteInternalKey(OperationContext* txn,
                                                    BucketType* bucket,
                                                    const DiskLoc bucketLoc,
                                                    int keypos) {
        DiskLoc lchild = childLocForPos(bucket, keypos);
        DiskLoc rchild = childLocForPos(bucket, keypos + 1);
        invariant(!lchild.isNull() || !rchild.isNull());
        // Walk toward the in-order predecessor (left child present) or successor (only a
        // right child) of the key being deleted -- the k' of the comment above.
        int advanceDirection = lchild.isNull() ? 1 : -1;
        int advanceKeyOfs = keypos;
        DiskLoc advanceLoc = advance(txn, bucketLoc, &advanceKeyOfs, advanceDirection);
        // advanceLoc must be a descendant of thisLoc, because thisLoc has a
        // child in the proper direction and all descendants of thisLoc must be
        // nonempty because they are not the root.
        BucketType* advanceBucket = getBucket(advanceLoc);

        // k' is not a leaf (legacy / BtreeBuilder btrees, see the note above): mark k as
        // unused instead of replacing it.
        if (!childLocForPos(advanceBucket, advanceKeyOfs).isNull()
            || !childLocForPos(advanceBucket, advanceKeyOfs + 1).isNull()) {

            markUnused(bucket, keypos);
            return;
        }

        FullKey kn = getFullKey(advanceBucket, advanceKeyOfs);
        // Because advanceLoc is a descendant of thisLoc, updating thisLoc will
        // not affect packing or keys of advanceLoc and kn will be stable
        // during the following setInternalKey()
        setInternalKey(txn, bucket, bucketLoc, keypos, kn.recordLoc, kn.data,
                       childLocForPos(bucket, keypos),
                       childLocForPos(bucket, keypos + 1));
        // Now remove k' from its original (descendant) location.
        delKeyAtPos(txn, btreemod(txn, advanceBucket), advanceLoc, advanceKeyOfs);
    }

    /**
     * Replaces an empty bucket (n == 0) that still has a right child with that child:
     * the head pointer (when this bucket is the root) or the parent's child pointer is
     * redirected to nextChild.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::replaceWithNextChild(OperationContext* txn,
                                                       BucketType* bucket,
                                                       const DiskLoc bucketLoc) {

        invariant(bucket->n == 0 && !bucket->nextChild.isNull() );
        if (bucket->parent.isNull()) {
            // This bucket is the root: promote nextChild to be the new head.
            invariant(getRootLoc() == bucketLoc);
            _headManager->setHead(txn, bucket->nextChild);
        }
        else {
            // Redirect the parent's pointer at this bucket to point at nextChild instead.
            BucketType* parentBucket = getBucket(bucket->parent);
            int bucketIndexInParent = indexInParent(txn, bucket, bucketLoc);
            *txn->recoveryUnit()->writing(&childLocForPos(parentBucket, bucketIndexInParent)) =
                bucket->nextChild;
        }

*txn->recoveryUnit()->writing(&getBucket(bucket->nextChild)->parent) = bucket->parent; + _bucketDeletion->aboutToDeleteBucket(bucketLoc); + deallocBucket(txn, bucket, bucketLoc); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::canMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const int leftIndex) { + invariant(leftIndex >= 0 && leftIndex < bucket->n); + + DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex); + DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1); + + if (leftNodeLoc.isNull() || rightNodeLoc.isNull()) { + return false; + } + + int pos = 0; + + BucketType* leftBucket = getBucket(leftNodeLoc); + BucketType* rightBucket = getBucket(rightNodeLoc); + + int sum = BucketType::HeaderSize + + _packedDataSize(leftBucket, pos) + + _packedDataSize(rightBucket, pos) + + getFullKey(bucket, leftIndex).data.dataSize() + + sizeof(KeyHeaderType); + + return sum <= BtreeLayout::BucketSize; + } + + /** + * This implementation must respect the meaning and value of lowWaterMark. Also see comments in + * splitPos(). + */ + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::_rebalancedSeparatorPos(OperationContext* txn, + BucketType* bucket, + int leftIndex) { + int split = -1; + int rightSize = 0; + + const BucketType* l = childForPos(bucket, leftIndex); + const BucketType* r = childForPos(bucket, leftIndex + 1); + + int KNS = sizeof(KeyHeaderType); + int rightSizeLimit = ( l->topSize + + l->n * KNS + + getFullKey(bucket, leftIndex).data.dataSize() + + KNS + + r->topSize + + r->n * KNS ) / 2; + + // This constraint should be ensured by only calling this function + // if we go below the low water mark. 
+ invariant(rightSizeLimit < BtreeLayout::BucketBodySize); + + for (int i = r->n - 1; i > -1; --i) { + rightSize += getFullKey(r, i).data.dataSize() + KNS; + if (rightSize > rightSizeLimit) { + split = l->n + 1 + i; + break; + } + } + + if (split == -1) { + rightSize += getFullKey(bucket, leftIndex).data.dataSize() + KNS; + if (rightSize > rightSizeLimit) { + split = l->n; + } + } + + if (split == -1) { + for (int i = l->n - 1; i > -1; --i) { + rightSize += getFullKey(l, i).data.dataSize() + KNS; + if (rightSize > rightSizeLimit) { + split = i; + break; + } + } + } + + // safeguards - we must not create an empty bucket + if (split < 1) { + split = 1; + } + else if (split > l->n + 1 + r->n - 2) { + split = l->n + 1 + r->n - 2; + } + + return split; + } + + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::doMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex) { + + DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex); + DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1); + + BucketType* l = btreemod(txn, getBucket(leftNodeLoc)); + BucketType* r = btreemod(txn, getBucket(rightNodeLoc)); + + int pos = 0; + _packReadyForMod(l, pos); + _packReadyForMod(r, pos); + + // We know the additional keys below will fit in l because canMergeChildren() must be true. 
+ int oldLNum = l->n; + // left child's right child becomes old parent key's left child + FullKey knLeft = getFullKey(bucket, leftIndex); + pushBack(l, knLeft.recordLoc, knLeft.data, l->nextChild); + + for (int i = 0; i < r->n; ++i) { + FullKey kn = getFullKey(r, i); + pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket); + } + + l->nextChild = r->nextChild; + fixParentPtrs(txn, l, leftNodeLoc, oldLNum); + delBucket(txn, r, rightNodeLoc); + + childLocForPos(bucket, leftIndex + 1) = leftNodeLoc; + childLocForPos(bucket, leftIndex) = DiskLoc(); + _delKeyAtPos(bucket, leftIndex, true); + + if (bucket->n == 0) { + // Will trash bucket and bucketLoc. + // + // TODO To ensure all leaves are of equal height, we should ensure this is only called + // on the root. + replaceWithNextChild(txn, bucket, bucketLoc); + } + else { + mayBalanceWithNeighbors(txn, bucket, bucketLoc); + } + } + + template <class BtreeLayout> + int BtreeLogic<BtreeLayout>::indexInParent(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) const { + invariant(!bucket->parent.isNull()); + const BucketType* p = getBucket(bucket->parent); + if (p->nextChild == bucketLoc) { + return p->n; + } + + for (int i = 0; i < p->n; ++i) { + if (getKeyHeader(p, i).prevChildBucket == bucketLoc) { + return i; + } + } + + log() << "ERROR: can't find ref to child bucket.\n"; + log() << "child: " << bucketLoc << "\n"; + //dump(); + log() << "Parent: " << bucket->parent << "\n"; + //p->dump(); + invariant(false); + return -1; // just to compile + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::tryBalanceChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex) { + + // If we can merge, then we must merge rather than balance to preserve bucket utilization + // constraints. 
        // NOTE(review): tail of tryBalanceChildren() — its signature is above this excerpt.
        // If the two children can be merged we prefer the (cheaper) merge path, so report
        // "no balance performed" and let the caller fall through to doMergeChildren().
        if (canMergeChildren(txn, bucket, bucketLoc, leftIndex)) {
            return false;
        }

        doBalanceChildren(txn, btreemod(txn, bucket), bucketLoc, leftIndex);
        return true;
    }

    /**
     * Moves keys from the (over-full) left child 'l' into the (under-full) right child 'r',
     * rotating the old separator key at bucket[leftIndex] down into 'r' and promoting the key
     * at position 'split' in 'l' as the new separator.  All three buckets must already be
     * write-intent declared by the caller.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::doBalanceLeftToRight(OperationContext* txn,
                                                       BucketType* bucket,
                                                       const DiskLoc bucketLoc,
                                                       int leftIndex,
                                                       int split,
                                                       BucketType* l,
                                                       const DiskLoc lchild,
                                                       BucketType* r,
                                                       const DiskLoc rchild) {

        // TODO maybe do some audits the same way pushBack() does? As a precondition, rchild + the
        // old separator are <= half a body size, and lchild is at most completely full. Based on
        // the value of split, rchild will get <= half of the total bytes which is at most 75% of a
        // full body. So rchild will have room for the following keys:
        int rAdd = l->n - split;
        reserveKeysFront(r, rAdd);

        // Copy keys (split, l->n) from the left child into the freshly reserved slots of 'r',
        // keeping each key's left-child pointer.
        for (int i = split + 1, j = 0; i < l->n; ++i, ++j) {
            FullKey kn = getFullKey(l, i);
            setKey(r, j, kn.recordLoc, kn.data, kn.prevChildBucket);
        }

        // The old separator becomes the last of the moved keys; its left child is l's old
        // nextChild so no subtree is orphaned.
        FullKey leftIndexKN = getFullKey(bucket, leftIndex);
        setKey(r, rAdd - 1, leftIndexKN.recordLoc, leftIndexKN.data, l->nextChild);

        // The moved subtrees now hang off 'r'; update their parent pointers.
        fixParentPtrs(txn, r, rchild, 0, rAdd - 1);

        FullKey kn = getFullKey(l, split);
        l->nextChild = kn.prevChildBucket;

        // Because lchild is a descendant of thisLoc, updating thisLoc will not affect packing or
        // keys of lchild and kn will be stable during the following setInternalKey()
        setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);

        // lchild and rchild cannot be merged, so there must be >0 (actually more) keys to the left
        // of split.
        int zeropos = 0;
        truncateTo(l, split, zeropos);
    }

    /**
     * Mirror image of doBalanceLeftToRight(): moves keys from the (over-full) right child 'r'
     * into the (under-full) left child 'l', rotating the old separator down into 'l' and
     * promoting the appropriate key from 'r' as the new separator.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::doBalanceRightToLeft(OperationContext* txn,
                                                       BucketType* bucket,
                                                       const DiskLoc bucketLoc,
                                                       int leftIndex,
                                                       int split,
                                                       BucketType* l,
                                                       const DiskLoc lchild,
                                                       BucketType* r,
                                                       const DiskLoc rchild) {
        // As a precondition, lchild + the old separator are <= half a body size,
        // and rchild is at most completely full. Based on the value of split,
        // lchild will get less than half of the total bytes which is at most 75%
        // of a full body. So lchild will have room for the following keys:
        int lN = l->n;

        {
            // left child's right child becomes old parent key's left child
            FullKey kn = getFullKey(bucket, leftIndex);
            pushBack(l, kn.recordLoc, kn.data, l->nextChild);
        }

        // Append the first (split - lN - 1) keys of the right child onto 'l'.
        for (int i = 0; i < split - lN - 1; ++i) {
            FullKey kn = getFullKey(r, i);
            pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket);
        }

        {
            // The key at (split - lN - 1) in 'r' is promoted as the new separator; its left
            // child becomes l's new nextChild.
            FullKey kn = getFullKey(r, split - lN - 1);
            l->nextChild = kn.prevChildBucket;
            // Child lN was lchild's old nextChild, and don't need to fix that one.
            fixParentPtrs(txn, l, lchild, lN + 1, l->n);
            // Because rchild is a descendant of thisLoc, updating thisLoc will
            // not affect packing or keys of rchild and kn will be stable
            // during the following setInternalKey()
            setInternalKey(txn, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild);
        }

        // lchild and rchild cannot be merged, so there must be >0 (actually more)
        // keys to the right of split.
        int zeropos = 0;
        dropFront(r, split - lN, zeropos);
    }

    /**
     * Rebalances the two children on either side of the separator at bucket[leftIndex],
     * packing both children first and then shifting keys toward whichever side is light.
     * Caller guarantees the children cannot simply be merged (see tryBalanceChildren()).
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::doBalanceChildren(OperationContext* txn,
                                                    BucketType* bucket,
                                                    const DiskLoc bucketLoc,
                                                    int leftIndex) {

        DiskLoc lchild = childLocForPos(bucket, leftIndex);
        DiskLoc rchild = childLocForPos(bucket, leftIndex + 1);

        int zeropos = 0;
        BucketType* l = btreemod(txn, getBucket(lchild));
        _packReadyForMod(l, zeropos);

        BucketType* r = btreemod(txn, getBucket(rchild));
        _packReadyForMod(r, zeropos);

        int split = _rebalancedSeparatorPos(txn, bucket, leftIndex);

        // By definition, if we are below the low water mark and cannot merge
        // then we must actively balance.
        invariant(split != l->n);
        if (split < l->n) {
            doBalanceLeftToRight(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
        }
        else {
            doBalanceRightToLeft(txn, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild);
        }
    }

    /**
     * Called when 'bucket' has fallen below the low water mark.  Tries, in order: balance with
     * the right neighbor, balance with the left neighbor, merge with the right neighbor, merge
     * with the left neighbor.  Returns true if any restructuring took place.
     */
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::mayBalanceWithNeighbors(OperationContext* txn,
                                                          BucketType* bucket,
                                                          const DiskLoc bucketLoc) {
        // The root has no siblings; nothing to balance with.
        if (bucket->parent.isNull()) {
            return false;
        }

        // Still sufficiently full — leave it alone.
        if (_packedDataSize(bucket, 0) >= lowWaterMark()) {
            return false;
        }

        BucketType* p = getBucket(bucket->parent);
        int parentIdx = indexInParent(txn, bucket, bucketLoc);

        // TODO will missing neighbor case be possible long term? Should we try to merge/balance
        // somehow in that case if so?
        bool mayBalanceRight = (parentIdx < p->n) && !childLocForPos(p, parentIdx + 1).isNull();
        bool mayBalanceLeft = ( parentIdx > 0 ) && !childLocForPos(p, parentIdx - 1).isNull();

        // Balance if possible on one side - we merge only if absolutely necessary to preserve btree
        // bucket utilization constraints since that's a more heavy duty operation (especially if we
        // must re-split later).
        if (mayBalanceRight && tryBalanceChildren(txn, p, bucket->parent, parentIdx)) {
            return true;
        }

        if (mayBalanceLeft && tryBalanceChildren(txn, p, bucket->parent, parentIdx - 1)) {
            return true;
        }

        BucketType* pm = btreemod(txn, getBucket(bucket->parent));
        if (mayBalanceRight) {
            doMergeChildren(txn, pm, bucket->parent, parentIdx);
            return true;
        }
        else if (mayBalanceLeft) {
            doMergeChildren(txn, pm, bucket->parent, parentIdx - 1);
            return true;
        }

        return false;
    }

    /**
     * Removes the entry <key, recordLoc> from the index if present.
     * Returns true if the key was found (and deleted), false otherwise.
     */
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::unindex(OperationContext* txn,
                                          const BSONObj& key,
                                          const DiskLoc& recordLoc) {
        int pos;
        bool found = false;
        KeyDataOwnedType ownedKey(key);

        DiskLoc loc = _locate(txn, getRootLoc(), ownedKey, &pos, &found, recordLoc, 1);
        if (found) {
            BucketType* bucket = btreemod(txn, getBucket(loc));
            delKeyAtPos(txn, bucket, loc, pos);
            assertValid(_indexName, getRoot(), _ordering);
        }
        return found;
    }

    // The tree is empty iff the root bucket holds no keys.
    template <class BtreeLayout>
    bool BtreeLogic<BtreeLayout>::isEmpty() const {
        return getRoot()->n == 0;
    }

    /**
     * Rewrites the parent pointer of every child bucket in [firstIndex, lastIndex] to point
     * back at 'bucketLoc'.  lastIndex == -1 means "through nextChild" (index bucket->n).
     *
     * This can cause a lot of additional page writes when we assign buckets to different parents.
     * Maybe get rid of parent ptrs?
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::fixParentPtrs(OperationContext* txn,
                                                BucketType* bucket,
                                                const DiskLoc bucketLoc,
                                                int firstIndex,
                                                int lastIndex) {

        invariant(getBucket(bucketLoc) == bucket);

        if (lastIndex == -1) {
            lastIndex = bucket->n;
        }

        for (int i = firstIndex; i <= lastIndex; i++) {
            const DiskLoc childLoc = childLocForPos(bucket, i);
            if (!childLoc.isNull()) {
                *txn->recoveryUnit()->writing(&getBucket(childLoc)->parent) = bucketLoc;
            }
        }
    }

    /**
     * Replaces the key at 'keypos' with <key, recordLoc> and wires up 'lchild'/'rchild' as the
     * new key's children.  Used when promoting a separator during merge/balance.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::setInternalKey(OperationContext* txn,
                                                 BucketType* bucket,
                                                 const DiskLoc bucketLoc,
                                                 int keypos,
                                                 const DiskLoc recordLoc,
                                                 const KeyDataType& key,
                                                 const DiskLoc lchild,
                                                 const DiskLoc rchild) {
        childLocForPos(bucket, keypos).Null();
        // This may leave the bucket empty (n == 0) which is ok only as a transient state. In the
        // instant case, the implementation of insertHere behaves correctly when n == 0 and as a
        // side effect increments n.
        _delKeyAtPos(bucket, keypos, true);

        // Ensure we do not orphan neighbor's old child.
        invariant(childLocForPos(bucket, keypos ) == rchild);

        // Just set temporarily - required to pass validation in insertHere()
        childLocForPos(bucket, keypos) = lchild;

        insertHere(txn, bucketLoc, keypos, key, recordLoc, lchild, rchild);
    }

    /**
     * insert a key in this bucket, splitting if necessary.
     *
     * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. NOTE
     * this function may free some data, and as a result the value passed for keypos may be invalid
     * after calling insertHere()
     *
     * Some of the write intent signaling below relies on the implementation of the optimized write
     * intent code in basicInsert().
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::insertHere(OperationContext* txn,
                                             const DiskLoc bucketLoc,
                                             int pos,
                                             const KeyDataType& key,
                                             const DiskLoc recordLoc,
                                             const DiskLoc leftChildLoc,
                                             const DiskLoc rightChildLoc) {

        BucketType* bucket = getBucket(bucketLoc);

        if (!basicInsert(txn, bucket, bucketLoc, pos, key, recordLoc)) {
            // If basicInsert() fails, the bucket will be packed as required by split().
            split(txn, btreemod(txn, bucket), bucketLoc, pos, recordLoc, key, leftChildLoc, rightChildLoc);
            return;
        }

        KeyHeaderType* kn = &getKeyHeader(bucket, pos);
        if (pos + 1 == bucket->n) {
            // It's the last key.
            if (bucket->nextChild != leftChildLoc) {
                // XXX log more
                invariant(false);
            }
            kn->prevChildBucket = bucket->nextChild;
            invariant(kn->prevChildBucket == leftChildLoc);
            *txn->recoveryUnit()->writing(&bucket->nextChild) = rightChildLoc;
            if (!rightChildLoc.isNull()) {
                *txn->recoveryUnit()->writing(&getBucket(rightChildLoc)->parent) = bucketLoc;
            }
        }
        else {
            kn->prevChildBucket = leftChildLoc;
            if (getKeyHeader(bucket, pos + 1).prevChildBucket != leftChildLoc) {
                // XXX: log more
                invariant(false);
            }
            const LocType *pc = &getKeyHeader(bucket, pos + 1).prevChildBucket;
            // Intent declared in basicInsert()
            *const_cast<LocType*>(pc) = rightChildLoc;
            if (!rightChildLoc.isNull()) {
                *txn->recoveryUnit()->writing(&getBucket(rightChildLoc)->parent) = bucketLoc;
            }
        }
    }

    /**
     * Splits 'bucket' around splitPos(), moving the upper keys into a freshly allocated right
     * bucket, promoting the key at the split point into the parent (creating a new root if
     * needed), and finally inserting <key, recordLoc> into whichever half it belongs to.
     */
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::split(OperationContext* txn,
                                        BucketType* bucket,
                                        const DiskLoc bucketLoc,
                                        int keypos,
                                        const DiskLoc recordLoc,
                                        const KeyDataType& key,
                                        const DiskLoc lchild,
                                        const DiskLoc rchild) {

        int split = splitPos(bucket, keypos);
        DiskLoc rLoc = _addBucket(txn);
        BucketType* r = btreemod(txn, getBucket(rLoc));

        // Keys above the split point move to the new right bucket.
        for (int i = split + 1; i < bucket->n; i++) {
            FullKey kn = getFullKey(bucket, i);
            pushBack(r, kn.recordLoc, kn.data, kn.prevChildBucket);
        }
        r->nextChild = bucket->nextChild;
        assertValid(_indexName, r, _ordering);

        r = NULL;
        fixParentPtrs(txn, getBucket(rLoc), rLoc);

        FullKey splitkey = getFullKey(bucket, split);
        // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r)
        bucket->nextChild = splitkey.prevChildBucket;

        // Because thisLoc is a descendant of parent, updating parent will not affect packing or
        // keys of thisLoc and splitkey will be stable during the following:

        if (bucket->parent.isNull()) {
            // promote splitkey to a parent node; make a new parent if we were the root
            DiskLoc L = _addBucket(txn);
            BucketType* p = btreemod(txn, getBucket(L));
            pushBack(p, splitkey.recordLoc, splitkey.data, bucketLoc);
            p->nextChild = rLoc;
            assertValid(_indexName, p, _ordering);
            bucket->parent = L;
            _headManager->setHead(txn, L);
            *txn->recoveryUnit()->writing(&getBucket(rLoc)->parent) = bucket->parent;
        }
        else {
            // set this before calling _insert - if it splits it will do fixParent() logic and
            // change the value.
            *txn->recoveryUnit()->writing(&getBucket(rLoc)->parent) = bucket->parent;
            _insert(txn,
                    getBucket(bucket->parent),
                    bucket->parent,
                    splitkey.data,
                    splitkey.recordLoc,
                    true, // dupsallowed
                    bucketLoc,
                    rLoc);
        }

        int newpos = keypos;
        // note this may trash splitkey.key. thus we had to promote it before finishing up here.
        truncateTo(bucket, split, newpos);

        // add our new key, there is room now
        if (keypos <= split) {
            insertHere(txn, bucketLoc, newpos, key, recordLoc, lchild, rchild);
        }
        else {
            // Key lands in the new right bucket; translate keypos past the promoted separator.
            int kp = keypos - split - 1;
            invariant(kp >= 0);
            insertHere(txn, rLoc, kp, key, recordLoc, lchild, rchild);
        }
    }

    /**
     * DocWriter that reserves '_sz' bytes in the record store without writing anything —
     * used to allocate raw space for a new btree bucket.
     */
    class DummyDocWriter : public DocWriter {
    public:
        DummyDocWriter(size_t sz) : _sz(sz) { }
        virtual void writeDocument(char* buf) const { /* no-op */ }
        virtual size_t documentSize() const { return _sz; }
    private:
        size_t _sz;
    };

    /**
     * Allocates and installs the root bucket.
     * Returns OK if the index was uninitialized before, InternalError otherwise.
     */
    template <class BtreeLayout>
    Status BtreeLogic<BtreeLayout>::initAsEmpty(OperationContext* txn) {
        if (!_headManager->getHead().isNull()) {
            return Status(ErrorCodes::InternalError, "index already initialized");
        }

        _headManager->setHead(txn, _addBucket(txn));
        return Status::OK();
    }

    /**
     * Allocates one BucketSize record in the record store and initializes it as an empty
     * bucket.  uasserts on allocation failure.
     */
    template <class BtreeLayout>
    DiskLoc BtreeLogic<BtreeLayout>::_addBucket(OperationContext* txn) {
        DummyDocWriter docWriter(BtreeLayout::BucketSize);
        StatusWith<DiskLoc> loc = _recordStore->insertRecord(txn, &docWriter, false);
        // XXX: remove this(?) or turn into massert or sanely bubble it back up.
        uassertStatusOK(loc.getStatus());

        // this is a new bucket, not referenced by anyone, probably don't need this lock
        BucketType* b = btreemod(txn, getBucket(loc.getValue()));
        init(b);
        return loc.getValue();
    }

    // Debug helper: logs every key (and child pointer offset) in 'bucket', indented by
    // 'indentLength' to show tree depth.
    // static
    template <class BtreeLayout>
    void BtreeLogic<BtreeLayout>::dumpBucket(const BucketType* bucket, int indentLength) {
        log() << "BUCKET n:" << bucket->n << ", parent:" << hex << bucket->parent.getOfs() << dec;

        const string indent = string(indentLength, ' ');

        for (int i = 0; i < bucket->n; i++) {
            log() << '\n' << indent;
            FullKey k = getFullKey(bucket, i);
            string ks = k.data.toString();
            log() << " " << hex << k.prevChildBucket.getOfs() << "<-- prevChildBucket for " << i << '\n';
            log() << indent << " " << i << ' ' << ks.substr(0, 30)
                  << " Loc:" << k.recordLoc.toString() << dec;
            if (getKeyHeader(bucket, i).isUnused()) {
                log() << " UNUSED";
            }
        }

        log() << "\n" << indent << " " << hex << bucket->nextChild.getOfs() << dec << endl;
    }

    // Returns the record location stored for the key at 'keyOffset' in the given bucket.
    template <class BtreeLayout>
    DiskLoc BtreeLogic<BtreeLayout>::getDiskLoc(const DiskLoc& bucketLoc, const int keyOffset) const {
        invariant(!bucketLoc.isNull());
        BucketType* bucket = getBucket(bucketLoc);
        return getKeyHeader(bucket, keyOffset).recordLoc;
    }

    // Returns the key at 'keyOffset' as a BSONObj, with sanity invariants on the bucket's
    // key count first.
    template <class BtreeLayout>
    BSONObj BtreeLogic<BtreeLayout>::getKey(const DiskLoc& bucketLoc, const int keyOffset) const {
        invariant(!bucketLoc.isNull());
        BucketType* bucket = getBucket(bucketLoc);
        int n = bucket->n;
        invariant(n != BtreeLayout::INVALID_N_SENTINEL);
        invariant(n >= 0);
        invariant(n < 10000);
        invariant(n != 0xffff);

        invariant(keyOffset >= 0);
        invariant(keyOffset < n);

        // XXX: should we really return an empty obj if keyOffset>=n?
        // NOTE(review): unreachable given the invariant(keyOffset < n) above.
        if (keyOffset >= n) {
            return BSONObj();
        }
        else {
            return getFullKey(bucket, keyOffset).data.toBson();
        }
    }

    // Advises the record store to warm all of this index's pages into memory.
    template <class BtreeLayout>
    Status BtreeLogic<BtreeLayout>::touch(OperationContext* txn) const {
        return _recordStore->touch( txn, NULL );
    }

    // Entry point for full-tree validation; see _fullValidate() for semantics.
    template <class BtreeLayout>
    long long BtreeLogic<BtreeLayout>::fullValidate(OperationContext* txn,
                                                    long long *unusedCount,
                                                    bool strict,
                                                    bool dumpBuckets,
                                                    unsigned depth) {
        return _fullValidate(txn, getRootLoc(), unusedCount, strict, dumpBuckets, depth);
    }

    /**
     * Recursively validates the subtree rooted at 'bucketLoc' and returns the number of used
     * keys in it.  Unused keys are tallied into *unusedCount (if non-NULL).  'strict' turns
     * parent-pointer mismatches into fatal invariants instead of wasserts; 'dumpBuckets'
     * logs every bucket visited, indented by 'depth'.
     */
    template <class BtreeLayout>
    long long BtreeLogic<BtreeLayout>::_fullValidate(OperationContext* txn,
                                                     const DiskLoc bucketLoc,
                                                     long long *unusedCount,
                                                     bool strict,
                                                     bool dumpBuckets,
                                                     unsigned depth) {
        BucketType* bucket = getBucket(bucketLoc);
        assertValid(_indexName, bucket, _ordering, true);

        if (dumpBuckets) {
            log() << bucketLoc.toString() << ' ';
            dumpBucket(bucket, depth);
        }

        long long keyCount = 0;

        for (int i = 0; i < bucket->n; i++) {
            KeyHeaderType& kn = getKeyHeader(bucket, i);

            if (kn.isUsed()) {
                keyCount++;
            }
            else if (NULL != unusedCount) {
                ++(*unusedCount);
            }

            if (!kn.prevChildBucket.isNull()) {
                DiskLoc left = kn.prevChildBucket;
                BucketType* b = getBucket(left);

                // Every child must point back at this bucket.
                if (strict) {
                    invariant(b->parent == bucketLoc);
                }
                else {
                    wassert(b->parent == bucketLoc);
                }

                keyCount += _fullValidate(txn, left, unusedCount, strict, dumpBuckets, depth + 1);
            }
        }

        if (!bucket->nextChild.isNull()) {
            BucketType* b = getBucket(bucket->nextChild);
            if (strict) {
                invariant(b->parent == bucketLoc);
            }
            else {
                wassert(b->parent == bucketLoc);
            }

            keyCount += _fullValidate(txn, bucket->nextChild, unusedCount, strict, dumpBuckets, depth + 1);
        }

        return keyCount;
    }

    // XXX: remove this(?) used to not dump every key in assertValid.
+ int nDumped = 0; + + // static + template <class BtreeLayout> + void BtreeLogic<BtreeLayout>::assertValid(const std::string& ns, + BucketType* bucket, + const Ordering& ordering, + bool force) { + if (!force) { + return; + } + + // this is very slow so don't do often + { + static int _k; + if (++_k % 128) { + return; + } + } + + DEV { + // slow: + for (int i = 0; i < bucket->n - 1; i++) { + FullKey firstKey = getFullKey(bucket, i); + FullKey secondKey = getFullKey(bucket, i + 1); + int z = firstKey.data.woCompare(secondKey.data, ordering); + if (z > 0) { + log() << "ERROR: btree key order corrupt. Keys:" << endl; + if (++nDumped < 5) { + for (int j = 0; j < bucket->n; j++) { + log() << " " << getFullKey(bucket, j).data.toString() << endl; + } + dumpBucket(bucket); + } + wassert(false); + break; + } + else if (z == 0) { + if (!(firstKey.header.recordLoc < secondKey.header.recordLoc)) { + log() << "ERROR: btree key order corrupt (recordlocs wrong):" << endl; + log() << " k(" << i << ")" << firstKey.data.toString() + << " RL:" << firstKey.header.recordLoc.toString() << endl; + log() << " k(" << i + 1 << ")" << secondKey.data.toString() + << " RL:" << secondKey.header.recordLoc.toString() << endl; + wassert(firstKey.header.recordLoc < secondKey.header.recordLoc); + } + } + } + } + else { + //faster: + if (bucket->n > 1) { + FullKey k1 = getFullKey(bucket, 0); + FullKey k2 = getFullKey(bucket, bucket->n - 1); + int z = k1.data.woCompare(k2.data, ordering); + //wassert( z <= 0 ); + if (z > 0) { + log() << "Btree keys out of order in collection " << ns; + ONCE { + dumpBucket(bucket); + } + invariant(false); + } + } + } + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::insert(OperationContext* txn, + const BSONObj& rawKey, + const DiskLoc& value, + bool dupsAllowed) { + KeyDataOwnedType key(rawKey); + + if (key.dataSize() > BtreeLayout::KeyMax) { + string msg = str::stream() << "Btree::insert: key too large to index, failing " + << _indexName << ' ' 
+ << key.dataSize() << ' ' << key.toString(); + return Status(ErrorCodes::KeyTooLong, msg); + } + + Status status = _insert(txn, + getRoot(), + getRootLoc(), + key, + value, + dupsAllowed, + DiskLoc(), + DiskLoc()); + + assertValid(_indexName, getRoot(), _ordering); + return status; + } + + template <class BtreeLayout> + Status BtreeLogic<BtreeLayout>::_insert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const KeyDataType& key, + const DiskLoc recordLoc, + bool dupsAllowed, + const DiskLoc leftChild, + const DiskLoc rightChild) { + invariant( key.dataSize() > 0 ); + + int pos; + bool found; + Status findStatus = _find(txn, bucket, key, recordLoc, !dupsAllowed, &pos, &found); + if (!findStatus.isOK()) { + return findStatus; + } + + if (found) { + static KeyHeaderType& header = getKeyHeader(bucket, pos); + if (header.isUnused()) { + LOG(4) << "btree _insert: reusing unused key" << endl; + massert(17433, "_insert: reuse key but lchild is not null", leftChild.isNull()); + massert(17434, "_insert: reuse key but rchild is not null", rightChild.isNull()); + txn->recoveryUnit()->writing(&header)->setUsed(); + return Status::OK(); + } + return Status(ErrorCodes::UniqueIndexViolation, "FIXME"); + } + + DiskLoc childLoc = childLocForPos(bucket, pos); + + // In current usage, rightChild is NULL for a new key and is not NULL when we are + // promoting a split key. These are the only two cases where _insert() is called + // currently. 
+ if (childLoc.isNull() || !rightChild.isNull()) { + insertHere(txn, bucketLoc, pos, key, recordLoc, leftChild, rightChild); + return Status::OK(); + } + else { + return _insert(txn, + getBucket(childLoc), + childLoc, + key, + recordLoc, + dupsAllowed, + DiskLoc(), + DiskLoc()); + } + } + + template <class BtreeLayout> + DiskLoc BtreeLogic<BtreeLayout>::advance(OperationContext* txn, + const DiskLoc& bucketLoc, + int* posInOut, + int direction) const { + BucketType* bucket = getBucket(bucketLoc); + + if (*posInOut < 0 || *posInOut >= bucket->n ) { + log() << "ASSERT failure advancing btree bucket" << endl; + log() << " thisLoc: " << bucketLoc.toString() << endl; + log() << " keyOfs: " << *posInOut << " n:" << bucket->n << " direction: " << direction << endl; + // log() << bucketSummary() << endl; + invariant(false); + } + + // XXX document + int adj = direction < 0 ? 1 : 0; + int ko = *posInOut + direction; + + // Look down if we need to. + DiskLoc nextDownLoc = childLocForPos(bucket, ko + adj); + BucketType* nextDown = getBucket(nextDownLoc); + if (NULL != nextDown) { + for (;;) { + if (direction > 0) { + *posInOut = 0; + } + else { + *posInOut = nextDown->n - 1; + } + DiskLoc newNextDownLoc = childLocForPos(nextDown, *posInOut + adj); + BucketType* newNextDownBucket = getBucket(newNextDownLoc); + if (NULL == newNextDownBucket) { + break; + } + nextDownLoc = newNextDownLoc; + nextDown = newNextDownBucket; + } + return nextDownLoc; + } + + // Looking down isn't the right choice, move forward. + if (ko < bucket->n && ko >= 0) { + *posInOut = ko; + return bucketLoc; + } + + // Hit the end of the bucket, move up and over. 
+ DiskLoc childLoc = bucketLoc; + DiskLoc ancestor = getBucket(bucketLoc)->parent; + for (;;) { + if (ancestor.isNull()) { + break; + } + BucketType* an = getBucket(ancestor); + for (int i = 0; i < an->n; i++) { + if (childLocForPos(an, i + adj) == childLoc) { + *posInOut = i; + return ancestor; + } + } + invariant(direction < 0 || an->nextChild == childLoc); + // parent exhausted also, keep going up + childLoc = ancestor; + ancestor = an->parent; + } + + return DiskLoc(); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::keyIsUsed(const DiskLoc& loc, const int& pos) const { + return getKeyHeader(getBucket(loc), pos).isUsed(); + } + + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::locate(OperationContext* txn, + const BSONObj& key, + const DiskLoc& recordLoc, + const int direction, + int* posOut, + DiskLoc* bucketLocOut) const { + // Clear out any data. + *posOut = 0; + *bucketLocOut = DiskLoc(); + + bool found = false; + KeyDataOwnedType owned(key); + + *bucketLocOut = _locate(txn, getRootLoc(), owned, posOut, &found, recordLoc, direction); + + if (!found) { + return false; + } + + skipUnusedKeys(txn, bucketLocOut, posOut, direction); + + return found; + } + + /** + * Recursively walk down the btree, looking for a match of key and recordLoc. + * Caller should have acquired lock on bucketLoc. + */ + template <class BtreeLayout> + DiskLoc BtreeLogic<BtreeLayout>::_locate(OperationContext* txn, + const DiskLoc& bucketLoc, + const KeyDataType& key, + int* posOut, + bool* foundOut, + const DiskLoc& recordLoc, + const int direction) const { + int position; + BucketType* bucket = getBucket(bucketLoc); + // XXX: owned to not owned conversion(?) + _find(txn, bucket, key, recordLoc, false, &position, foundOut); + + // Look in our current bucket. + if (*foundOut) { + *posOut = position; + return bucketLoc; + } + + // Not in our current bucket. 'position' tells us where there may be a child. 
+ DiskLoc childLoc = childLocForPos(bucket, position); + + if (!childLoc.isNull()) { + DiskLoc inChild = _locate(txn, childLoc, key, posOut, foundOut, recordLoc, direction); + if (!inChild.isNull()) { + return inChild; + } + } + + *posOut = position; + + if (direction < 0) { + // The key *would* go to our left. + (*posOut)--; + if (-1 == *posOut) { + // But there's no space for that in our bucket. + return DiskLoc(); + } + else { + return bucketLoc; + } + } + else { + // The key would go to our right... + if (bucket->n == *posOut) { + return DiskLoc(); + } + else { + // But only if there is space. + return bucketLoc; + } + } + } + + // TODO relcoate + template <class BtreeLayout> + bool BtreeLogic<BtreeLayout>::isHead(BucketType* bucket) { + return bucket->parent.isNull(); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::getBucket(const DiskLoc dl) const { + if (dl.isNull()) { + return NULL; + } + + RecordData recordData = _recordStore->dataFor(dl); + + // we need to be working on the raw bytes, not a transient copy + invariant(!recordData.isOwned()); + + return reinterpret_cast<BucketType*>(const_cast<char*>(recordData.data())); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::getRoot() const { + return getBucket(_headManager->getHead()); + } + + template <class BtreeLayout> + DiskLoc + BtreeLogic<BtreeLayout>::getRootLoc() const { + return _headManager->getHead(); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::BucketType* + BtreeLogic<BtreeLayout>::childForPos(BucketType* bucket, int pos) const { + DiskLoc loc = childLocForPos(bucket, pos); + return getBucket(loc); + } + + template <class BtreeLayout> + typename BtreeLogic<BtreeLayout>::LocType& + BtreeLogic<BtreeLayout>::childLocForPos(BucketType* bucket, int pos) { + if (bucket->n == pos) { + return bucket->nextChild; + } + else { + return getKeyHeader(bucket, 
pos).prevChildBucket; + } + } + + // + // And, template stuff. + // + + // V0 format. + template struct FixedWidthKey<DiskLoc>; + template class BtreeLogic<BtreeLayoutV0>; + + // V1 format. + template struct FixedWidthKey<DiskLoc56Bit>; + template class BtreeLogic<BtreeLayoutV1>; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h new file mode 100644 index 00000000000..ff7d7718de9 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h @@ -0,0 +1,593 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. 
If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/catalog/head_manager.h" +#include "mongo/db/catalog/index_catalog_entry.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/btree/btree_ondisk.h" +#include "mongo/db/storage/mmap_v1/btree/key.h" +#include "mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h" + + +namespace mongo { + + class BucketDeletionNotification; + class RecordStore; + + // Used for unit-testing only + template <class BtreeLayout> class BtreeLogicTestBase; + template <class BtreeLayout> class ArtificialTreeBuilder; + + /** + * This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk + * format. + */ + template <class BtreeLayout> + class BtreeLogic { + public: + // AKA _keyNode + typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType; + + // AKA Key + typedef typename BtreeLayout::KeyType KeyDataType; + + // AKA KeyOwned + typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType; + + // AKA Loc + typedef typename BtreeLayout::LocType LocType; + + // AKA BucketBasics or BtreeBucket, either one. + typedef typename BtreeLayout::BucketType BucketType; + + /** + * 'head' manages the catalog information. + * 'store' allocates and frees buckets. + * 'ordering' is meta-information we store in the catalog. + * 'indexName' is a string identifying the index that we use to print errors with. 
+ */ + BtreeLogic(HeadManager* head, + RecordStore* store, + const Ordering& ordering, + const string& indexName, + BucketDeletionNotification* bucketDeletion) + : _headManager(head), + _recordStore(store), + _ordering(ordering), + _indexName(indexName), + _bucketDeletion(bucketDeletion) { + + } + + // + // Public-facing + // + + class Builder { + public: + typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType; + typedef typename BtreeLayout::KeyType KeyDataType; + + Status addKey(const BSONObj& key, const DiskLoc& loc); + + // XXX: status, outparam for # keys? + unsigned long long commit(bool mayInterrupt); + + private: + friend class BtreeLogic; + + Builder(BtreeLogic* logic, OperationContext* txn, bool dupsAllowed); + + // Direct ports of functionality + void newBucket(); + void buildNextLevel(DiskLoc loc, bool mayInterrupt); + void mayCommitProgressDurably(); + BucketType* _getModifiableBucket(DiskLoc loc); + BucketType* _getBucket(DiskLoc loc); + // Direct ports of functionality + + // Not owned. + BtreeLogic* _logic; + + // Direct port of names. + DiskLoc _cur; + DiskLoc _first; + BucketType* _b; + bool _committed; + bool _dupsAllowed; + long long _numAdded; + auto_ptr<KeyDataOwnedType> _keyLast; + + // Not owned. + OperationContext* _txn; + }; + + /** + * Caller owns the returned pointer. + * 'this' must outlive the returned pointer. + */ + Builder* newBuilder(OperationContext* txn, bool dupsAllowed); + + Status dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) const; + + Status insert(OperationContext* txn, + const BSONObj& rawKey, + const DiskLoc& value, + bool dupsAllowed); + + /** + * Navigates down the tree and locates the bucket and position containing a record with + * the specified <key, recordLoc> combination. + * + * @return true if the exact <key, recordLoc> was found. 
Otherwise, false and the + * bucketLocOut would contain the bucket containing key which is before or after the + * searched one (dependent on the direction). + */ + bool locate(OperationContext* txn, + const BSONObj& key, + const DiskLoc& recordLoc, + const int direction, + int* posOut, + DiskLoc* bucketLocOut) const; + + void advance(OperationContext* txn, + DiskLoc* bucketLocInOut, + int* posInOut, + int direction) const; + + bool exists(OperationContext* txn, const KeyDataType& key) const; + + bool unindex(OperationContext* txn, + const BSONObj& key, + const DiskLoc& recordLoc); + + bool isEmpty() const; + + long long fullValidate(OperationContext*, + long long *unusedCount, + bool strict, + bool dumpBuckets, + unsigned depth); + + DiskLoc getDiskLoc(const DiskLoc& bucketLoc, const int keyOffset) const; + + BSONObj getKey(const DiskLoc& bucketLoc, const int keyOffset) const; + + DiskLoc getHead() const { return _headManager->getHead(); } + + Status touch(OperationContext* txn) const; + + // + // Composite key navigation methods + // + + void customLocate(OperationContext* txn, + DiskLoc* locInOut, + int* keyOfsInOut, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const; + + void advanceTo(OperationContext*, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const; + + void restorePosition(OperationContext* txn, + const BSONObj& savedKey, + const DiskLoc& savedLoc, + int direction, + DiskLoc* bucketInOut, + int* keyOffsetInOut) const; + + // + // Creation and deletion + // + + /** + * Returns OK if the index was uninitialized before, error status otherwise. 
+ */ + Status initAsEmpty(OperationContext* txn); + + // + // Size constants + // + + static int lowWaterMark(); + + private: + friend class BtreeLogic::Builder; + + // Used for unit-testing only + friend class BtreeLogicTestBase<BtreeLayout>; + friend class ArtificialTreeBuilder<BtreeLayout>; + + /** + * This is an in memory wrapper for the variable length data associated with a + * KeyHeaderType. It points to on-disk data but is not itself on-disk data. + * + * This object and its BSONObj 'key' will become invalid if the KeyHeaderType data that owns + * this it is moved within the btree. In general, a KeyWrapper should not be expected to be + * valid after a write. + */ + struct FullKey { + FullKey(const BucketType* bucket, int i) + : header(getKeyHeader(bucket, i)), + prevChildBucket(header.prevChildBucket), + recordLoc(header.recordLoc), + data(bucket->data + header.keyDataOfs()) { } + + // This is actually a reference to something on-disk. + const KeyHeaderType& header; + + // These are actually in 'header'. + const LocType& prevChildBucket; + const LocType& recordLoc; + + // This is *not* memory-mapped but its members point to something on-disk. + KeyDataType data; + }; + + // + // Functions that depend on the templated type info but nothing in 'this'. 
+ // + + static LocType& childLocForPos(BucketType* bucket, int pos); + + static FullKey getFullKey(const BucketType* bucket, int i); + + static KeyHeaderType& getKeyHeader(BucketType* bucket, int i); + + static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i); + + static char* dataAt(BucketType* bucket, short ofs); + + static void markUnused(BucketType* bucket, int keypos); + + static int totalDataSize(BucketType* bucket); + + static void init(BucketType* bucket); + + static int _alloc(BucketType* bucket, int bytes); + + static void _unalloc(BucketType* bucket, int bytes); + + static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false); + + static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType *keyDataOut); + + static bool mayDropKey(BucketType* bucket, int index, int refPos); + + static int _packedDataSize(BucketType* bucket, int refPos); + + static void setPacked(BucketType* bucket); + + static void setNotPacked(BucketType* bucket); + + static BucketType* btreemod(OperationContext* txn, BucketType* bucket); + + static int splitPos(BucketType* bucket, int keypos); + + static void reserveKeysFront(BucketType* bucket, int nAdd); + + static void setKey(BucketType* bucket, + int i, + const DiskLoc recordLoc, + const KeyDataType &key, + const DiskLoc prevChildBucket); + + static bool isHead(BucketType* bucket); + + static void dumpBucket(const BucketType* bucket, int indentLength = 0); + + static void assertValid(const std::string& ns, + BucketType* bucket, + const Ordering& ordering, + bool force = false); + + // + // 'this'-specific helpers (require record store, catalog information, or ordering, or type + // information). 
+ // + + bool basicInsert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int& keypos, + const KeyDataType& key, + const DiskLoc recordLoc); + + void dropFront(BucketType* bucket, int nDrop, int& refpos); + + void _pack(OperationContext* txn, BucketType* bucket, const DiskLoc thisLoc, int &refPos); + + void customLocate(OperationContext* txn, + DiskLoc* locInOut, + int* keyOfsInOut, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction, + pair<DiskLoc, int>& bestParent) const; + + Status _find(OperationContext* txn, + BucketType* bucket, + const KeyDataType& key, + const DiskLoc& recordLoc, + bool errorIfDup, + int* keyPositionOut, + bool* foundOut) const; + + bool customFind(int low, + int high, + const BSONObj& keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + const Ordering& order, + int direction, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + pair<DiskLoc, int>& bestParent) const; + + void advanceToImpl(OperationContext* txn, + DiskLoc* thisLocInOut, + int* keyOfsInOut, + const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive, + int direction) const; + + bool wouldCreateDup(OperationContext* txn, + const KeyDataType& key, + const DiskLoc self) const; + + bool keyIsUsed(const DiskLoc& loc, const int& pos) const; + + void skipUnusedKeys(OperationContext* txn, + DiskLoc* loc, + int* pos, + int direction) const; + + DiskLoc advance(OperationContext* txn, + const DiskLoc& bucketLoc, + int* posInOut, + int direction) const; + + DiskLoc _locate(OperationContext* txn, + const DiskLoc& bucketLoc, + const KeyDataType& key, + int* posOut, + bool* foundOut, + const DiskLoc& recordLoc, + const int direction) const; + + long long _fullValidate(OperationContext* txn, + const 
DiskLoc bucketLoc, + long long *unusedCount, + bool strict, + bool dumpBuckets, + unsigned depth); + + DiskLoc _addBucket(OperationContext* txn); + + bool canMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const int leftIndex); + + // has to look in children of 'bucket' and requires record store + int _rebalancedSeparatorPos(OperationContext* txn, + BucketType* bucket, + int leftIndex); + + void _packReadyForMod(BucketType* bucket, int &refPos); + + void truncateTo(BucketType* bucket, int N, int &refPos); + + void split(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int keypos, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc lchild, + const DiskLoc rchild); + + Status _insert(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + const KeyDataType& key, + const DiskLoc recordLoc, + bool dupsAllowed, + const DiskLoc leftChild, + const DiskLoc rightChild); + + // TODO take a BucketType*? 
+ void insertHere(OperationContext* txn, + const DiskLoc bucketLoc, + int pos, + const KeyDataType& key, + const DiskLoc recordLoc, + const DiskLoc leftChild, + const DiskLoc rightChild); + + std::string dupKeyError(const KeyDataType& key) const; + + void setInternalKey(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int keypos, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc lchild, + const DiskLoc rchild); + + void fixParentPtrs(OperationContext* trans, + BucketType* bucket, + const DiskLoc bucketLoc, + int firstIndex = 0, + int lastIndex = -1); + + bool mayBalanceWithNeighbors(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc); + + void doBalanceChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex); + + void doBalanceLeftToRight(OperationContext* txn, + BucketType* bucket, + const DiskLoc thisLoc, + int leftIndex, + int split, + BucketType* l, + const DiskLoc lchild, + BucketType* r, + const DiskLoc rchild); + + void doBalanceRightToLeft(OperationContext* txn, + BucketType* bucket, + const DiskLoc thisLoc, + int leftIndex, + int split, + BucketType* l, + const DiskLoc lchild, + BucketType* r, + const DiskLoc rchild); + + bool tryBalanceChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex); + + int indexInParent(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc) const; + + void doMergeChildren(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int leftIndex); + + void replaceWithNextChild(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc); + + void deleteInternalKey(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int keypos); + + void delKeyAtPos(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc, + int p); + + void delBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc); + + 
void deallocBucket(OperationContext* txn, + BucketType* bucket, + const DiskLoc bucketLoc); + + bool _keyIsAt(const BSONObj& savedKey, + const DiskLoc& savedLoc, + BucketType* bucket, + int keyPos) const; + + // TODO 'this' for _ordering(?) + int customBSONCmp(const BSONObj& l, + const BSONObj& rBegin, + int rBeginLen, + bool rSup, + const std::vector<const BSONElement*>& rEnd, + const std::vector<bool>& rEndInclusive, + const Ordering& o, + int direction) const; + + // TODO needs 'this' for _ordering for sanity check + bool _pushBack(BucketType* bucket, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc prevChild); + + void pushBack(BucketType* bucket, + const DiskLoc recordLoc, + const KeyDataType& key, + const DiskLoc prevChild) { + invariant(_pushBack(bucket, recordLoc, key, prevChild)); + } + + BucketType* childForPos(BucketType* bucket, int pos) const; + + BucketType* getBucket(const DiskLoc dl) const; + + BucketType* getRoot() const; + + DiskLoc getRootLoc() const; + + // + // Data + // + + // Not owned here. + HeadManager* _headManager; + + // Not owned here. + RecordStore* _recordStore; + + Ordering _ordering; + + string _indexName; + + // Not owned here + BucketDeletionNotification* _bucketDeletion; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp new file mode 100644 index 00000000000..ca6cdce9a9e --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp @@ -0,0 +1,2207 @@ +// btree_logic_test.cpp : Btree unit tests +// + +/** + * Copyright (C) 2014 MongoDB + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +// This file contains simple single-threaded tests, which check various aspects of the Btree logic +// + +#include "mongo/db/instance.h" +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" +#include "mongo/unittest/unittest.h" + + +namespace mongo { + + /** + * This class is made friend of BtreeLogic so we can add whatever private method accesses we + * need to it, to be used by the tests. 
+ */ + template<class BtreeLayoutType> + class BtreeLogicTestBase { + public: + typedef typename BtreeLayoutType::BucketType BucketType; + typedef typename BtreeLayoutType::FixedWidthKeyType FixedWidthKeyType; + + typedef typename BtreeLogic<BtreeLayoutType>::FullKey FullKey; + typedef typename BtreeLogic<BtreeLayoutType>::KeyDataOwnedType KeyDataOwnedType; + + BtreeLogicTestBase() : _helper(BSON("TheKey" << 1)) { + + } + + virtual ~BtreeLogicTestBase() { + + } + + protected: + void checkValidNumKeys(int nKeys) { + OperationContextNoop txn; + ASSERT_EQUALS(nKeys, _helper.btree.fullValidate(&txn, NULL, true, true, 0)); + } + + void insert(const BSONObj &key, const DiskLoc dl) { + OperationContextNoop txn; + _helper.btree.insert(&txn, key, dl, true); + } + + bool unindex(const BSONObj &key) { + OperationContextNoop txn; + return _helper.btree.unindex(&txn, key, _helper.dummyDiskLoc); + } + + void locate(const BSONObj &key, + int expectedPos, + bool expectedFound, + const DiskLoc &expectedLocation, + int direction) { + int pos; + DiskLoc loc; + OperationContextNoop txn; + ASSERT_EQUALS(expectedFound, + _helper.btree.locate(&txn, key, _helper.dummyDiskLoc, direction, &pos, &loc)); + ASSERT_EQUALS(expectedLocation, loc); + ASSERT_EQUALS(expectedPos, pos); + } + + const BucketType* child(const BucketType* bucket, int i) const { + verify(i <= bucket->n); + + DiskLoc diskLoc; + if (i == bucket->n) { + diskLoc = bucket->nextChild; + } + else { + FullKey fullKey = BtreeLogic<BtreeLayoutType>::getFullKey(bucket, i); + diskLoc = fullKey.prevChildBucket; + } + + verify(!diskLoc.isNull()); + + return _helper.btree.getBucket(diskLoc); + } + + BucketType* head() const { + return _helper.btree.getBucket(_helper.headManager.getHead()); + } + + void forcePackBucket(const DiskLoc bucketLoc) { + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + + bucket->topSize += bucket->emptySize; + bucket->emptySize = 0; + BtreeLogic<BtreeLayoutType>::setNotPacked(bucket); + } + + void 
truncateBucket(BucketType* bucket, int N, int &refPos) { + _helper.btree.truncateTo(bucket, N, refPos); + } + + int bucketPackedDataSize(BucketType* bucket, int refPos) { + return _helper.btree._packedDataSize(bucket, refPos); + } + + int bucketRebalancedSeparatorPos(const DiskLoc bucketLoc, int leftIndex) { + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + OperationContextNoop txn; + return _helper.btree._rebalancedSeparatorPos(&txn, bucket, leftIndex); + } + + FullKey getKey(const DiskLoc bucketLoc, int pos) const { + const BucketType* bucket = _helper.btree.getBucket(bucketLoc); + return BtreeLogic<BtreeLayoutType>::getFullKey(bucket, pos); + } + + void markKeyUnused(const DiskLoc bucketLoc, int keyPos) { + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + invariant(keyPos >= 0 && keyPos < bucket->n); + + _helper.btree.getKeyHeader(bucket, keyPos).setUnused(); + } + + DiskLoc newBucket() { + OperationContextNoop txn; + return _helper.btree._addBucket(&txn); + } + + /** + * Sets the nextChild pointer for the bucket at the specified location. 
+ */ + void setBucketNextChild(const DiskLoc bucketLoc, const DiskLoc nextChild) { + OperationContextNoop txn; + + BucketType* bucket = _helper.btree.getBucket(bucketLoc); + bucket->nextChild = nextChild; + + _helper.btree.fixParentPtrs(&txn, bucket, bucketLoc); + } + + protected: + BtreeLogicTestHelper<BtreeLayoutType> _helper; + }; + + // + // TESTS + // + + template<class OnDiskFormat> + class SimpleCreate : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + this->checkValidNumKeys(0); + } + }; + + template<class OnDiskFormat> + class SimpleInsertDelete : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + BSONObj key = simpleKey('z'); + this->insert(key, this->_helper.dummyDiskLoc); + + this->checkValidNumKeys(1); + this->locate(key, 0, true, this->_helper.headManager.getHead(), 1); + + this->unindex(key); + + this->checkValidNumKeys(0); + this->locate(key, 0, false, DiskLoc(), 1); + } + }; + + template<class OnDiskFormat> + class SplitUnevenBucketBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 10; ++i) { + BSONObj shortKey = simpleKey(shortToken(i), 1); + this->insert(shortKey, this->_helper.dummyDiskLoc); + + BSONObj longKey = simpleKey(longToken(i), 800); + this->insert(longKey, this->_helper.dummyDiskLoc); + } + + this->checkValidNumKeys(20); + ASSERT_EQUALS(1, this->head()->n); + checkSplit(); + } + + protected: + virtual char shortToken(int i) const = 0; + virtual char longToken(int i) const = 0; + virtual void checkSplit() = 0; + + static char leftToken(int i) { + return 'a' + i; + } + + static char rightToken(int i) { + return 'z' - i; + } + }; + + template<class OnDiskFormat> + class SplitRightHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> { + private: 
+ virtual char shortToken(int i) const { + return this->leftToken(i); + } + virtual char longToken(int i) const { + return this->rightToken(i); + } + virtual void checkSplit() { + ASSERT_EQUALS(15, this->child(this->head(), 0)->n); + ASSERT_EQUALS(4, this->child(this->head(), 1)->n); + } + }; + + template<class OnDiskFormat> + class SplitLeftHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> { + private: + virtual char shortToken(int i) const { + return this->rightToken(i); + } + virtual char longToken(int i) const { + return this->leftToken(i); + } + virtual void checkSplit() { + ASSERT_EQUALS(4, this->child(this->head(), 0)->n); + ASSERT_EQUALS(15, this->child(this->head(), 1)->n); + } + }; + + template<class OnDiskFormat> + class MissingLocate : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 3; ++i) { + BSONObj k = simpleKey('b' + 2 * i); + this->insert(k, this->_helper.dummyDiskLoc); + } + + locateExtended(1, 'a', 'b', this->_helper.headManager.getHead()); + locateExtended(1, 'c', 'd', this->_helper.headManager.getHead()); + locateExtended(1, 'e', 'f', this->_helper.headManager.getHead()); + locateExtended(1, 'g', 'g' + 1, DiskLoc()); // of course, 'h' isn't in the index. 
+ + // old behavior + // locateExtended( -1, 'a', 'b', dl() ); + // locateExtended( -1, 'c', 'd', dl() ); + // locateExtended( -1, 'e', 'f', dl() ); + // locateExtended( -1, 'g', 'f', dl() ); + + locateExtended(-1, 'a', 'a' - 1, DiskLoc()); // of course, 'a' - 1 isn't in the index + locateExtended(-1, 'c', 'b', this->_helper.headManager.getHead()); + locateExtended(-1, 'e', 'd', this->_helper.headManager.getHead()); + locateExtended(-1, 'g', 'f', this->_helper.headManager.getHead()); + } + + private: + void locateExtended( + int direction, char token, char expectedMatch, DiskLoc expectedLocation) { + const BSONObj k = simpleKey(token); + int expectedPos = (expectedMatch - 'b') / 2; + + this->locate(k, expectedPos, false, expectedLocation, direction); + } + }; + + template<class OnDiskFormat> + class MissingLocateMultiBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc); + + // This causes split + this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc); + + int pos; + DiskLoc loc; + + // 'E' is the split point and should be in the head the rest should be ~50/50 + const BSONObj splitPoint = simpleKey('E', 800); + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc); + ASSERT_EQUALS(this->_helper.headManager.getHead(), loc); + ASSERT_EQUALS(0, pos); + + // Find 
the one before 'E' + int largePos; + DiskLoc largeLoc; + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc); + this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1); + + // Find the one after 'E' + int smallPos; + DiskLoc smallLoc; + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc); + this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1); + + ASSERT_NOT_EQUALS(smallLoc, largeLoc); + ASSERT_NOT_EQUALS(smallLoc, loc); + ASSERT_NOT_EQUALS(largeLoc, loc); + } + }; + + /** + * Validates that adding keys incrementally produces buckets, which are 90%/10% full. + */ + template<class OnDiskFormat> + class SERVER983 : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc); + this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc); + + // This will cause split + this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc); + + int pos; + DiskLoc loc; + + // 'H' is the maximum 'large' interval key, 90% should be < 'H' and 10% larger + const BSONObj splitPoint = simpleKey('H', 800); + this->_helper.btree.locate(&txn, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc); + ASSERT_EQUALS(this->_helper.headManager.getHead(), loc); + ASSERT_EQUALS(0, pos); + + // Find the one before 'H' + int largePos; + DiskLoc largeLoc; + this->_helper.btree.locate(&txn, + splitPoint, 
this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc); + this->_helper.btree.advance(&txn, &largeLoc, &largePos, -1); + + // Find the one after 'H' + int smallPos; + DiskLoc smallLoc; + this->_helper.btree.locate(&txn, + splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc); + this->_helper.btree.advance(&txn, &smallLoc, &smallPos, 1); + + ASSERT_NOT_EQUALS(smallLoc, largeLoc); + ASSERT_NOT_EQUALS(smallLoc, loc); + ASSERT_NOT_EQUALS(largeLoc, loc); + } + }; + + template<class OnDiskFormat> + class DontReuseUnused : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 10; ++i) { + const BSONObj k = simpleKey('b' + 2 * i, 800); + this->insert(k, this->_helper.dummyDiskLoc); + } + + const BSONObj root = simpleKey('p', 800); + this->unindex(root); + + this->insert(root, this->_helper.dummyDiskLoc); + this->locate(root, 0, true, this->head()->nextChild, 1); + } + }; + + template<class OnDiskFormat> + class MergeBucketsTestBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 10; ++i) { + const BSONObj k = simpleKey('b' + 2 * i, 800); + this->insert(k, this->_helper.dummyDiskLoc); + } + + // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords() - 1); + + long long expectedCount = 10 - unindexKeys(); + ASSERT_EQUALS(1, this->_helper.recordStore.numRecords() - 1); + + long long unusedCount = 0; + ASSERT_EQUALS(expectedCount, this->_helper.btree.fullValidate(&txn, &unusedCount, true, true, 0)); + ASSERT_EQUALS(0, unusedCount); + } + + protected: + virtual int unindexKeys() = 0; + }; + + template<class OnDiskFormat> + class MergeBucketsLeft : public MergeBucketsTestBase<OnDiskFormat> { + virtual int unindexKeys() { + BSONObj k = simpleKey('b', 800); + 
this->unindex(k); + + k = simpleKey('b' + 2, 800); + this->unindex(k); + + k = simpleKey('b' + 4, 800); + this->unindex(k); + + k = simpleKey('b' + 6, 800); + this->unindex(k); + + return 4; + } + }; + + template<class OnDiskFormat> + class MergeBucketsRight : public MergeBucketsTestBase<OnDiskFormat> { + virtual int unindexKeys() { + const BSONObj k = simpleKey('b' + 2 * 9, 800); + this->unindex(k); + return 1; + } + }; + + template<class OnDiskFormat> + class MergeBucketsDontReplaceHead : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + for (int i = 0; i < 18; ++i) { + const BSONObj k = simpleKey('a' + i, 800); + this->insert(k, this->_helper.dummyDiskLoc); + } + + // numRecords() - 1, because fixedDiskLoc is actually in the record store too + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords() - 1); + + const BSONObj k = simpleKey('a' + 17, 800); + this->unindex(k); + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords() - 1); + + long long unusedCount = 0; + ASSERT_EQUALS(17, this->_helper.btree.fullValidate(&txn, &unusedCount, true, true, 0)); + ASSERT_EQUALS(0, unusedCount); + } + }; + + template<class OnDiskFormat> + class MergeBucketsDelInternal : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}"); + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}"); + } + }; + + template<class OnDiskFormat> + class MergeBucketsRightNull : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}"); + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}"); + } + }; + + // This comment was here during porting, not sure what it means: + // + // "Not yet handling this case" + template<class OnDiskFormat> + class DontMergeSingleBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},c:null}}"); + + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{d:{b:{a:null}}}"); + } + }; + + template<class OnDiskFormat> + 
class ParentMergeNonRightToLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}"); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // Child does not currently replace parent in this case. Also, the tree + // has 6 buckets + 1 for the this->_helper.dummyDiskLoc. + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class ParentMergeNonRightToRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}"); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ff"); + verify(this->unindex(k)); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // Child does not currently replace parent in this case. Also, the tree + // has 6 buckets + 1 for the this->_helper.dummyDiskLoc. 
+ ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class CantMergeRightNoMerge : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}}," + "dd:null," + "_:{f:{e:null},h:{g:null}}}"); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "bb"); + verify(this->unindex(k)); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{d:{b:{a:null},cc:{c:null}}," + "dd:null," + "_:{f:{e:null},h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class CantMergeLeftNoMerge : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}"); + + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "g"); + verify(this->unindex(k)); + + ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{c:{b:{a:null}},d:null,_:{f:{e:null}}}"); + } + }; + + template<class 
OnDiskFormat> + class MergeOption : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}"); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ee"); + verify(this->unindex(k)); + + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class ForceMergeLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}"); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ee"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class ForceMergeRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + 
builder.makeTree("{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}"); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(8, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "ee"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMerge : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}"); + + ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + // Height is not currently reduced in this case + builder.checkStructure("{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMergeRightBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}"); + + ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&txn, NULL, 
true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMergeDoubleRightBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}"); + + ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "c"); + verify(this->unindex(k)); + + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + // no recursion currently in this case + builder.checkStructure("{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}"); + } + }; + + template<class OnDiskFormat> + class MergeSizeTestBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + MergeSizeTestBase() : _count(0) { + + } + + void run() { + OperationContextNoop txn; + this->_helper.btree.initAsEmpty(&txn); + + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + const BSONObj& topKey = biggestKey('m'); + + DiskLoc leftChild = this->newBucket(); + builder.push(this->_helper.headManager.getHead(), topKey, leftChild); + _count++; + + DiskLoc rightChild = this->newBucket(); + 
this->setBucketNextChild(this->_helper.headManager.getHead(), rightChild); + + _count += builder.fillBucketToExactSize(leftChild, leftSize(), 'a'); + _count += builder.fillBucketToExactSize(rightChild, rightSize(), 'n'); + + ASSERT(leftAdditional() <= 2); + if (leftAdditional() >= 2) { + builder.push(leftChild, bigKey('k'), DiskLoc()); + } + if (leftAdditional() >= 1) { + builder.push(leftChild, bigKey('l'), DiskLoc()); + } + + ASSERT(rightAdditional() <= 2); + if (rightAdditional() >= 2) { + builder.push(rightChild, bigKey('y'), DiskLoc()); + } + if (rightAdditional() >= 1) { + builder.push(rightChild, bigKey('z'), DiskLoc()); + } + + _count += leftAdditional() + rightAdditional(); + + initCheck(); + + const char *keys = delKeys(); + for (const char *i = keys; *i; ++i) { + long long unused = 0; + ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + ASSERT_EQUALS(0, unused); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + const BSONObj k = bigKey(*i); + this->unindex(k); + + --_count; + } + + long long unused = 0; + ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + ASSERT_EQUALS(0, unused); + + validate(); + + if (!merge()) { + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + } + else { + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + } + } + + protected: + virtual int leftAdditional() const { return 2; } + virtual int rightAdditional() const { return 2; } + virtual void initCheck() {} + virtual void validate() {} + virtual int leftSize() const = 0; + virtual int rightSize() const = 0; + virtual const char * delKeys() const { return "klyz"; } + virtual bool merge() const { return true; } + + static BSONObj bigKey(char a) { + return simpleKey(a, 801); + } 
+ + static BSONObj biggestKey(char a) { + int size = OnDiskFormat::KeyMax - bigSize() + 801; + return simpleKey(a, size); + } + + static int bigSize() { + return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(bigKey('a')).dataSize(); + } + + static int biggestSize() { + return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(biggestKey('a')).dataSize(); + } + + int _count; + }; + + template<class OnDiskFormat> + class MergeSizeJustRightRight : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightSize() const { + return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1; + } + + virtual int leftSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) - + (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1); + } + }; + + template<class OnDiskFormat> + class MergeSizeJustRightLeft : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int leftSize() const { + return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1; + } + + virtual int rightSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) - + (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1); + } + + virtual const char * delKeys() const { return "yzkl"; } + }; + + template<class OnDiskFormat> + class MergeSizeRight : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() - 1; } + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; } + }; + + template<class OnDiskFormat> + class MergeSizeLeft : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; } + virtual int leftSize() const { return 
MergeSizeJustRightLeft<OnDiskFormat>::leftSize() - 1; } + }; + + template<class OnDiskFormat> + class NoMergeBelowMarkRight : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; } + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() - 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() - 1; } + virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class MergeSizeRightTooBig : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class MergeSizeLeftTooBig : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; } + virtual bool merge() const { return false; } + }; + + template<class OnDiskFormat> + class MergeRightEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 1; } + virtual const char * delKeys() const { return "lz"; } + virtual int rightSize() const { return 0; } + virtual int leftSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class MergeMinRightEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightAdditional() const { return 1; } + virtual 
int leftAdditional() const { return 0; } + virtual const char * delKeys() const { return "z"; } + virtual int rightSize() const { return 0; } + virtual int leftSize() const { + return MergeSizeTestBase<OnDiskFormat>::bigSize() + + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class MergeLeftEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int rightAdditional() const { return 1; } + virtual int leftAdditional() const { return 1; } + virtual const char * delKeys() const { return "zl"; } + virtual int leftSize() const { return 0; } + virtual int rightSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class MergeMinLeftEmpty : public MergeSizeTestBase<OnDiskFormat> { + protected: + virtual int leftAdditional() const { return 1; } + virtual int rightAdditional() const { return 0; } + virtual const char * delKeys() const { return "l"; } + virtual int leftSize() const { return 0; } + virtual int rightSize() const { + return MergeSizeTestBase<OnDiskFormat>::bigSize() + + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType); + } + }; + + template<class OnDiskFormat> + class BalanceRightEmpty : public MergeRightEmpty<OnDiskFormat> { + protected: + virtual int leftSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1; + } + + virtual bool merge() const { return false; } + + virtual void initCheck() { + _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_NOT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + + private: + BSONObj _oldTop; + }; + + template<class 
OnDiskFormat> + class BalanceLeftEmpty : public MergeLeftEmpty<OnDiskFormat> { + protected: + virtual int rightSize() const { + return OnDiskFormat::BucketBodySize - + MergeSizeTestBase<OnDiskFormat>::biggestSize() - + sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1; + } + + virtual bool merge() const { return false; } + + virtual void initCheck() { + _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_TRUE(_oldTop != this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + + private: + BSONObj _oldTop; + }; + + template<class OnDiskFormat> + class BalanceOneLeftToRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "b:{$20:null,$30:null,$40:null,$50:null,a:null}," + "_:{c:null}}"); + + ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x40, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null}," + "b:{$10:null,$20:null,$30:null,$50:null,a:null}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceOneRightToLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null}," + 
"b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null}," + "_:{c:null}}"); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x3, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$20:{$1:null,$2:null,$4:null,$10:null}," + "b:{$30:null,$40:null,$50:null,$60:null,$70:null}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceThreeLeftToRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null}," + "$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}}," + "b:{$30:null,$40:{$35:null},$50:{$45:null}}," + "_:{c:null}}"); + + ASSERT_EQUALS(23, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(15, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x30, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(15, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$9:{$1:{$0:null},$3:{$2:null}," + "$5:{$4:null},$7:{$6:null},_:{$8:null}}," + "b:{$11:{$10:null},$13:{$12:null},$20:{$14:null}," + "$40:{$35:null},$50:{$45:null}}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceThreeRightToLeft : public 
BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}}," + "b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null}," + "$70:{$65:null},$80:{$75:null}," + "$90:{$85:null},$100:{$95:null}}," + "_:{c:null}}"); + + ASSERT_EQUALS(25, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(16, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x5, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(24, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(16, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null}," + "$30:{$25:null},$40:{$35:null},_:{$45:null}}," + "b:{$60:{$55:null},$70:{$65:null},$80:{$75:null}," + "$90:{$85:null},$100:{$95:null}}," + "_:{c:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceSingleParentKey : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "_:{$20:null,$30:null,$40:null,$50:null,a:null}}"); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x40, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null}," + "_:{$10:null,$20:null,$30:null,$50:null,a:null}}"); + } + }; + + template<class OnDiskFormat> + class PackEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null}"); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + this->forcePackBucket(this->_helper.headManager.getHead()); + + typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head(); + + ASSERT_EQUALS(0, headBucket->n); + ASSERT_FALSE(headBucket->flags & Packed); + + int unused = 0; + this->truncateBucket(headBucket, 0, unused); + + ASSERT_EQUALS(0, headBucket->n); + ASSERT_EQUALS(0, headBucket->topSize); + ASSERT_EQUALS((int)OnDiskFormat::BucketBodySize, headBucket->emptySize); + ASSERT_TRUE(headBucket->flags & Packed); + } + }; + + template<class OnDiskFormat> + class PackedDataSizeEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null}"); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + this->forcePackBucket(this->_helper.headManager.getHead()); + + typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head(); + + ASSERT_EQUALS(0, headBucket->n); + ASSERT_FALSE(headBucket->flags & Packed); + ASSERT_EQUALS(0, this->bucketPackedDataSize(headBucket, 0)); + ASSERT_FALSE(headBucket->flags & Packed); + } + }; + + template<class OnDiskFormat> + class BalanceSingleParentKeyPackParent : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + 
builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "_:{$20:null,$30:null,$40:null,$50:null,a:null}}"); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + // force parent pack + this->forcePackBucket(this->_helper.headManager.getHead()); + + const BSONObj k = BSON("" << bigNumString(0x40, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$6:{$1:null,$2:null,$3:null,$4:null,$5:null}," + "_:{$10:null,$20:null,$30:null,$50:null,a:null}}"); + } + }; + + template<class OnDiskFormat> + class BalanceSplitParent : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree( + "{$10$10:{$1:null,$2:null,$3:null,$4:null}," + "$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null}," + "$200:null,$300:null,$400:null,$500:null,$600:null," + "$700:null,$800:null,$900:null,_:{c:null}}"); + + ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x3, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(21, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(7, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$500:{ $30:{$1:null,$2:null,$4:null,$10$10:null,$20:null}," + "$100:{$40:null,$50:null,$60:null,$70:null,$80:null}," 
+ "$200:null,$300:null,$400:null}," + "_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}"); + } + }; + + template<class OnDiskFormat> + class RebalancedSeparatorBase : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree(treeSpec()); + modTree(); + + ASSERT_EQUALS(expectedSeparator(), + this->bucketRebalancedSeparatorPos( + this->_helper.headManager.getHead(), 0)); + } + + virtual string treeSpec() const = 0; + virtual int expectedSeparator() const = 0; + virtual void modTree() {} + }; + + template<class OnDiskFormat> + class EvenRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$7:{$1:null,$2$31f:null,$3:null," + "$4$31f:null,$5:null,$6:null}," + "_:{$8:null,$9:null,$10$31e:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceLeftCusp : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { + return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null}," + "_:{$7:null,$8:null,$9$31e:null,$10:null}}"; + } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceRightCusp : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class EvenRebalanceCenter : public 
RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class OddRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class OddRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class OddRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:null}}"; } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class RebalanceEmptyRight : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$b:null}}"; } + virtual void modTree() { + BSONObj k = BSON("" << bigNumString(0xb, 800)); + ASSERT(this->unindex(k)); + } + virtual int expectedSeparator() const { return 4; } + }; + + template<class OnDiskFormat> + class RebalanceEmptyLeft : public RebalancedSeparatorBase<OnDiskFormat> { + virtual string treeSpec() const { return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$18:null,$19:null}}"; } + virtual void modTree() { + BSONObj k = BSON("" << bigNumString(0x1, 800)); + ASSERT(this->unindex(k)); + } + virtual int expectedSeparator() const { return 4; } + }; + + 
template<class OnDiskFormat> + class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1; } + + virtual void initCheck() { + _oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_EQUALS(_oldTop, this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + + virtual bool merge() const { return false; } + + protected: + BSONObj _oldTop; + }; + + template<class OnDiskFormat> + class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight<OnDiskFormat> { + virtual int rightSize() const { return MergeSizeJustRightRight<OnDiskFormat>::rightSize(); } + virtual int leftSize() const { return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1; } + + virtual void validate() { + // Different top means we rebalanced + ASSERT_NOT_EQUALS(this->_oldTop, + this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + }; + + template<class OnDiskFormat> + class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> { + virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1; } + virtual void initCheck() { + this->_oldTop = this->getKey(this->_helper.headManager.getHead(), 0).data.toBson(); + } + + virtual void validate() { + ASSERT_EQUALS(this->_oldTop, + this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + virtual bool merge() const { return false; } + + protected: + BSONObj _oldTop; + }; + + template<class OnDiskFormat> + class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft<OnDiskFormat> { + virtual int leftSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::leftSize(); } + virtual int rightSize() const { return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1; } + + virtual void validate() { + // Different top means we rebalanced + ASSERT_NOT_EQUALS(this->_oldTop, 
+ this->getKey(this->_helper.headManager.getHead(), 0).data.toBson()); + } + }; + + template<class OnDiskFormat> + class PreferBalanceLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null}," + "$20:{$11:null,$12:null,$13:null,$14:null}," + "_:{$30:null}}"); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x12, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$5:{$1:null,$2:null,$3:null,$4:null}," + "$20:{$6:null,$10:null,$11:null,$13:null,$14:null}," + "_:{$30:null}}"); + } + }; + + template<class OnDiskFormat> + class PreferBalanceRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$1:null}," + "$20:{$11:null,$12:null,$13:null,$14:null}," + "_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}"); + + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x12, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{$10:{$1:null}," + "$31:{$11:null,$13:null,$14:null,$20:null}," + "_:{$32:null,$33:null,$34:null,$35:null,$36:null}}"); + } + }; + + template<class OnDiskFormat> + class RecursiveMergeThenBalance : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}}," + "_:{$20:null,$30:null,$40:null,$50:null," + "$60:null,$70:null,$80:null,$90:null}}"); + + ASSERT_EQUALS(15, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << bigNumString(0x7, 800)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + + builder.checkStructure( + "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null}," + "_:{$50:null,$60:null,$70:null,$80:null,$90:null}}"); + } + }; + + template<class OnDiskFormat> + class DelEmptyNoNeighbors : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{b:{a:null}}"); + + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, 
this->_helper.recordStore.numRecords()); + + builder.checkStructure("{b:null}"); + } + }; + + template<class OnDiskFormat> + class DelEmptyEmptyNeighbors : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,c:{b:null},d:null}"); + + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + + const BSONObj k = BSON("" << "b"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, NULL, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + + builder.checkStructure("{a:null,c:null,d:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternal : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,c:{b:null},d:null}"); + + long long unused = 0; + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "c"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{a:null,b:null,d:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalReplaceWithUnused : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop 
txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,c:{b:null},d:null}"); + + const DiskLoc prevChildBucket = + this->getKey(this->_helper.headManager.getHead(), 1).prevChildBucket; + this->markKeyUnused(prevChildBucket, 0); + + long long unused = 0; + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + const BSONObj k = BSON("" << "c"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + // doesn't discriminate between used and unused + builder.checkStructure("{a:null,b:null,d:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalReplaceRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,_:{b:null}}"); + + long long unused = 0; + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(2, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{b:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalPromoteKey : public BtreeLogicTestBase<OnDiskFormat> 
{ + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}"); + + long long unused = 0; + ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(6, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "y"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{a:null,e:{c:{b:null},d:null},z:null}"); + } + }; + + template<class OnDiskFormat> + class DelInternalPromoteRightKey : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,_:{e:{c:null},_:{f:null}}}"); + + long long unused = 0; + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + unused = 0; + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(3, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{c:null,_:{e:null,f:null}}"); + } + }; + + template<class OnDiskFormat> + class DelInternalReplacementPrevNonNull : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + 
ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,d:{c:{b:null}},e:null}"); + + long long unused = 0; + ASSERT_EQUALS(5, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "d"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + builder.checkStructure("{a:null,d:{c:{b:null}},e:null}"); + + // Check 'unused' key + ASSERT(this->getKey(this->_helper.headManager.getHead(), 1).recordLoc.getOfs() & 1); + } + }; + + template<class OnDiskFormat> + class DelInternalReplacementNextNonNull : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{a:null,_:{c:null,_:{d:null}}}"); + + long long unused = 0; + ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << "a"); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(4, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(1, unused); + + builder.checkStructure("{a:null,_:{c:null,_:{d:null}}}"); + + // Check 'unused' key + ASSERT(this->getKey(this->_helper.headManager.getHead(), 0).recordLoc.getOfs() & 1); + } + }; + + template<class OnDiskFormat> + class 
DelInternalSplitPromoteLeft : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}}," + "$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}"); + + long long unused = 0; + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << bigNumString(0x30, 0x10)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure("{$60:{$10:null,$20:null," + "$27:{$23:null,$25:null},$40:null,$50:null}," + "_:{$70:null,$80:null,$90:null,$100:null}}"); + } + }; + + template<class OnDiskFormat> + class DelInternalSplitPromoteRight : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + OperationContextNoop txn; + ArtificialTreeBuilder<OnDiskFormat> builder(&txn, &this->_helper); + + builder.makeTree("{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null," + "$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}"); + + long long unused = 0; + ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + const BSONObj k = BSON("" << bigNumString(0x100, 0x10)); + ASSERT(this->unindex(k)); + + ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&txn, &unused, true, true, 0)); + + // The tree has 4 buckets + 1 for the 
this->_helper.dummyDiskLoc + ASSERT_EQUALS(5, this->_helper.recordStore.numRecords()); + ASSERT_EQUALS(0, unused); + + builder.checkStructure( + "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null}," + "_:{$90:null,$97:{$93:null,$95:null}}}"); + } + }; + + /* This test requires the entire server to be linked-in and it is better implemented using + the JS framework. Disabling here and will put in jsCore. + + template<class OnDiskFormat> + class SignedZeroDuplication : public BtreeLogicTestBase<OnDiskFormat> { + public: + void run() { + ASSERT_EQUALS(0.0, -0.0); + DBDirectClient c; + + static const string ns("unittests.SignedZeroDuplication"); + + c.ensureIndex(ns, BSON("b" << 1), true); + c.insert(ns, BSON("b" << 0.0)); + c.insert(ns, BSON("b" << 1.0)); + c.update(ns, BSON("b" << 1.0), BSON("b" << -0.0)); + + ASSERT_EQUALS(1U, c.count(ns, BSON("b" << 0.0))); + } + }; + */ + +/* +// QUERY_MIGRATION: port later + class PackUnused : public Base { + public: + void run() { + for ( long long i = 0; i < 1000000; i += 1000 ) { + insert( i ); + } + string orig, after; + { + stringstream ss; + bt()->shape( ss ); + orig = ss.str(); + } + vector< string > toDel; + vector< string > other; + BSONObjBuilder start; + start.appendMinKey( "a" ); + BSONObjBuilder end; + end.appendMaxKey( "a" ); + auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), + id(), + start.done(), + end.done(), + false, + 1 ) ); + while( c->ok() ) { + bool has_child = + c->getBucket().btree()->keyNode(c->getKeyOfs()).prevChildBucket.isNull(); + + if (has_child) { + toDel.push_back( c->currKey().firstElement().valuestr() ); + } + else { + other.push_back( c->currKey().firstElement().valuestr() ); + } + c->advance(); + } + ASSERT( toDel.size() > 0 ); + for( vector< string >::const_iterator i = toDel.begin(); i != toDel.end(); ++i ) { + BSONObj o = BSON( "a" << *i ); + this->unindex( o ); + } + ASSERT( other.size() > 0 ); + for( vector< string >::const_iterator i = other.begin(); i 
!= other.end(); ++i ) { + BSONObj o = BSON( "a" << *i ); + this->unindex( o ); + } + + long long unused = 0; + ASSERT_EQUALS( 0, bt()->fullValidate(&txn, dl(), order(), &unused, true ) ); + + for ( long long i = 50000; i < 50100; ++i ) { + insert( i ); + } + + long long unused2 = 0; + ASSERT_EQUALS( 100, bt()->fullValidate(&txn, dl(), order(), &unused2, true ) ); + +// log() << "old unused: " << unused << ", new unused: " << unused2 << endl; +// + ASSERT( unused2 <= unused ); + } + protected: + void insert( long long n ) { + string val = bigNumString( n ); + BSONObj k = BSON( "a" << val ); + Base::insert( k ); + } + }; + + class DontDropReferenceKey : public PackUnused { + public: + void run() { + // with 80 root node is full + for ( long long i = 0; i < 80; i += 1 ) { + insert( i ); + } + + BSONObjBuilder start; + start.appendMinKey( "a" ); + BSONObjBuilder end; + end.appendMaxKey( "a" ); + BSONObj l = bt()->keyNode( 0 ).key.toBson(); + string toInsert; + auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), + id(), + start.done(), + end.done(), + false, + 1 ) ); + while( c->ok() ) { + if ( c->currKey().woCompare( l ) > 0 ) { + toInsert = c->currKey().firstElement().valuestr(); + break; + } + c->advance(); + } + // too much work to try to make this happen through inserts and deletes + // we are intentionally manipulating the btree bucket directly here + BtreeBucket::Loc* L = const_cast< BtreeBucket::Loc* >( &bt()->keyNode( 1 ).prevChildBucket ); + getDur().writing(L)->Null(); + getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused + BSONObj k = BSON( "a" << toInsert ); + Base::insert( k ); + } + }; + */ + + // + // TEST SUITE DEFINITION + // + + template<class OnDiskFormat> + class BtreeLogicTestSuite : public unittest::Suite { + public: + BtreeLogicTestSuite(const std::string& name) : Suite(name) { + + } + + void setupTests() { + add< SimpleCreate<OnDiskFormat> >(); + add< 
SimpleInsertDelete<OnDiskFormat> >(); + add< SplitRightHeavyBucket<OnDiskFormat> >(); + add< SplitLeftHeavyBucket<OnDiskFormat> >(); + add< MissingLocate<OnDiskFormat> >(); + add< MissingLocateMultiBucket<OnDiskFormat> >(); + add< SERVER983<OnDiskFormat> >(); + add< DontReuseUnused<OnDiskFormat> >(); + add< MergeBucketsLeft<OnDiskFormat> >(); + add< MergeBucketsRight<OnDiskFormat> >(); + add< MergeBucketsDontReplaceHead<OnDiskFormat> >(); + add< MergeBucketsDelInternal<OnDiskFormat> >(); + add< MergeBucketsRightNull<OnDiskFormat> >(); + add< DontMergeSingleBucket<OnDiskFormat> >(); + add< ParentMergeNonRightToLeft<OnDiskFormat> >(); + add< ParentMergeNonRightToRight<OnDiskFormat> >(); + add< CantMergeRightNoMerge<OnDiskFormat> >(); + add< CantMergeLeftNoMerge<OnDiskFormat> >(); + add< MergeOption<OnDiskFormat> >(); + add< ForceMergeLeft<OnDiskFormat> >(); + add< ForceMergeRight<OnDiskFormat> >(); + add< RecursiveMerge<OnDiskFormat> >(); + add< RecursiveMergeRightBucket<OnDiskFormat> >(); + add< RecursiveMergeDoubleRightBucket<OnDiskFormat> >(); + + add< MergeSizeJustRightRight<OnDiskFormat> >(); + add< MergeSizeJustRightLeft<OnDiskFormat> >(); + add< MergeSizeRight<OnDiskFormat> >(); + add< MergeSizeLeft<OnDiskFormat> >(); + add< NoMergeBelowMarkRight<OnDiskFormat> >(); + add< NoMergeBelowMarkLeft<OnDiskFormat> >(); + add< MergeSizeRightTooBig<OnDiskFormat> >(); + add< MergeSizeLeftTooBig<OnDiskFormat> >(); + add< MergeRightEmpty<OnDiskFormat> >(); + add< MergeMinRightEmpty<OnDiskFormat> >(); + add< MergeLeftEmpty<OnDiskFormat> >(); + add< MergeMinLeftEmpty<OnDiskFormat> >(); + add< BalanceRightEmpty<OnDiskFormat> >(); + add< BalanceLeftEmpty<OnDiskFormat> >(); + + add< BalanceOneLeftToRight<OnDiskFormat> >(); + add< BalanceOneRightToLeft<OnDiskFormat> >(); + add< BalanceThreeLeftToRight<OnDiskFormat> >(); + add< BalanceThreeRightToLeft<OnDiskFormat> >(); + add< BalanceSingleParentKey<OnDiskFormat> >(); + + add< PackEmptyBucket<OnDiskFormat> >(); + add< 
PackedDataSizeEmptyBucket<OnDiskFormat> >(); + + add< BalanceSingleParentKeyPackParent<OnDiskFormat> >(); + add< BalanceSplitParent<OnDiskFormat> >(); + add< EvenRebalanceLeft<OnDiskFormat> >(); + add< EvenRebalanceLeftCusp<OnDiskFormat> >(); + add< EvenRebalanceRight<OnDiskFormat> >(); + add< EvenRebalanceRightCusp<OnDiskFormat> >(); + add< EvenRebalanceCenter<OnDiskFormat> >(); + add< OddRebalanceLeft<OnDiskFormat> >(); + add< OddRebalanceRight<OnDiskFormat> >(); + add< OddRebalanceCenter<OnDiskFormat> >(); + add< RebalanceEmptyRight<OnDiskFormat> >(); + add< RebalanceEmptyLeft<OnDiskFormat> >(); + + add< NoMoveAtLowWaterMarkRight<OnDiskFormat> >(); + add< MoveBelowLowWaterMarkRight<OnDiskFormat> >(); + add< NoMoveAtLowWaterMarkLeft<OnDiskFormat> >(); + add< MoveBelowLowWaterMarkLeft<OnDiskFormat> >(); + + add< PreferBalanceLeft<OnDiskFormat> >(); + add< PreferBalanceRight<OnDiskFormat> >(); + add< RecursiveMergeThenBalance<OnDiskFormat> >(); + add< DelEmptyNoNeighbors<OnDiskFormat> >(); + add< DelEmptyEmptyNeighbors<OnDiskFormat> >(); + add< DelInternal<OnDiskFormat> >(); + add< DelInternalReplaceWithUnused<OnDiskFormat> >(); + add< DelInternalReplaceRight<OnDiskFormat> >(); + add< DelInternalPromoteKey<OnDiskFormat> >(); + add< DelInternalPromoteRightKey<OnDiskFormat> >(); + add< DelInternalReplacementPrevNonNull<OnDiskFormat> >(); + add< DelInternalReplacementNextNonNull<OnDiskFormat> >(); + add< DelInternalSplitPromoteLeft<OnDiskFormat> >(); + add< DelInternalSplitPromoteRight<OnDiskFormat> >(); + } + }; + + // Test suite for both V0 and V1 + static BtreeLogicTestSuite<BtreeLayoutV0> SUITE_V0("BTreeLogicTests_V0"); + static BtreeLogicTestSuite<BtreeLayoutV1> SUITE_V1("BTreeLogicTests_V1"); +} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h new file mode 100644 index 00000000000..7f91cd2fb27 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h @@ -0,0 +1,380 @@ +/** + * 
Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/storage/mmap_v1/btree/key.h" + +namespace mongo { + + const int OldBucketSize = 8192; + + // + // On-disk index format + // + +#pragma pack(1) + /** + * This is the fixed width data component for storage of a key within a bucket. It contains an + * offset pointer to the variable width bson data component. This may be 'unused', please see + * below. + * + * Why is this templated on Loc? 
Because V0 and V1 have different size DiskLoc(s) but otherwise + * the same layout. + */ + template <class LocType> + struct FixedWidthKey { + // + // Data + // + + /** + * The 'left' child bucket of this key. If this is the i-th key, it points to the i index + * child bucket. + */ + LocType prevChildBucket; + + /** + * The location of the record associated with this key. + */ + LocType recordLoc; + + /** + * Offset within current bucket of the variable width bson key for this _KeyNode. + */ + unsigned short _kdo; + + // + // Accessors / mutators + // + + short keyDataOfs() const { + return static_cast<short>(_kdo); + } + + void setKeyDataOfs(short s) { + _kdo = s; + invariant(s>=0); + } + + void setKeyDataOfsSavingUse(short s) { + // XXX kill this func + setKeyDataOfs(s); + } + + /** + * Unused keys are not returned by read operations. Keys may be marked + * as unused in cases where it is difficult to delete them while + * maintaining the constraints required of a btree. + * + * Setting ofs to odd is the sentinel for unused, as real recordLoc's + * are always even numbers. Note we need to keep its value basically + * the same as we use the recordLoc as part of the key in the index + * (to handle duplicate keys efficiently). + * + * Flagging keys as unused is a feature that is being phased out in favor + * of deleting the keys outright. The current btree implementation is + * not expected to mark a key as unused in a non legacy btree. + */ + void setUnused() { + recordLoc.GETOFS() |= 1; + } + + void setUsed() { recordLoc.GETOFS() &= ~1; } + + int isUnused() const { + return recordLoc.getOfs() & 1; + } + + int isUsed() const { + return !isUnused(); + } + }; + + /** + * This structure represents header data for a btree bucket. An object of + * this type is typically allocated inside of a buffer of size BucketSize, + * resulting in a full bucket with an appropriate header. 
+ * + * The body of a btree bucket contains an array of _KeyNode objects starting + * from its lowest indexed bytes and growing to higher indexed bytes. The + * body also contains variable width bson keys, which are allocated from the + * highest indexed bytes toward lower indexed bytes. + * + * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb| + * h = header data + * k = KeyNode data + * - = empty space + * b = bson key data + * u = unused (old) bson key data, that may be garbage collected + */ + struct BtreeBucketV0 { + /** + * Parent bucket of this bucket, which isNull() for the root bucket. + */ + DiskLoc parent; + + /** + * Given that there are n keys, this is the n index child. + */ + DiskLoc nextChild; + + /** + * Can be reused, value is 8192 in current pdfile version Apr2010 + */ + unsigned short _wasSize; + + /** + * zero + */ + unsigned short _reserved1; + + int flags; + + /** basicInsert() assumes the next three members are consecutive and in this order: */ + + /** Size of the empty region. */ + int emptySize; + + /** Size used for bson storage, including storage of old keys. */ + int topSize; + + /* Number of keys in the bucket. */ + int n; + + int reserved; + + /* Beginning of the bucket's body */ + char data[4]; + + // Precalculated size constants + enum { HeaderSize = 40 }; + }; + + // BtreeBucketV0 is part of the on-disk format, so it should never be changed + BOOST_STATIC_ASSERT( + sizeof(BtreeBucketV0) - sizeof(reinterpret_cast<BtreeBucketV0*>(NULL)->data) + == BtreeBucketV0::HeaderSize); + + /** + * A variant of DiskLoc used by the V1 bucket type. 
+ */ + struct DiskLoc56Bit { + // + // Data + // + + int ofs; + + unsigned char _a[3]; + + // + // Accessors XXX rename these, this is terrible + // + + int& GETOFS() { return ofs; } + + int getOfs() const { return ofs; } + + // + // Comparison + // + + bool isNull() const { return ofs < 0; } + + unsigned long long toLongLong() const { + // endian + unsigned long long result = ofs; + char* cursor = reinterpret_cast<char *>(&result); + *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]); + *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]); + *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0); + return result; + } + + bool operator<(const DiskLoc56Bit& rhs) const { + // the ordering of dup keys in btrees isn't too critical, but we'd like to put items + // that are close together on disk close together in the tree, so we do want the file # + // to be the most significant bytes + return toLongLong() < rhs.toLongLong(); + } + + int compare(const DiskLoc56Bit& rhs) const { + unsigned long long a = toLongLong(); + unsigned long long b = rhs.toLongLong(); + if ( a < b ) { + return -1; + } + else { + return a == b ? 0 : 1; + } + } + + bool operator==(const DiskLoc56Bit& rhs) const { + return toLongLong() == rhs.toLongLong(); + } + + bool operator!=(const DiskLoc56Bit& rhs) const { + return toLongLong() != rhs.toLongLong(); + } + + bool operator==(const DiskLoc& rhs) const { + return DiskLoc(*this) == rhs; + } + + bool operator!=(const DiskLoc& rhs) const { + return !(*this==rhs); + } + + // + // Mutation + // + + enum { + // first bit of offsets used in _KeyNode we don't use -1 here. 
+ OurNullOfs = -2 + }; + + void Null() { + ofs = OurNullOfs; + _a[0] = _a[1] = _a[2] = 0; + } + + void operator=(const DiskLoc& loc) { + ofs = loc.getOfs(); + int la = loc.a(); + invariant( la <= 0xffffff ); // must fit in 3 bytes + if( la < 0 ) { + if ( la != -1 ) { + log() << "btree diskloc isn't negative 1: " << la << std::endl; + invariant ( la == -1 ); + } + la = 0; + ofs = OurNullOfs; + } + memcpy(_a, &la, 3); // endian + } + + // + // Type Conversion + // + + operator const DiskLoc() const { + // endian + if( isNull() ) return DiskLoc(); + unsigned a = *((unsigned *) (_a-1)); + return DiskLoc(a >> 8, ofs); + } + + std::string toString() const { return DiskLoc(*this).toString(); } + }; + + struct BtreeBucketV1 { + /** Parent bucket of this bucket, which isNull() for the root bucket. */ + DiskLoc56Bit parent; + + /** Given that there are n keys, this is the n index child. */ + DiskLoc56Bit nextChild; + + unsigned short flags; + + /** Size of the empty region. */ + unsigned short emptySize; + + /** Size used for bson storage, including storage of old keys. */ + unsigned short topSize; + + /* Number of keys in the bucket. */ + unsigned short n; + + /* Beginning of the bucket's body */ + char data[4]; + + // Precalculated size constants + enum { HeaderSize = 22 }; + }; + + // BtreeBucketV1 is part of the on-disk format, so it should never be changed + BOOST_STATIC_ASSERT( + sizeof(BtreeBucketV1) - sizeof(reinterpret_cast<BtreeBucketV1*>(NULL)->data) + == BtreeBucketV1::HeaderSize); + + enum Flags { + Packed = 1 + }; + + struct BtreeLayoutV0 { + typedef FixedWidthKey<DiskLoc> FixedWidthKeyType; + typedef DiskLoc LocType; + typedef KeyBson KeyType; + typedef KeyBson KeyOwnedType; + typedef BtreeBucketV0 BucketType; + + enum { BucketSize = 8192, + BucketBodySize = BucketSize - BucketType::HeaderSize + }; + + // largest key size we allow. note we very much need to support bigger keys (somehow) in + // the future. 
+ + static const int KeyMax = OldBucketSize / 10; + + // A sentinel value sometimes used to identify a deallocated bucket. + static const int INVALID_N_SENTINEL = -1; + + static void initBucket(BucketType* bucket) { + bucket->_reserved1 = 0; + bucket->_wasSize = BucketSize; + bucket->reserved = 0; + } + }; + + struct BtreeLayoutV1 { + typedef FixedWidthKey<DiskLoc56Bit> FixedWidthKeyType; + typedef KeyV1 KeyType; + typedef KeyV1Owned KeyOwnedType; + typedef DiskLoc56Bit LocType; + typedef BtreeBucketV1 BucketType; + + enum { BucketSize = 8192 - 16, // The -16 is to leave room for the Record header + BucketBodySize = BucketSize - BucketType::HeaderSize + }; + + static const int KeyMax = 1024; + + // A sentinel value sometimes used to identify a deallocated bucket. + static const unsigned short INVALID_N_SENTINEL = 0xffff; + + static void initBucket(BucketType* bucket) { } + }; + +#pragma pack() + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp new file mode 100644 index 00000000000..99385d46e86 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp @@ -0,0 +1,247 @@ +// btree_test_help.cpp : Helper functions for Btree unit-testing +// + +/** + * Copyright (C) 2014 MongoDB + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" + +#include "mongo/db/operation_context_noop.h" +#include "mongo/unittest/unittest.h" + + +namespace mongo { + + string bigNumString(long long n, int len) { + char sub[17]; + sprintf(sub, "%.16llx", n); + string val(len, ' '); + for (int i = 0; i < len; ++i) { + val[i] = sub[i % 16]; + } + return val; + } + + BSONObj simpleKey(char c, int n) { + BSONObjBuilder builder; + string val(n, c); + builder.append("a", val); + return builder.obj(); + } + + // + // BtreeLogicTestHelper + // + + static BucketDeletionNotification dummyBucketDeletionNotification; + + template <class OnDiskFormat> + BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order) + : recordStore("TestRecordStore"), + btree(&headManager, + &recordStore, + Ordering::make(order), + "TestIndex", + &dummyBucketDeletionNotification) { + + static const string randomData("RandomStuff"); + + // Generate a valid record location for a "fake" record, which we will repeatedly use + // thoughout the tests. 
+ OperationContextNoop txn; + StatusWith<DiskLoc> s = + recordStore.insertRecord(&txn, randomData.c_str(), randomData.length(), false); + + ASSERT_TRUE(s.isOK()); + ASSERT_EQUALS(1, recordStore.numRecords()); + + dummyDiskLoc = s.getValue(); + } + + + // + // ArtificialTreeBuilder + // + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::makeTree(const string &spec) { + _helper->headManager.setHead(_txn, makeTree(fromjson(spec))); + } + + template <class OnDiskFormat> + DiskLoc ArtificialTreeBuilder<OnDiskFormat>::makeTree(const BSONObj &spec) { + DiskLoc bucketLoc = _helper->btree._addBucket(_txn); + BucketType* bucket = _helper->btree.getBucket(bucketLoc); + + BSONObjIterator i(spec); + while (i.more()) { + BSONElement e = i.next(); + DiskLoc child; + if (e.type() == Object) { + child = makeTree(e.embeddedObject()); + } + + if (e.fieldName() == string("_")) { + bucket->nextChild = child; + } + else { + KeyDataOwnedType key(BSON("" << expectedKey(e.fieldName()))); + _helper->btree._pushBack(bucket, _helper->dummyDiskLoc, key, child); + } + } + + _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc); + return bucketLoc; + } + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const string &spec) const { + checkStructure(fromjson(spec), _helper->headManager.getHead()); + } + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::push( + const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child) { + KeyDataOwnedType k(key); + BucketType* bucket = _helper->btree.getBucket(bucketLoc); + + _helper->btree._pushBack(bucket, _helper->dummyDiskLoc, k, child); + _helper->btree.fixParentPtrs(_txn, bucket, bucketLoc); + } + + template <class OnDiskFormat> + void ArtificialTreeBuilder<OnDiskFormat>::checkStructure( + const BSONObj &spec, const DiskLoc node) const { + BucketType* bucket = _helper->btree.getBucket(node); + + BSONObjIterator j(spec); + for (int i = 0; i < bucket->n; ++i) { 
+ ASSERT(j.more()); + BSONElement e = j.next(); + KeyHeaderType kn = BtreeLogic<OnDiskFormat>::getKeyHeader(bucket, i); + string expected = expectedKey(e.fieldName()); + ASSERT(isPresent(BSON("" << expected), 1)); + ASSERT(isPresent(BSON("" << expected), -1)); + + // ASSERT_EQUALS(expected, kn.key.toBson().firstElement().valuestr()); + if (kn.prevChildBucket.isNull()) { + ASSERT(e.type() == jstNULL); + } + else { + ASSERT(e.type() == Object); + checkStructure(e.embeddedObject(), kn.prevChildBucket); + } + } + if (bucket->nextChild.isNull()) { + // maybe should allow '_' field with null value? + ASSERT(!j.more()); + } + else { + BSONElement e = j.next(); + ASSERT_EQUALS(string("_"), e.fieldName()); + ASSERT(e.type() == Object); + checkStructure(e.embeddedObject(), bucket->nextChild); + } + ASSERT(!j.more()); + } + + template <class OnDiskFormat> + bool ArtificialTreeBuilder<OnDiskFormat>::isPresent(const BSONObj &key, int direction) const { + int pos; + DiskLoc loc; + OperationContextNoop txn; + return _helper->btree.locate(&txn, key, _helper->dummyDiskLoc, direction, &pos, &loc); + } + + // Static + template <class OnDiskFormat> + string ArtificialTreeBuilder<OnDiskFormat>::expectedKey(const char *spec) { + if (spec[0] != '$') { + return spec; + } + char *endPtr; + + // parsing a long long is a pain, so just allow shorter keys for now + unsigned long long num = strtol(spec + 1, &endPtr, 16); + int len = 800; + if (*endPtr == '$') { + len = strtol(endPtr + 1, 0, 16); + } + + return bigNumString(num, len); + } + + template <class OnDiskFormat> + int ArtificialTreeBuilder<OnDiskFormat>::fillBucketToExactSize( + const DiskLoc bucketLoc, int targetSize, char startKey) { + ASSERT_FALSE(bucketLoc.isNull()); + + BucketType* bucket = _helper->btree.getBucket(bucketLoc); + ASSERT_EQUALS(0, bucket->n); + + static const int bigSize = KeyDataOwnedType(simpleKey('a', 801)).dataSize(); + + int size = 0; + int keyCount = 0; + while (size < targetSize) { + int space = targetSize - 
size; + int nextSize = space - sizeof(FixedWidthKeyType); + verify(nextSize > 0); + + BSONObj newKey; + if (nextSize >= bigSize) { + newKey = simpleKey(startKey++, 801); + } + else { + newKey = simpleKey(startKey++, nextSize - (bigSize - 801)); + } + + push(bucketLoc, newKey, DiskLoc()); + + size += KeyDataOwnedType(newKey).dataSize() + + sizeof(FixedWidthKeyType); + keyCount += 1; + } + + ASSERT_EQUALS(_helper->btree._packedDataSize(bucket, 0), targetSize); + + return keyCount; + } + + // + // This causes actual code to be generated for the usages of the templates in this file. + // + + // V0 format. + template struct BtreeLogicTestHelper<BtreeLayoutV0>; + template class ArtificialTreeBuilder<BtreeLayoutV0>; + + // V1 format. + template struct BtreeLogicTestHelper<BtreeLayoutV1>; + template class ArtificialTreeBuilder<BtreeLayoutV1>; +} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h new file mode 100644 index 00000000000..52d468f053a --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h @@ -0,0 +1,154 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the GNU Affero General Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#pragma once

#include <string>

#include "mongo/db/json.h"
#include "mongo/db/storage/heap1/record_store_heap.h" // XXX why is this here?
#include "mongo/db/storage/mmap_v1//btree/btree_logic.h"
#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h"


namespace mongo {

    /**
     * Generates a string of the specified length containing repeated concatenation of the
     * hexadecimal representation of the input value.
     */
    std::string bigNumString(long long n, int len);

    /**
     * Generates key on a field 'a', with the specified number of repetitions of the character.
     */
    BSONObj simpleKey(char c, int n = 1);

    /**
     * Simple head manager, which performs no validity checking or persistence.
     */
    class TestHeadManager : public HeadManager {
    public:
        virtual const DiskLoc getHead() const {
            return _head;
        }

        // Updates the in-memory head only; nothing is journaled or persisted.
        virtual void setHead(OperationContext* txn, const DiskLoc newHead) {
            _head = newHead;
        }

    private:
        DiskLoc _head;
    };


    /**
     * This structure encapsulates a Btree and all the infrastructure needed by it (head manager,
     * record store and a valid disk location to use by the tests).
     */
    template <class OnDiskFormat>
    struct BtreeLogicTestHelper {
        BtreeLogicTestHelper(const BSONObj& order);

        // Everything needed for a fully-functional Btree logic
        TestHeadManager headManager;
        HeapRecordStore recordStore;
        BtreeLogic<OnDiskFormat> btree;
        DiskLoc dummyDiskLoc;
    };


    /**
     * Tool to construct custom tree shapes for tests.
     */
    template <class OnDiskFormat>
    class ArtificialTreeBuilder {
    public:

        typedef typename BtreeLogic<OnDiskFormat>::BucketType BucketType;
        typedef typename BtreeLogic<OnDiskFormat>::KeyDataOwnedType KeyDataOwnedType;
        typedef typename BtreeLogic<OnDiskFormat>::KeyHeaderType KeyHeaderType;

        typedef typename OnDiskFormat::FixedWidthKeyType FixedWidthKeyType;

        /**
         * The tree builder wraps around the passed-in helper and will invoke methods on it. It
         * does not do any cleanup, so constructing multiple trees over the same helper will
         * cause leaked records.
         */
        ArtificialTreeBuilder(OperationContext* txn,
                              BtreeLogicTestHelper<OnDiskFormat>* helper)
            : _txn(txn), _helper(helper) {

        }

        /**
         * Causes the specified tree shape to be built on the associated helper and the tree's
         * root installed as the head. Uses a custom JSON-based language with the following
         * syntax:
         *
         *  Btree := BTreeBucket
         *  BtreeBucket := { Child_1_Key: <BtreeBucket | null>,
         *                   Child_2_Key: <BtreeBucket | null>,
         *                   ...,
         *                   _: <BtreeBucket | null> }
         *
         * The _ key name specifies the content of the nextChild pointer. The value null means
         * use a fixed disk loc.
         */
        void makeTree(const std::string& spec);

        /**
         * Validates that the structure of the Btree in the helper matches the specification.
         */
        void checkStructure(const std::string& spec) const;

        /**
         * Adds the following key to the bucket and fixes up the child pointers.
         */
        void push(const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child);

        /**
         * @return The number of keys inserted.
         */
        int fillBucketToExactSize(const DiskLoc bucketLoc, int targetSize, char startKey);

    private:
        // Recursive helpers operating on one bucket of the parsed spec at a time.
        DiskLoc makeTree(const BSONObj& spec);

        void checkStructure(const BSONObj& spec, const DiskLoc node) const;

        bool isPresent(const BSONObj& key, int direction) const;

        static string expectedKey(const char* spec);

        OperationContext* _txn;
        BtreeLogicTestHelper<OnDiskFormat>* _helper;
    };

} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h b/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h
new file mode 100644
index 00000000000..5d6fa99434f
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/bucket_deletion_notification.h
@@ -0,0 +1,54 @@
/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so.
If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/db/diskloc.h" + +namespace mongo { + + /** + * Notifies interested parties before a bucket is about to be deleted. Currently used by + * the cursor manager, so the appropriate cursors can be invalidated. + * + * The default implementation is a no-op. + */ + class BucketDeletionNotification { + public: + + /** + * If the same object is passed in to different BtreeLogic implementations, this + * notification may be invoked on multiple threads, so it is up to the implementor + * to ensure thread-safety. + */ + virtual void aboutToDeleteBucket(const DiskLoc& bucket) { } + + virtual ~BucketDeletionNotification() { } + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/btree/key.cpp b/src/mongo/db/storage/mmap_v1/btree/key.cpp new file mode 100644 index 00000000000..a6ccd61d2cf --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/btree/key.cpp @@ -0,0 +1,691 @@ +/** + * Copyright (C) 2011 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the GNU Affero General Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#include "mongo/db/storage/mmap_v1/btree/key.h"

#include "mongo/bson/util/builder.h"
#include "mongo/platform/float_utils.h"
#include "mongo/util/startup_test.h"


namespace mongo {

    extern const Ordering nullOrdering = Ordering::make(BSONObj());

    // KeyBson is for V0 (version #0) indexes

    int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o);

    // "old" = pre signed dates & such; i.e. btree V0
    /* must be same canon type when called */
    int oldCompareElementValues(const BSONElement& l, const BSONElement& r) {
        dassert( l.canonicalType() == r.canonicalType() );
        int f;
        double x;

        switch ( l.type() ) {
        case EOO:
        case Undefined: // EOO and Undefined are same canonicalType
        case jstNULL:
        case MaxKey:
        case MinKey:
            return 0;
        case Bool:
            return *l.value() - *r.value();
        case Timestamp:
        case Date:
            // unsigned dates for old version
            if ( l.date() < r.date() )
                return -1;
            return l.date() == r.date() ? 0 : 1;
        case NumberLong:
            if( r.type() == NumberLong ) {
                long long L = l._numberLong();
                long long R = r._numberLong();
                if( L < R ) return -1;
                if( L == R ) return 0;
                return 1;
            }
            // else fall through
            // (deliberate: mixed long/int/double comparisons go through doubles below)
        case NumberInt:
        case NumberDouble: {
            double left = l.number();
            double right = r.number();
            // NaN detection without isnan(): NaN fails both range comparisons.
            bool lNan = !( left <= numeric_limits< double >::max() &&
                           left >= -numeric_limits< double >::max() );
            bool rNan = !( right <= numeric_limits< double >::max() &&
                           right >= -numeric_limits< double >::max() );
            if ( lNan ) {
                if ( rNan ) {
                    return 0;
                }
                else {
                    return -1;
                }
            }
            else if ( rNan ) {
                return 1;
            }
            x = left - right;
            if ( x < 0 ) return -1;
            return x == 0 ? 0 : 1;
        }
        case jstOID:
            return memcmp(l.value(), r.value(), 12);
        case Code:
        case Symbol:
        case String:
            // nulls not allowed in the middle of strings in the old version
            return strcmp(l.valuestr(), r.valuestr());
        case Object:
        case Array:
            return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering);
        case DBRef: {
            int lsz = l.valuesize();
            int rsz = r.valuesize();
            if ( lsz - rsz != 0 ) return lsz - rsz;
            return memcmp(l.value(), r.value(), lsz);
        }
        case BinData: {
            int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte
            int rsz = r.objsize();
            if ( lsz - rsz != 0 ) return lsz - rsz;
            return memcmp(l.value()+4, r.value()+4, lsz+1);
        }
        case RegEx: {
            int c = strcmp(l.regex(), r.regex());
            if ( c )
                return c;
            return strcmp(l.regexFlags(), r.regexFlags());
        }
        case CodeWScope : {
            f = l.canonicalType() - r.canonicalType();
            if ( f )
                return f;
            f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() );
            if ( f )
                return f;
            f = strcmp( l.codeWScopeScopeDataUnsafe() , r.codeWScopeScopeDataUnsafe() );
            if ( f )
                return f;
            return 0;
        }
        default:
            log() << "oldCompareElementValues: bad type " << (int) l.type() << endl;
            verify(false);
        }
        return -1;
    }

    // Compares two elements V0-style: canonical type first, then value.
    int oldElemCompare(const BSONElement&l , const BSONElement& r) {
        int lt = (int) l.canonicalType();
        int rt = (int) r.canonicalType();
        int x = lt - rt;
        if( x )
            return x;
        return oldCompareElementValues(l, r);
    }

    // pre signed dates & such
    int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) {
        BSONObjIterator i(l);
        BSONObjIterator j(r);
        unsigned mask = 1;  // tracks which field of the Ordering applies (shifted per field)
        while ( 1 ) {
            // so far, equal...

            BSONElement l = i.next();
            BSONElement r = j.next();
            if ( l.eoo() )
                return r.eoo() ? 0 : -1;
            if ( r.eoo() )
                return 1;

            int x;
            {
                x = oldElemCompare(l, r);
                if( o.descending(mask) )
                    x = -x;
            }
            if ( x != 0 )
                return x;
            mask <<= 1;
        }
        return -1;
    }

    /* old style compares:
       - dates are unsigned
       - strings no nulls
    */
    int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const {
        return oldCompare(_o, r._o, o);
    }

    // woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort
    bool KeyBson::woEqual(const KeyBson& r) const {
        return oldCompare(_o, r._o, nullOrdering) == 0;
    }

    // Layout of a KeyV1 type byte: [ ][HASMORE][x][y][canontype_4bits]
    enum CanonicalsEtc {
        cminkey=1,
        cnull=2,
        cdouble=4,
        cstring=6,
        cbindata=7,
        coid=8,
        cfalse=10,
        ctrue=11,
        cdate=12,
        cmaxkey=14,
        cCANONTYPEMASK = 0xf,
        cY = 0x10,
        cint = cY | cdouble,
        cX = 0x20,
        clong = cX | cdouble,
        cHASMORE = 0x40,
        cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
    };

    // bindata bson type
    const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value
    const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType.
    const int BinDataLenMax = 32;
    // Maps a bindata length (0..32) to its 4-bit length code (in the high nibble);
    // -1 marks lengths that cannot be encoded compactly.
    const int BinDataLengthToCode[] = {
        0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
        0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
        0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1,
        0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1,
        0xf0/*32*/
    };
    // Inverse of the table above, indexed by the high nibble of the code byte.
    const int BinDataCodeToLength[] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
    };

    int binDataCodeToLength(int codeByte) {
        return BinDataCodeToLength[codeByte >> 4];
    }

    /** object cannot be represented in compact format.  so store in traditional bson format
        with a leading sentinel byte IsBSON to indicate it's in that format.

        Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here
        so that we don't have to do an extra malloc.
    */
    void KeyV1Owned::traditional(const BSONObj& obj) {
        b.reset();
        b.appendUChar(IsBSON);
        b.appendBuf(obj.objdata(), obj.objsize());
        _keyData = (const unsigned char *) b.buf();
    }

    // Copy constructor from an unowned key: memcpy's the raw key bytes.
    KeyV1Owned::KeyV1Owned(const KeyV1& rhs) {
        b.appendBuf( rhs.data(), rhs.dataSize() );
        _keyData = (const unsigned char *) b.buf();
        dassert( b.len() == dataSize() ); // check datasize method is correct
        dassert( (*_keyData & cNOTUSED) == 0 );
    }

    // fromBSON to Key format
    KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
        BSONObj::iterator i(obj);
        unsigned char bits = 0;
        while( 1 ) {
            BSONElement e = i.next();
            if( i.more() )
                bits |= cHASMORE;
            switch( e.type() ) {
            case MinKey:
                b.appendUChar(cminkey|bits);
                break;
            case jstNULL:
                b.appendUChar(cnull|bits);
                break;
            case MaxKey:
                b.appendUChar(cmaxkey|bits);
                break;
            case Bool:
                b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
                break;
            case jstOID:
                b.appendUChar(coid|bits);
                b.appendBuf(&e.__oid(), sizeof(OID));
                break;
            case BinData:
                {
                    int t = e.binDataType();
                    // 0-7 and 0x80 to 0x87 are supported by Key
                    if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
                        int len;
                        const char * d = e.binData(len);
                        if( len <= BinDataLenMax ) {
                            int code = BinDataLengthToCode[len];
                            if( code >= 0 ) {
                                // 0x80..0x87 subtypes are folded into bits 3 of the code byte
                                if( t >= 128 )
                                    t = (t-128) | 0x08;
                                dassert( (code&t) == 0 );
                                b.appendUChar( cbindata|bits );
                                b.appendUChar( code | t );
                                b.appendBuf(d, len);
                                break;
                            }
                        }
                    }
                    traditional(obj);
                    return;
                }
            case Date:
                b.appendUChar(cdate|bits);
                b.appendStruct(e.date());
                break;
            case String:
                {
                    b.appendUChar(cstring|bits);
                    // note we do not store the terminating null, to save space.
                    unsigned x = (unsigned) e.valuestrsize() - 1;
                    if( x > 255 ) {
                        // string too long for the 1-byte length prefix; fall back to bson
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(x);
                    b.appendBuf(e.valuestr(), x);
                    break;
                }
            case NumberInt:
                b.appendUChar(cint|bits);
                b.appendNum((double) e._numberInt());
                break;
            case NumberLong:
                {
                    long long n = e._numberLong();
                    long long m = 2LL << 52;  // 2**53: limit of exact double representation
                    DEV {
                        long long d = m-1;
                        verify( ((long long) ((double) -d)) == -d );
                    }
                    if( n >= m || n <= -m ) {
                        // can't represent exactly as a double
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(clong|bits);
                    b.appendNum((double) n);
                    break;
                }
            case NumberDouble:
                {
                    double d = e._numberDouble();
                    if( isNaN(d) ) {
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(cdouble|bits);
                    b.appendNum(d);
                    break;
                }
            default:
                // if other types involved, store as traditional BSON
                traditional(obj);
                return;
            }
            if( !i.more() )
                break;
            bits = 0;
        }
        _keyData = (const unsigned char *) b.buf();
        dassert( b.len() == dataSize() ); // check datasize method is correct
        dassert( (*_keyData & cNOTUSED) == 0 );
    }

    // Decodes the compact key format back into a (fieldname-less) BSONObj.
    BSONObj KeyV1::toBson() const {
        verify( _keyData != 0 );
        if( !isCompactFormat() )
            return bson();

        BSONObjBuilder b(512);
        const unsigned char *p = _keyData;
        while( 1 ) {
            unsigned bits = *p++;

            switch( bits & 0x3f ) {
            case cminkey: b.appendMinKey(""); break;
            case cnull: b.appendNull(""); break;
            case cfalse: b.appendBool("", false); break;
            case ctrue: b.appendBool("", true); break;
            case cmaxkey:
                b.appendMaxKey("");
                break;
            case cstring:
                {
                    unsigned sz = *p++;
                    // we build the element ourself as we have to null terminate it
                    BufBuilder &bb = b.bb();
                    bb.appendNum((char) String);
                    bb.appendUChar(0); // fieldname ""
                    bb.appendNum(sz+1);
                    bb.appendBuf(p, sz);
                    bb.appendUChar(0); // null char at end of string
                    p += sz;
                    break;
                }
            case coid:
                b.appendOID("", (OID *) p);
                p += sizeof(OID);
                break;
            case cbindata:
                {
                    int len = binDataCodeToLength(*p);
                    int subtype = (*p) & BinDataTypeMask;
                    if( subtype & 0x8 ) {
                        // undo the 0x80..0x87 subtype folding done at encode time
                        subtype = (subtype & 0x7) | 0x80;
                    }
                    b.appendBinData("", len, (BinDataType) subtype, ++p);
                    p += len;
                    break;
                }
            case cdate:
                b.appendDate("", (Date_t&) *p);
                p += 8;
                break;
            case cdouble:
                b.append("", (double&) *p);
                p += sizeof(double);
                break;
            case cint:
                b.append("", static_cast< int >((reinterpret_cast< const PackedDouble& >(*p)).d));
                p += sizeof(double);
                break;
            case clong:
                b.append("", static_cast< long long>((reinterpret_cast< const PackedDouble& >(*p)).d));
                p += sizeof(double);
                break;
            default:
                verify(false);
            }

            if( (bits & cHASMORE) == 0 )
                break;
        }
        return b.obj();
    }

    // Compares one element of each compact key; advances both pointers past the element.
    static int compare(const unsigned char *&l, const unsigned char *&r) {
        int lt = (*l & cCANONTYPEMASK);
        int rt = (*r & cCANONTYPEMASK);
        int x = lt - rt;
        if( x )
            return x;

        l++; r++;

        // same type
        switch( lt ) {
        case cdouble:
            {
                double L = (reinterpret_cast< const PackedDouble* >(l))->d;
                double R = (reinterpret_cast< const PackedDouble* >(r))->d;
                if( L < R )
                    return -1;
                if( L != R )
                    return 1;
                l += 8; r += 8;
                break;
            }
        case cstring:
            {
                int lsz = *l;
                int rsz = *r;
                int common = min(lsz, rsz);
                l++; r++; // skip the size byte
                // use memcmp as we (will) allow zeros in UTF8 strings
                int res = memcmp(l, r, common);
                if( res )
                    return res;
                // longer string is the greater one
                int diff = lsz-rsz;
                if( diff )
                    return diff;
                l += lsz; r += lsz;  // equal here, so lsz == rsz
                break;
            }
        case cbindata:
            {
                int L = *l;
                int R = *r;
                int llen = binDataCodeToLength(L);
                int diff = L-R; // checks length and subtype simultaneously
                if( diff ) {
                    // unfortunately nibbles are backwards to do subtype and len in one check (could bit swap...)
                    int rlen = binDataCodeToLength(R);
                    if( llen != rlen )
                        return llen - rlen;
                    return diff;
                }
                // same length, same type
                l++; r++;
                int res = memcmp(l, r, llen);
                if( res )
                    return res;
                l += llen; r += llen;
                break;
            }
        case cdate:
            {
                long long L = *((long long *) l);
                long long R = *((long long *) r);
                if( L < R )
                    return -1;
                if( L > R )
                    return 1;
                l += 8; r += 8;
                break;
            }
        case coid:
            {
                int res = memcmp(l, r, sizeof(OID));
                if( res )
                    return res;
                l += 12; r += 12;
                break;
            }
        default:
            // all the others are a match -- e.g. null == null
            ;
        }

        return 0;
    }

    // at least one of this and right are traditional BSON format
    int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const {
        BSONObj L = toBson();
        BSONObj R = right.toBson();
        return L.woCompare(R, order, /*considerfieldname*/false);
    }

    int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
        const unsigned char *l = _keyData;
        const unsigned char *r = right._keyData;

        // If either key is stored as traditional bson, take the slow path.
        if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
            return compareHybrid(right, order);

        unsigned mask = 1;  // tracks which field of the Ordering applies
        while( 1 ) {
            char lval = *l;
            char rval = *r;
            {
                int x = compare(l, r); // updates l and r pointers
                if( x ) {
                    if( order.descending(mask) )
                        x = -x;
                    return x;
                }
            }

            {
                // key with more fields sorts after its prefix
                int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
                if( x )
                    return x;
                if( (lval & cHASMORE) == 0 )
                    break;
            }

            mask <<= 1;
        }

        return 0;
    }

    // Encoded size (including the type byte) for each canonical type; 0 = variable length.
    static unsigned sizes[] = {
        0,
        1, //cminkey=1,
        1, //cnull=2,
        0,
        9, //cdouble=4,
        0,
        0, //cstring=6,
        0,
        13, //coid=8,
        0,
        1, //cfalse=10,
        1, //ctrue=11,
        9, //cdate=12,
        0,
        1, //cmaxkey=14,
        0
    };

    inline unsigned sizeOfElement(const unsigned char *p) {
        unsigned type = *p & cCANONTYPEMASK;
        unsigned sz = sizes[type];
        if( sz == 0 ) {
            // variable-length types carry their length in the following byte
            if( type == cstring ) {
                sz = ((unsigned) p[1]) + 2;
            }
            else {
                verify( type == cbindata );
                sz = binDataCodeToLength(p[1]) + 2;
            }
        }
        return sz;
    }

    int KeyV1::dataSize() const {
        const unsigned char *p = _keyData;
        if( !isCompactFormat() ) {
            // +1 for the leading IsBSON sentinel byte
            return bson().objsize() + 1;
        }

        bool more;
        do {
            unsigned z = sizeOfElement(p);
            more = (*p & cHASMORE) != 0;
            p += z;
        } while( more );
        return p - _keyData;
    }

    bool KeyV1::woEqual(const KeyV1& right) const {
        const unsigned char *l = _keyData;
        const unsigned char *r = right._keyData;

        if( (*l|*r) == IsBSON ) {
            return toBson().equal(right.toBson());
        }

        while( 1 ) {
            char lval = *l;
            char rval = *r;
            if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
                return false;
            l++; r++;
            switch( lval&cCANONTYPEMASK ) {
            case coid:
                if( *((unsigned*) l) != *((unsigned*) r) )
                    return false;
                l += 4; r += 4;
                // deliberate fall-through: the remaining 8 of the OID's 12 bytes
                // are compared by the cdate case below
            case cdate:
                if( *((unsigned long long *) l) != *((unsigned long long *) r) )
                    return false;
                l += 8; r += 8;
                break;
            case cdouble:
                if( (reinterpret_cast< const PackedDouble* > (l))->d != (reinterpret_cast< const PackedDouble* >(r))->d )
                    return false;
                l += 8; r += 8;
                break;
            case cstring:
                {
                    if( *l != *r )
                        return false; // not same length
                    unsigned sz = ((unsigned) *l) + 1;
                    if( memcmp(l, r, sz) )
                        return false;
                    l += sz; r += sz;
                    break;
                }
            case cbindata:
                {
                    if( *l != *r )
                        return false; // len or subtype mismatch
                    int len = binDataCodeToLength(*l) + 1;
                    if( memcmp(l, r, len) )
                        return false;
                    l += len; r += len;
                    break;
                }
            case cminkey:
            case cnull:
            case cfalse:
            case ctrue:
            case cmaxkey:
                break;
            default:
                verify(false);
            }
            if( (lval&cHASMORE) == 0 )
                break;
        }
        return true;
    }

    // Sanity check at startup: strcmp/memcmp must compare bytes as unsigned, which the
    // key comparison code relies on.
    struct CmpUnitTest : public StartupTest {
        void run() {
            char a[2];
            char b[2];
            a[0] = -3;
            a[1] = 0;
            b[0] = 3;
            b[1] = 0;
            verify( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
        }
    } cunittest;

} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/key.h b/src/mongo/db/storage/mmap_v1/btree/key.h
new file mode 100644
index 00000000000..83203b0fee2
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/btree/key.h
@@ -0,0 +1,130 @@
// @file key.h class(es) representing individual keys in a btree

/**
* Copyright (C) 2011 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include "mongo/db/jsobj.h"

namespace mongo {

    /** Key class for precomputing a small format index key that is denser than a traditional BSONObj.

        KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes.

        KeyV1 is the new implementation.
    */
    class KeyBson /* "KeyV0" */ {
    public:
        KeyBson() { }
        explicit KeyBson(const char *keyData) : _o(keyData) { }
        explicit KeyBson(const BSONObj& obj) : _o(obj) { }
        int woCompare(const KeyBson& r, const Ordering &o) const;
        BSONObj toBson() const { return _o; }
        std::string toString() const { return _o.toString(); }
        int dataSize() const { return _o.objsize(); }
        const char * data() const { return _o.objdata(); }
        BSONElement _firstElement() const { return _o.firstElement(); }
        // V0 keys are always stored as plain BSON, never in the compact format.
        bool isCompactFormat() const { return false; }
        bool woEqual(const KeyBson& r) const;
        void assign(const KeyBson& rhs) { *this = rhs; }
        bool isValid() const { return true; }
    private:
        BSONObj _o;
    };

    class KeyV1Owned;

    // corresponding to BtreeData_V1
    class KeyV1 {
        void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer
        KeyV1(const KeyV1Owned&);     // disallowed as this is not a great idea as KeyV1Owned likely will go out of scope
    public:
        KeyV1() { _keyData = 0; }
        // debug builds poison the pointer so use-after-destroy is caught by isValid()/dasserts
        ~KeyV1() { DEV _keyData = (const unsigned char *) 1; }

        KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) {
            dassert( _keyData > (const unsigned char *) 1 );
        }

        // explicit version of operator= to be safe
        void assign(const KeyV1& rhs) {
            _keyData = rhs._keyData;
        }

        /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format.
            when BSON, we are just a wrapper
        */
        explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { }

        int woCompare(const KeyV1& r, const Ordering &o) const;
        bool woEqual(const KeyV1& r) const;
        BSONObj toBson() const;
        std::string toString() const { return toBson().toString(); }

        /** get the key data we want to store in the btree bucket */
        const char * data() const { return (const char *) _keyData; }

        /** @return size of data() */
        int dataSize() const;

        /** only used by geo, which always has bson keys */
        BSONElement _firstElement() const { return bson().firstElement(); }
        // A leading IsBSON (0xff) byte marks a key stored as traditional bson.
        bool isCompactFormat() const { return *_keyData != IsBSON; }

        bool isValid() const { return _keyData > (const unsigned char*)1; }
    protected:
        enum { IsBSON = 0xff };
        const unsigned char *_keyData;
        BSONObj bson() const {
            dassert( !isCompactFormat() );
            return BSONObj((const char *) _keyData+1);
        }
    private:
        int compareHybrid(const KeyV1& right, const Ordering& order) const;
    };

    class KeyV1Owned : public KeyV1 {
        void operator=(const KeyV1Owned&);
    public:
        /** @obj a BSON object to be translated to KeyV1 format.  If the object isn't
            representable in KeyV1 format (which happens, intentionally, at times)
            it will stay as bson herein.
        */
        KeyV1Owned(const BSONObj& obj);

        /** makes a copy (memcpy's the whole thing) */
        KeyV1Owned(const KeyV1& rhs);

    private:
        StackBufBuilder b;
        void traditional(const BSONObj& obj); // store as traditional bson not as compact format
    };

};
diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
new file mode 100644
index 00000000000..07916dc873d
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h
@@ -0,0 +1,180 @@
/* hashtab.h

   Simple, fixed size hash table.  Darn simple.

   Uses a contiguous block of memory, so you can put it in a memory mapped file very easily.
*/

/*    Copyright 2009 10gen Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Affero General Public License, version 3,
 *    as published by the Free Software Foundation.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Affero General Public License for more details.
 *
 *    You should have received a copy of the GNU Affero General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the GNU Affero General Public License in all respects
 *    for all of the code used other than as permitted herein. If you modify
 *    file(s) with this exception, you may extend this exception to your
 *    version of the file(s), but you are not obligated to do so. If you do not
 *    wish to do so, delete this exception statement from your version. If you
 *    delete this exception statement from all source files in the program,
 *    then also delete it in the license file.
 */

#pragma once

#include "mongo/pch.h"
#include <map>
#include "mongo/db/storage/mmap_v1/dur.h"
#include "mongo/db/operation_context.h"
#include "mongo/stdx/functional.h"

namespace mongo {

#pragma pack(1)

    /* you should define:

       int Key::hash() return > 0 always.
    */

    template <class Key,class Type>
    class HashTable : boost::noncopyable {
    public:
        const char *name;
        // One open-addressed bucket. hash == 0 means the slot is unused.
        struct Node {
            int hash;
            Key k;
            Type value;
            bool inUse() {
                return hash != 0;
            }
            void setUnused() {
                hash = 0;
            }
        };
        void* _buf;
        int n; // number of hashtable buckets
        int maxChain;

        // Returns a reference to bucket i within the raw memory block.
        Node& nodes(int i) {
            Node *nodes = (Node *) _buf;
            return nodes[i];
        }

        // Linear-probe lookup. Returns the bucket index holding k (found=true), the
        // first free slot to insert into (found=false), or -1 when the table is full
        // or the probe chain exceeds maxChain with no free slot.
        int _find(const Key& k, bool& found) {
            found = false;
            int h = k.hash();
            int i = h % n;
            int start = i;
            int chain = 0;
            int firstNonUsed = -1;
            while ( 1 ) {
                if ( !nodes(i).inUse() ) {
                    if ( firstNonUsed < 0 )
                        firstNonUsed = i;
                }

                if ( nodes(i).hash == h && nodes(i).k == k ) {
                    if ( chain >= 200 )
                        log() << "warning: hashtable " << name << " long chain " << std::endl;
                    found = true;
                    return i;
                }
                chain++;
                i = (i+1) % n;
                if ( i == start ) {
                    // shouldn't get here / defensive for infinite loops
                    log() << "error: hashtable " << name << " is full n:" << n << std::endl;
                    return -1;
                }
                if( chain >= maxChain ) {
                    if ( firstNonUsed >= 0 )
                        return firstNonUsed;
                    log() << "error: hashtable " << name << " max chain reached:" << maxChain << std::endl;
                    return -1;
                }
            }
        }

    public:
        /* buf must be all zeroes on initialization. */
        HashTable(void* buf, int buflen, const char *_name) : name(_name) {
            int m = sizeof(Node);
            // log() << "hashtab init, buflen:" << buflen << " m:" << m << std::endl;
            n = buflen / m;
            // keep the bucket count odd so probing cycles through all slots
            if ( (n & 1) == 0 )
                n--;
            maxChain = (int) (n * 0.05);
            _buf = buf;
            //nodes = (Node *) buf;

            // Guards the on-disk node layout: a size change would corrupt existing files.
            if ( sizeof(Node) != 628 ) {
                log() << "HashTable() " << _name << " sizeof(node):" << sizeof(Node) << " n:" << n << " sizeof(Key): " << sizeof(Key) << " sizeof(Type):" << sizeof(Type) << std::endl;
                verify( sizeof(Node) == 628 );
            }

        }

        // Returns a pointer to the stored value for k, or 0 if absent.
        Type* get(const Key& k) {
            bool found;
            int i = _find(k, found);
            if ( found )
                return &nodes(i).value;
            return 0;
        }

        // Removes k's entry (if present), journaling the modified node.
        void kill(OperationContext* txn, const Key& k) {
            bool found;
            int i = _find(k, found);
            if ( i >= 0 && found ) {
                Node* n = &nodes(i);
                n = txn->recoveryUnit()->writing(n);
                n->k.kill();
                n->setUnused();
            }
        }

        /** returns false if too full */
        bool put(OperationContext* txn, const Key& k, const Type& value) {
            bool found;
            int i = _find(k, found);
            if ( i < 0 )
                return false;
            Node* n = txn->recoveryUnit()->writing( &nodes(i) );
            if ( !found ) {
                n->k = k;
                n->hash = k.hash();
            }
            else {
                verify( n->hash == k.hash() );
            }
            n->value = value;
            return true;
        }

        typedef stdx::function< void ( const Key& k , Type& v ) > IteratorCallback;
        // Invokes callback for every in-use bucket (unspecified order).
        void iterAll( IteratorCallback callback ) {
            for ( int i=0; i<n; i++ ) {
                if ( nodes(i).inUse() ) {
                    callback( nodes(i).k , nodes(i).value );
                }
            }
        }

    };

#pragma pack()

} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
new file mode 100644
index 00000000000..bc9cc3ee791
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp
@@ -0,0 +1,40 @@
// index_details.cpp

/**
*    Copyright (C) 2008 10gen Inc.
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#include "mongo/db/storage/mmap_v1/catalog/index_details.h" + +namespace mongo { + + void IndexDetails::_reset() { + head.setInvalid(); + info.setInvalid(); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.h b/src/mongo/db/storage/mmap_v1/catalog/index_details.h new file mode 100644 index 00000000000..b2f34ec0681 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/index_details.h @@ -0,0 +1,69 @@ +// index_details.h + +/** +* Copyright (C) 2008 10gen Inc. 
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include "mongo/db/diskloc.h"

namespace mongo {

    /* Details about a particular index. There is one of these effectively for each object in
       system.namespaces (although this also includes the head pointer, which is not in that
       collection).

       This is an internal part of the catalog. Nothing outside of the catalog should use this.

       ** MemoryMapped Record ** (i.e., this is on disk data -- do not change the
       field layout or size)
     */
    struct IndexDetails {
        /**
         * btree head disk location
         */
        DiskLoc head;

        /* Location of index info object. Format:

             { name:"nameofindex", ns:"parentnsname", key: {keypattobject}
               [, unique: <bool>, background: <bool>, v:<version>]
             }

           This object is in the system.indexes collection. Note that since we
           have a pointer to the object here, the object in system.indexes MUST NEVER MOVE.
        */
        DiskLoc info;

        /**
         * makes head and info invalid
         */
        void _reset();

    };

} // namespace mongo

// namespace-inl.h

/**
* Copyright (C) 2009 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version.
If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +namespace mongo { + + inline Namespace& Namespace::operator=(const StringData& ns) { + // we fill the remaining space with all zeroes here. as the full Namespace struct is in + // the datafiles (the .ns files specifically), that is helpful as then they are deterministic + // in the bytes they have for a given sequence of operations. that makes testing and debugging + // the data files easier. + // + // if profiling indicates this method is a significant bottleneck, we could have a version we + // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes. + // + memset( buf, 0, sizeof(buf) ); + uassert( 10080 , "ns name too long, max size is 127 bytes", ns.size() <= MaxNsLen); + uassert( 17380 , "ns name can't contain embedded '\0' byte", ns.find('\0') == std::string::npos); + ns.copyTo( buf, true ); + return *this; + } + + inline std::string Namespace::extraName(int i) const { + char ex[] = "$extra"; + ex[5] += i; + std::string s = std::string(buf) + ex; + massert( 10348 , "$extra: ns name too long", s.size() <= MaxNsLen); + return s; + } + + inline bool Namespace::isExtra() const { + const char *p = strstr(buf, "$extr"); + return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example + } + + inline int Namespace::hash() const { + unsigned x = 0; + const char *p = buf; + while ( *p ) { + x = x * 131 + *p; + p++; + } + return (x & 0x7fffffff) | 0x8000000; // must be > 0 + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp new file mode 100644 index 00000000000..822ed26dedb --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace.cpp @@ -0,0 +1,49 @@ +// namespace.cpp + +/** +* Copyright (C) 2008 10gen Inc. 
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/pch.h"

#include "mongo/db/storage/mmap_v1/catalog/namespace.h"

#include <boost/static_assert.hpp>

#include "mongo/db/namespace_string.h"

namespace mongo {
    namespace {
        // Compile-time checks that the on-disk Namespace struct keeps its
        // fixed 128-byte layout and that its length limits stay in lock-step
        // with the limits NamespaceString enforces at the logical layer.
        BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 );
        BOOST_STATIC_ASSERT( Namespace::MaxNsLenWithNUL == MaxDatabaseNameLen );
        BOOST_STATIC_ASSERT((int)Namespace::MaxNsLenWithNUL == (int)NamespaceString::MaxNsLenWithNUL);
        BOOST_STATIC_ASSERT((int)Namespace::MaxNsLen == (int)NamespaceString::MaxNsLen);
        // Note the typo. ("Colletion" is the historical spelling of the
        // Namespace-side constant; it is kept for source compatibility.)
        BOOST_STATIC_ASSERT((int)Namespace::MaxNsColletionLen == (int)NamespaceString::MaxNsCollectionLen);
    }
}

// namespace.h

/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include <cstring>
#include <string>

#include "mongo/base/string_data.h"

namespace mongo {

#pragma pack(1)
    /**
     * This is used for storing a namespace on disk in a fixed width form
     * it should only be used for that, not for passing internally
     * for that, please use NamespaceString
     */
    class Namespace {
    public:
        Namespace(const StringData& ns) { *this = ns; }
        Namespace& operator=(const StringData& ns);

        // Overwrites the first byte with 0x7f -- presumably a sentinel that no
        // live ns name starts with; NOTE(review): confirm against the readers.
        void kill() { buf[0] = 0x7f; }

        bool operator==(const char *r) const { return strcmp(buf, r) == 0; }
        bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; }
        bool operator!=(const char *r) const { return strcmp(buf, r) != 0; }
        bool operator!=(const Namespace& r) const { return strcmp(buf, r.buf) != 0; }

        bool hasDollarSign() const { return strchr( buf , '$' ) != NULL; }

        int hash() const; // value returned is always > 0

        size_t size() const { return strlen( buf ); }

        std::string toString() const { return buf; }
        operator std::string() const { return buf; }

        /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes
           (more than 10 IndexDetails).  It's a bit hacky because of this late addition with backward
           file support. */
        std::string extraName(int i) const;
        bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */

        enum MaxNsLenValue {
            // Maximum possible length of name any namespace, including special ones like $extra.
            // This includes room for the NUL byte so it can be used when sizing buffers.
            MaxNsLenWithNUL = 128,

            // MaxNsLenWithNUL excluding the NUL byte. Use this when comparing std::string lengths.
            MaxNsLen = MaxNsLenWithNUL - 1,

            // Maximum allowed length of fully qualified namespace name of any real collection.
            // Does not include NUL so it can be directly compared to std::string lengths.
            // (The identifier keeps its historical misspelling; namespace.cpp
            // static-asserts it against NamespaceString::MaxNsCollectionLen.)
            MaxNsColletionLen = MaxNsLen - 7/*strlen(".$extra")*/,
        };
    private:
        // NUL-terminated name; remaining bytes are zeroed by operator= so the
        // on-disk representation is deterministic.
        char buf[MaxNsLenWithNUL];
    };
#pragma pack()

} // namespace mongo

#include "mongo/db/storage/mmap_v1/catalog/namespace-inl.h"

/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/pch.h"

#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"

#include <algorithm>
#include <list>

#include "mongo/base/counter.h"
#include "mongo/db/catalog/collection.h"
#include "mongo/db/catalog/collection_options.h"
#include "mongo/db/clientcursor.h"
#include "mongo/db/commands/server_status.h"
#include "mongo/db/db.h"
#include "mongo/db/index_legacy.h"
#include "mongo/db/json.h"
#include "mongo/db/ops/delete.h"
#include "mongo/db/ops/update.h"
#include "mongo/db/storage/mmap_v1/catalog/hashtab.h"
#include "mongo/db/operation_context.h"
#include "mongo/scripting/engine.h"
#include "mongo/util/startup_test.h"


namespace mongo {


    BSONObj idKeyPattern = fromjson("{\"_id\":1}");

    /**
     * Initializes an on-disk NamespaceDetails record in place.  This struct
     * lives in a memory-mapped .ns file, so every field must be set here
     * explicitly -- it does not default to zeroes the way we use it.
     *
     * @param loc    initial extent location; used for firstExtent, lastExtent
     *               and capExtent.
     * @param capped whether this namespace is a capped collection.
     */
    NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool capped ) {
        BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) );

        /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */
        firstExtent = lastExtent = capExtent = loc;
        stats.datasize = stats.nrecords = 0;
        lastExtentSize = 0;
        nIndexes = 0;
        isCapped = capped;
        maxDocsInCapped = 0x7fffffff; // no limit (value is for pre-v2.3.2 compatibility)
        paddingFactor = 1.0;
        systemFlagsOldDoNotUse = 0;
        userFlags = 0;
        capFirstNewRecord = DiskLoc();
        // Signal that we are on first allocation iteration through extents.
        capFirstNewRecord.setInvalid();
        // For capped case, signal that we are doing initial extent allocation.
        if ( capped ) {
            // WAS: cappedLastDelRecLastExtent().setInvalid();
            deletedList[1].setInvalid();
        }
        verify( sizeof(_dataFileVersion) == 2 );
        _dataFileVersion = 0;
        _indexFileVersion = 0;
        multiKeyIndexBits = 0;
        _reservedA = 0;
        _extraOffset = 0;
        indexBuildsInProgress = 0;
        memset(_reserved, 0, sizeof(_reserved));
    }

    /**
     * Allocates the i-th $extra block (i derived from nindexessofar) for this
     * namespace, registers it in the NamespaceIndex under the "<ns>$extra"
     * name, and links it into the Extra chain via journaled writes.
     * Returns the new Extra block.
     */
    NamespaceDetails::Extra* NamespaceDetails::allocExtra( OperationContext* txn,
                                                           const StringData& ns,
                                                           NamespaceIndex& ni,
                                                           int nindexessofar) {
        txn->lockState()->assertWriteLocked(ns);

        // Which extra block this is: 0 for indexes 10..39, 1 for 40..69.
        int i = (nindexessofar - NIndexesBase) / NIndexesExtra;
        verify( i >= 0 && i <= 1 );

        Namespace fullns( ns );
        Namespace extrans( fullns.extraName(i) ); // throws UserException if ns name too long

        massert( 10350, "allocExtra: base ns missing?", this );
        massert( 10351, "allocExtra: extra already exists", ni.details(extrans) == 0 );

        Extra temp;
        temp.init();

        // The Extra is stored inside the same hashtable value slot as a
        // NamespaceDetails, hence the reinterpret_casts (the BOOST_STATIC_ASSERT
        // in the constructor guarantees it fits).
        ni.add_ns( txn, extrans, reinterpret_cast<NamespaceDetails*>( &temp ) );
        Extra* e = reinterpret_cast<NamespaceDetails::Extra*>( ni.details( extrans ) );

        // Link the new block: either as the head (_extraOffset) or at the tail
        // of the existing chain.  Offsets are relative to 'this'.
        long ofs = e->ofsFrom(this);
        if( i == 0 ) {
            verify( _extraOffset == 0 );
            *txn->recoveryUnit()->writing(&_extraOffset) = ofs;
            verify( extra() == e );
        }
        else {
            Extra *hd = extra();
            verify( hd->next(this) == 0 );
            hd->setNext(txn, ofs);
        }
        return e;
    }

    /**
     * Returns the IndexDetails slot for index number idxNo.  Slots 0..9 live
     * inline in _indexes; higher slots live in the chained Extra blocks.
     * When missingExpected is true a missing Extra block raises
     * MsgAssertionException; otherwise it trips a massert.
     */
    IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) {
        if( idxNo < NIndexesBase ) {
            IndexDetails& id = _indexes[idxNo];
            return id;
        }
        Extra *e = extra();
        if ( ! e ) {
            if ( missingExpected )
                throw MsgAssertionException( 13283 , "Missing Extra" );
            massert(14045, "missing Extra", e);
        }
        int i = idxNo - NIndexesBase;
        if( i >= NIndexesExtra ) {
            // Index lives in the second Extra block.
            e = e->next(this);
            if ( ! e ) {
                if ( missingExpected )
                    throw MsgAssertionException( 14823 , "missing extra" );
                massert(14824, "missing Extra", e);
            }
            i -= NIndexesExtra;
        }
        return e->details[i];
    }


    // Const overload of the above; identical slot-resolution logic.
    const IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) const {
        if( idxNo < NIndexesBase ) {
            const IndexDetails& id = _indexes[idxNo];
            return id;
        }
        const Extra *e = extra();
        if ( ! e ) {
            if ( missingExpected )
                throw MsgAssertionException( 17421 , "Missing Extra" );
            massert(17422, "missing Extra", e);
        }
        int i = idxNo - NIndexesBase;
        if( i >= NIndexesExtra ) {
            e = e->next(this);
            if ( ! e ) {
                if ( missingExpected )
                    throw MsgAssertionException( 17423 , "missing extra" );
                massert(17424, "missing Extra", e);
            }
            i -= NIndexesExtra;
        }
        return e->details[i];
    }

    // Iterates the first nIndexes slots, optionally also the
    // indexBuildsInProgress slots that follow them.
    NamespaceDetails::IndexIterator::IndexIterator(const NamespaceDetails *_d,
                                                   bool includeBackgroundInProgress) {
        d = _d;
        i = 0;
        n = d->nIndexes;
        if ( includeBackgroundInProgress )
            n += d->indexBuildsInProgress;
    }

    // must be called when renaming a NS to fix up extra
    // Rebuilds this (destination) record's Extra chain by allocating fresh
    // blocks under the new name and copying each of src's blocks into them.
    void NamespaceDetails::copyingFrom( OperationContext* txn,
                                        const StringData& thisns,
                                        NamespaceIndex& ni,
                                        NamespaceDetails* src) {
        _extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below.
        Extra *se = src->extra();
        int n = NIndexesBase;
        if( se ) {
            Extra *e = allocExtra(txn, thisns, ni, n);
            while( 1 ) {
                n += NIndexesExtra;
                e->copy(this, *se);
                se = se->next(src);
                if( se == 0 ) break;
                Extra *nxt = allocExtra(txn, thisns, ni, n);
                e->setNext( txn, nxt->ofsFrom(this) );
                e = nxt;
            }
            verify( _extraOffset );
        }
    }

    // Declares intent (to the recovery unit) to write this record only, not
    // its chained Extra blocks.
    NamespaceDetails* NamespaceDetails::writingWithoutExtra( OperationContext* txn ) {
        return txn->recoveryUnit()->writing( this );
    }


    // XXX - this method should go away
    /** Make all linked Extra objects writeable as well */
    NamespaceDetails *NamespaceDetails::writingWithExtra( OperationContext* txn ) {
        for( Extra *e = extra(); e; e = e->next( this ) ) {
            txn->recoveryUnit()->writing( e );
        }
        return writingWithoutExtra( txn );
    }

    // Sets the capped-collection document limit after validating the range
    // (must be < 2^31, or -1 for unlimited).
    void NamespaceDetails::setMaxCappedDocs( OperationContext* txn, long long max ) {
        massert( 16499,
                 "max in a capped collection has to be < 2^31 or -1",
                 CollectionOptions::validMaxCappedDocs( &max ) );
        maxDocsInCapped = max;
    }

    /* ------------------------------------------------------------------------- */


    // Linear scan of the index slots for one whose info document's "name"
    // field matches; returns the slot number or -1.
    int NamespaceDetails::_catalogFindIndexByName(const Collection* coll,
                                                  const StringData& name,
                                                  bool includeBackgroundInProgress) const {
        IndexIterator i = ii(includeBackgroundInProgress);
        while( i.more() ) {
            const BSONObj obj = coll->docFor(i.next().info);
            if ( name == obj.getStringField("name") )
                return i.pos()-1;
        }
        return -1;
    }

    // Journaled update of the chain link (offset relative to the owning
    // NamespaceDetails).
    void NamespaceDetails::Extra::setNext( OperationContext* txn,
                                           long ofs ) {
        *txn->recoveryUnit()->writing(&_next) = ofs;
    }

} // namespace mongo

/**
 * Copyright (C) 2008 10gen Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the GNU Affero General Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#pragma once

#include "mongo/db/diskloc.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/storage/mmap_v1/catalog/index_details.h"
#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h"

namespace mongo {

    class Collection;
    class OperationContext;

    /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes
       so you can look for a deleterecord about the right size.
    */
    const int Buckets = 19;
    const int MaxBucket = 18;

    extern int bucketSizes[];

#pragma pack(1)
    /* NamespaceDetails : this is the "header" for a collection that has all its details.
       It's in the .ns file and this is a memory mapped region (thus the pack pragma above).

       The byte offsets noted below are part of the on-disk format: do not
       reorder, resize, or insert fields (total size is static-asserted to 496
       at the bottom of this file).
    */
    class NamespaceDetails {
    public:
        enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase  = 10 };



        /*-------- data fields, as present on disk : */

        DiskLoc firstExtent;
        DiskLoc lastExtent;

        /* NOTE: capped collections v1 override the meaning of deletedList.
                 deletedList[0] points to a list of free records (DeletedRecord's) for all extents in
                 the capped namespace.
                 deletedList[1] points to the last record in the prev extent.  When the "current extent"
                 changes, this value is updated.  !deletedList[1].isValid() when this value is not
                 yet computed.
        */
        DiskLoc deletedList[Buckets];

        // ofs 168 (8 byte aligned)
        struct Stats {
            // datasize and nrecords MUST Be adjacent code assumes!
            long long datasize; // this includes padding, but not record headers
            long long nrecords;
        } stats;


        int lastExtentSize;

        int nIndexes;

        // ofs 192
        IndexDetails _indexes[NIndexesBase];

    public:
        // ofs 352 (16 byte aligned)
        int isCapped;                         // there is wasted space here if I'm right (ERH)

        int maxDocsInCapped;                  // max # of objects for a capped table, -1 for inf.

        double paddingFactor;                 // 1.0 = no padding.
        // ofs 368 (16)
        int systemFlagsOldDoNotUse;           // things that the system sets/cares about

        DiskLoc capExtent;                    // the "current" extent we're writing too for a capped collection
        DiskLoc capFirstNewRecord;

        unsigned short _dataFileVersion;      // NamespaceDetails version.  So we can do backward compatibility in the future. See filever.h
        unsigned short _indexFileVersion;

        unsigned long long multiKeyIndexBits; // bit i set => index i is multikey

        // ofs 400 (16)
        unsigned long long _reservedA;
        long long _extraOffset;               // where the $extra info is located (bytes relative to this)

    public:
        int indexBuildsInProgress;            // Number of indexes currently being built

        int userFlags;

        char _reserved[72];
        /*-------- end data 496 bytes */
    public:
        explicit NamespaceDetails( const DiskLoc &loc, bool _capped );

        /**
         * Overflow storage for indexes beyond the 10 inline slots; chained via
         * _next (byte offset relative to the owning NamespaceDetails, 0 = end).
         * Also on-disk format.
         */
        class Extra {
            long long _next;
        public:
            IndexDetails details[NIndexesExtra];
        private:
            unsigned reserved2;
            unsigned reserved3;
            Extra(const Extra&) { verify(false); }
            Extra& operator=(const Extra& r) { verify(false); return *this; }
        public:
            Extra() { }
            long ofsFrom(NamespaceDetails *d) {
                return ((char *) this) - ((char *) d);
            }
            void init() { memset(this, 0, sizeof(Extra)); }
            Extra* next(const NamespaceDetails *d) const {
                if( _next == 0 ) return 0;
                return (Extra*) (((char *) d) + _next);
            }
            void setNext(OperationContext* txn, long ofs);
            void copy(NamespaceDetails *d, const Extra& e) {
                memcpy(this, &e, sizeof(Extra));
                _next = 0;
            }
        };
        Extra* extra() const {
            if( _extraOffset == 0 ) return 0;
            return (Extra *) (((char *) this) + _extraOffset);
        }
        /* add extra space for indexes when more than 10 */
        Extra* allocExtra( OperationContext* txn,
                           const StringData& ns,
                           NamespaceIndex& ni,
                           int nindexessofar );

        void copyingFrom( OperationContext* txn,
                          const StringData& thisns,
                          NamespaceIndex& ni,
                          NamespaceDetails *src); // must be called when renaming a NS to fix up extra

    public:
        void setMaxCappedDocs( OperationContext* txn, long long max );

        enum UserFlags {
            Flag_UsePowerOf2Sizes = 1 << 0
        };

        IndexDetails& idx(int idxNo, bool missingExpected = false );
        const IndexDetails& idx(int idxNo, bool missingExpected = false ) const;

        /**
         * Forward iterator over the index slots (inline + Extra), in slot
         * order.  See the constructor for which slots are included.
         */
        class IndexIterator {
        public:
            int pos() { return i; } // note this is the next one to come
            bool more() { return i < n; }
            const IndexDetails& next() { return d->idx(i++); }
        private:
            friend class NamespaceDetails;
            int i, n;
            const NamespaceDetails *d;
            IndexIterator(const NamespaceDetails *_d, bool includeBackgroundInProgress);
        };

        IndexIterator ii( bool includeBackgroundInProgress = false ) const {
            return IndexIterator(this, includeBackgroundInProgress);
        }

        /**
         * This fetches the IndexDetails for the next empty index slot. The caller must populate
         * returned object. This handles allocating extra index space, if necessary.
         */
        IndexDetails& getNextIndexDetails(OperationContext* txn, Collection* collection);

        NamespaceDetails *writingWithoutExtra( OperationContext* txn );

        /** Make all linked Extra objects writeable as well */
        NamespaceDetails *writingWithExtra( OperationContext* txn );

        /**
         * Returns the offset of the specified index name within the array of indexes. Must be
         * passed-in the owning collection to resolve the index record entries to objects.
         *
         * @return > 0 if index name was found, -1 otherwise.
         */
        int _catalogFindIndexByName(const Collection* coll,
                                    const StringData& name,
                                    bool includeBackgroundInProgress) const;

    private:

        /**
         * swaps all meta data for 2 indexes
         * a and b are 2 index ids, whose contents will be swapped
         * must have a lock on the entire collection to do this
         */
        void swapIndex( OperationContext* txn, int a, int b );

        friend class IndexCatalog;
        friend class IndexCatalogEntry;

        /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */
        void cappedTruncateLastDelUpdate();
        BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 );
        BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits
        BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 );
    }; // NamespaceDetails
    BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 );
#pragma pack()

} // namespace mongo

// namespace_details_collection_entry.h

/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h"

#include "mongo/db/index/index_descriptor.h"
#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h"
#include "mongo/db/storage/record_store.h"
#include "mongo/util/startup_test.h"

namespace mongo {
    // Adapter exposing an mmap_v1 NamespaceDetails record (plus the record
    // store holding its index-info documents) through the CollectionCatalogEntry
    // interface.  Does not own 'details', 'indexRecordStore' or 'db'.
    NamespaceDetailsCollectionCatalogEntry::NamespaceDetailsCollectionCatalogEntry( const StringData& ns,
                                                                                    NamespaceDetails* details,
                                                                                    RecordStore* indexRecordStore,
                                                                                    MMAPV1DatabaseCatalogEntry* db )
        : CollectionCatalogEntry( ns ),
          _details( details ),
          _indexRecordStore( indexRecordStore ),
          _db( db ) {
    }

    // Options are owned by the database-level catalog entry; delegate.
    CollectionOptions NamespaceDetailsCollectionCatalogEntry::getCollectionOptions(OperationContext* txn) const {
        return _db->getCollectionOptions( txn, ns().ns() );
    }

    // Completed indexes plus those still being built in the background.
    int NamespaceDetailsCollectionCatalogEntry::getTotalIndexCount() const {
        return _details->nIndexes + _details->indexBuildsInProgress;
    }

    int NamespaceDetailsCollectionCatalogEntry::getCompletedIndexCount() const {
        return _details->nIndexes;
    }

    int NamespaceDetailsCollectionCatalogEntry::getMaxAllowedIndexes() const {
        return NamespaceDetails::NIndexesMax;
    }

    // Appends the "name" field of every index (including in-progress builds)
    // to 'names'.
    void NamespaceDetailsCollectionCatalogEntry::getAllIndexes( std::vector<std::string>* names ) const {
        NamespaceDetails::IndexIterator i = _details->ii( true );
        while ( i.more() ) {
            const IndexDetails& id = i.next();
            const BSONObj obj = _indexRecordStore->dataFor( id.info ).toBson();
            names->push_back( obj.getStringField("name") );
        }
    }

    bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(const StringData& idxName) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        return isIndexMultikey( idxNo );
    }

    // Index idxNo is multikey iff its bit is set in multiKeyIndexBits.
    bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(int idxNo) const {
        return (_details->multiKeyIndexBits & (((unsigned long long) 1) << idxNo)) != 0;
    }

    bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
                                                                    const StringData& indexName,
                                                                    bool multikey ) {

        int idxNo = _findIndexNumber( indexName );
        invariant( idxNo >= 0 );
        return setIndexIsMultikey( txn, idxNo, multikey );
    }

    // Sets or clears index idxNo's multikey bit via a journaled write.
    // Returns false (without writing) when the bit already has the requested
    // value, true when it was changed.
    bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* txn,
                                                                    int idxNo,
                                                                    bool multikey ) {
        unsigned long long mask = 1ULL << idxNo;

        if (multikey) {
            // Shortcut if the bit is already set correctly
            if (_details->multiKeyIndexBits & mask) {
                return false;
            }

            *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) |= mask;
        }
        else {
            // Shortcut if the bit is already set correctly
            if (!(_details->multiKeyIndexBits & mask)) {
                return false;
            }

            // Invert mask: all 1's except a 0 at the ith bit
            mask = ~mask;
            *txn->recoveryUnit()->writing(&_details->multiKeyIndexBits) &= mask;
        }

        return true;
    }

    DiskLoc NamespaceDetailsCollectionCatalogEntry::getIndexHead( const StringData& idxName ) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        return _details->idx( idxNo ).head;
    }

    // Materializes the index's info document from the index record store.
    BSONObj NamespaceDetailsCollectionCatalogEntry::getIndexSpec( const StringData& idxName ) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        const IndexDetails& id = _details->idx( idxNo );
        return _indexRecordStore->dataFor( id.info ).toBson();
    }

    void NamespaceDetailsCollectionCatalogEntry::setIndexHead( OperationContext* txn,
                                                               const StringData& idxName,
                                                               const DiskLoc& newHead ) {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        *txn->recoveryUnit()->writing( &_details->idx( idxNo ).head) = newHead;
    }

    // An index is ready iff its slot precedes the in-progress range
    // (completed slots come first, builds-in-progress after them).
    bool NamespaceDetailsCollectionCatalogEntry::isIndexReady( const StringData& idxName ) const {
        int idxNo = _findIndexNumber( idxName );
        invariant( idxNo >= 0 );
        return idxNo < getCompletedIndexCount();
    }

    // Scans all index slots (including in-progress) for one whose info
    // document's "name" matches; returns the slot number or -1.
    int NamespaceDetailsCollectionCatalogEntry::_findIndexNumber( const StringData& idxName ) const {
        NamespaceDetails::IndexIterator i = _details->ii( true );
        while ( i.more() ) {
            const IndexDetails& id = i.next();
            int idxNo = i.pos() - 1;
            const BSONObj obj = _indexRecordStore->dataFor( id.info ).toBson();
            if ( idxName == obj.getStringField("name") )
                return idxNo;
        }
        return -1;
    }

    /* remove bit from a bit array - actually remove its slot, not a clear
       note: this function does not work with x == 63 -- that is ok
             but keep in mind in the future if max indexes were extended to
             exactly 64 it would be a problem
    */
    unsigned long long removeAndSlideBit(unsigned long long b, int x) {
        unsigned long long tmp = b;
        return
            (tmp & ((((unsigned long long) 1) << x)-1)) |
            ((tmp >> (x+1)) << x);
    }

    // Startup-time sanity checks for removeAndSlideBit.
    class IndexUpdateTest : public StartupTest {
    public:
        void run() {
            verify( removeAndSlideBit(1, 0) == 0 );
            verify( removeAndSlideBit(2, 0) == 1 );
            verify( removeAndSlideBit(2, 1) == 0 );
            verify( removeAndSlideBit(255, 1) == 127 );
            verify( removeAndSlideBit(21, 2) == 9 );
            verify( removeAndSlideBit(0x4000000000000001ULL, 62) == 1 );
        }
    } iu_unittest;

    Status NamespaceDetailsCollectionCatalogEntry::removeIndex(
OperationContext* txn, + const StringData& indexName ) { + int idxNo = _findIndexNumber( indexName ); + if ( idxNo < 0 ) + return Status( ErrorCodes::NamespaceNotFound, "index not found to remove" ); + + DiskLoc infoLocation = _details->idx( idxNo ).info; + + { // sanity check + BSONObj info = _indexRecordStore->dataFor( infoLocation ).toBson(); + invariant( info["name"].String() == indexName ); + } + + { // drop the namespace + string indexNamespace = IndexDescriptor::makeIndexNamespace( ns().ns(), indexName ); + Status status = _db->dropCollection( txn, indexNamespace ); + if ( !status.isOK() ) { + return status; + } + } + + { // all info in the .ns file + NamespaceDetails* d = _details->writingWithExtra( txn ); + + // fix the _multiKeyIndexBits, by moving all bits above me down one + d->multiKeyIndexBits = removeAndSlideBit(d->multiKeyIndexBits, idxNo); + + if ( idxNo >= d->nIndexes ) + d->indexBuildsInProgress--; + else + d->nIndexes--; + + for ( int i = idxNo; i < getTotalIndexCount(); i++ ) + d->idx(i) = d->idx(i+1); + + d->idx( getTotalIndexCount() ) = IndexDetails(); + } + + // remove from system.indexes + _indexRecordStore->deleteRecord( txn, infoLocation ); + + return Status::OK(); + } + + Status NamespaceDetailsCollectionCatalogEntry::prepareForIndexBuild( OperationContext* txn, + const IndexDescriptor* desc ) { + BSONObj spec = desc->infoObj(); + // 1) entry in system.indexs + StatusWith<DiskLoc> systemIndexesEntry = _indexRecordStore->insertRecord( txn, + spec.objdata(), + spec.objsize(), + -1 ); + if ( !systemIndexesEntry.isOK() ) + return systemIndexesEntry.getStatus(); + + // 2) NamespaceDetails mods + IndexDetails *id; + try { + id = &_details->idx(getTotalIndexCount(), true); + } + catch( DBException& ) { + _details->allocExtra(txn, + ns().ns(), + _db->_namespaceIndex, + getTotalIndexCount()); + id = &_details->idx(getTotalIndexCount(), false); + } + + *txn->recoveryUnit()->writing( &id->info ) = systemIndexesEntry.getValue(); + 
*txn->recoveryUnit()->writing( &id->head ) = DiskLoc(); + + txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) += 1; + + // 3) indexes entry in .ns file + NamespaceIndex& nsi = _db->_namespaceIndex; + invariant( nsi.details( desc->indexNamespace() ) == NULL ); + nsi.add_ns( txn, desc->indexNamespace(), DiskLoc(), false ); + + // 4) system.namespaces entry index ns + _db->_addNamespaceToNamespaceCollection( txn, desc->indexNamespace(), NULL); + + return Status::OK(); + } + + void NamespaceDetailsCollectionCatalogEntry::indexBuildSuccess( OperationContext* txn, + const StringData& indexName ) { + int idxNo = _findIndexNumber( indexName ); + fassert( 17202, idxNo >= 0 ); + + // Make sure the newly created index is relocated to nIndexes, if it isn't already there + if ( idxNo != getCompletedIndexCount() ) { + int toIdxNo = getCompletedIndexCount(); + + //_details->swapIndex( txn, idxNo, toIdxNo ); + + // flip main meta data + IndexDetails temp = _details->idx(idxNo); + *txn->recoveryUnit()->writing(&_details->idx(idxNo)) = _details->idx(toIdxNo); + *txn->recoveryUnit()->writing(&_details->idx(toIdxNo)) = temp; + + // flip multi key bits + bool tempMultikey = isIndexMultikey(idxNo); + setIndexIsMultikey( txn, idxNo, isIndexMultikey(toIdxNo) ); + setIndexIsMultikey( txn, toIdxNo, tempMultikey ); + + idxNo = toIdxNo; + invariant( idxNo = _findIndexNumber( indexName ) ); + } + + txn->recoveryUnit()->writingInt( _details->indexBuildsInProgress ) -= 1; + txn->recoveryUnit()->writingInt( _details->nIndexes ) += 1; + + invariant( isIndexReady( indexName ) ); + } + + void NamespaceDetailsCollectionCatalogEntry::updateTTLSetting( OperationContext* txn, + const StringData& idxName, + long long newExpireSeconds ) { + int idx = _findIndexNumber( idxName ); + invariant( idx >= 0 ); + + IndexDetails& indexDetails = _details->idx( idx ); + + BSONObj obj = _indexRecordStore->dataFor( indexDetails.info ).toBson(); + const BSONElement oldExpireSecs = 
obj.getField("expireAfterSeconds"); + + // Important that we set the new value in-place. We are writing directly to the + // object here so must be careful not to overwrite with a longer numeric type. + + char* nonConstPtr = const_cast<char*>(oldExpireSecs.value()); + switch( oldExpireSecs.type() ) { + case EOO: + massert( 16631, "index does not have an 'expireAfterSeconds' field", false ); + break; + case NumberInt: + *txn->recoveryUnit()->writing(reinterpret_cast<int*>(nonConstPtr)) = newExpireSeconds; + break; + case NumberDouble: + *txn->recoveryUnit()->writing(reinterpret_cast<double*>(nonConstPtr)) = newExpireSeconds; + break; + case NumberLong: + *txn->recoveryUnit()->writing(reinterpret_cast<long long*>(nonConstPtr)) = newExpireSeconds; + break; + default: + massert( 16632, "current 'expireAfterSeconds' is not a number", false ); + } + } + + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h new file mode 100644 index 00000000000..78a5b96f181 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h @@ -0,0 +1,109 @@ +// namespace_details_collection_entry.h + +#pragma once + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#include "mongo/base/string_data.h" +#include "mongo/bson/bsonobj.h" +#include "mongo/db/catalog/collection_catalog_entry.h" +#include "mongo/db/diskloc.h" + +namespace mongo { + + class NamespaceDetails; + + class MMAPV1DatabaseCatalogEntry;; + class RecordStore; + class OperationContext; + + class NamespaceDetailsCollectionCatalogEntry : public CollectionCatalogEntry { + public: + NamespaceDetailsCollectionCatalogEntry( const StringData& ns, + NamespaceDetails* details, + RecordStore* indexRecordStore, + MMAPV1DatabaseCatalogEntry* db ); + + virtual ~NamespaceDetailsCollectionCatalogEntry(){} + + virtual CollectionOptions getCollectionOptions(OperationContext* txn) const; + + virtual int getTotalIndexCount() const; + + virtual int getCompletedIndexCount() const; + + virtual int getMaxAllowedIndexes() const; + + virtual void getAllIndexes( std::vector<std::string>* names ) const; + + virtual BSONObj getIndexSpec( const StringData& idxName ) const; + + virtual bool isIndexMultikey(const StringData& indexName) const; + virtual bool isIndexMultikey(int idxNo) const; + + virtual bool setIndexIsMultikey(OperationContext* txn, + int idxNo, + bool multikey = true); + virtual bool 
setIndexIsMultikey(OperationContext* txn, + const StringData& indexName, + bool multikey = true); + + virtual DiskLoc getIndexHead( const StringData& indexName ) const; + + virtual void setIndexHead( OperationContext* txn, + const StringData& indexName, + const DiskLoc& newHead ); + + virtual bool isIndexReady( const StringData& indexName ) const; + + virtual Status removeIndex( OperationContext* txn, + const StringData& indexName ); + + virtual Status prepareForIndexBuild( OperationContext* txn, + const IndexDescriptor* spec ); + + virtual void indexBuildSuccess( OperationContext* txn, + const StringData& indexName ); + + virtual void updateTTLSetting( OperationContext* txn, + const StringData& idxName, + long long newExpireSeconds ); + + // not part of interface, but available to my storage engine + + int _findIndexNumber( const StringData& indexName) const; + + private: + NamespaceDetails* _details; + RecordStore* _indexRecordStore; + MMAPV1DatabaseCatalogEntry* _db; + + friend class MMAPV1DatabaseCatalogEntry; + }; +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp new file mode 100644 index 00000000000..2f168bd19a6 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp @@ -0,0 +1,225 @@ +// namespace_details_rsv1_metadata.cpp + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/ops/update.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" + +namespace mongo { + NamespaceDetailsRSV1MetaData::NamespaceDetailsRSV1MetaData( const StringData& ns, + NamespaceDetails* details, + RecordStore* namespaceRecordStore ) + : _ns( ns.toString() ), + _details( details ), + _namespaceRecordStore( namespaceRecordStore ) { + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::capExtent() const { + return _details->capExtent; + } + + void NamespaceDetailsRSV1MetaData::setCapExtent( OperationContext* txn, const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->capExtent ) = loc; + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::capFirstNewRecord() const { + return _details->capFirstNewRecord; + } + + void NamespaceDetailsRSV1MetaData::setCapFirstNewRecord( OperationContext* txn, + const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->capFirstNewRecord ) = loc; + } + + bool NamespaceDetailsRSV1MetaData::capLooped() const { + return 
_details->capFirstNewRecord.isValid(); + } + + long long NamespaceDetailsRSV1MetaData::dataSize() const { + return _details->stats.datasize; + } + long long NamespaceDetailsRSV1MetaData::numRecords() const { + return _details->stats.nrecords; + } + + void NamespaceDetailsRSV1MetaData::incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) { + // durability todo : this could be a bit annoying / slow to record constantly + NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats ); + s->datasize += dataSizeIncrement; + s->nrecords += numRecordsIncrement; + } + + void NamespaceDetailsRSV1MetaData::setStats( OperationContext* txn, + long long dataSize, + long long numRecords ) { + NamespaceDetails::Stats* s = txn->recoveryUnit()->writing( &_details->stats ); + s->datasize = dataSize; + s->nrecords = numRecords; + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::deletedListEntry( int bucket ) const { + return _details->deletedList[ bucket ]; + } + + void NamespaceDetailsRSV1MetaData::setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->deletedList[bucket] ) = loc; + } + + void NamespaceDetailsRSV1MetaData::orphanDeletedList( OperationContext* txn ) { + for( int i = 0; i < Buckets; i++ ) { + setDeletedListEntry( txn, i, DiskLoc() ); + } + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::firstExtent( OperationContext* txn ) const { + return _details->firstExtent; + } + + void NamespaceDetailsRSV1MetaData::setFirstExtent( OperationContext* txn, const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->firstExtent ) = loc; + } + + const DiskLoc& NamespaceDetailsRSV1MetaData::lastExtent( OperationContext* txn ) const { + return _details->lastExtent; + } + + void NamespaceDetailsRSV1MetaData::setLastExtent( OperationContext* txn, const DiskLoc& loc ) { + *txn->recoveryUnit()->writing( &_details->lastExtent ) = loc; + } + + 
bool NamespaceDetailsRSV1MetaData::isCapped() const { + return _details->isCapped; + } + + bool NamespaceDetailsRSV1MetaData::isUserFlagSet( int flag ) const { + return _details->userFlags & flag; + } + + int NamespaceDetailsRSV1MetaData::userFlags() const { + return _details->userFlags; + } + + bool NamespaceDetailsRSV1MetaData::setUserFlag( OperationContext* txn, int flag ) { + if ( ( _details->userFlags & flag ) == flag ) + return false; + + txn->recoveryUnit()->writingInt( _details->userFlags) |= flag; + _syncUserFlags( txn ); + return true; + } + + bool NamespaceDetailsRSV1MetaData::clearUserFlag( OperationContext* txn, int flag ) { + if ( ( _details->userFlags & flag ) == 0 ) + return false; + + txn->recoveryUnit()->writingInt(_details->userFlags) &= ~flag; + _syncUserFlags( txn ); + return true; + } + + bool NamespaceDetailsRSV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) { + if ( _details->userFlags == flags ) + return false; + + txn->recoveryUnit()->writingInt(_details->userFlags) = flags; + _syncUserFlags( txn ); + return true; + } + + int NamespaceDetailsRSV1MetaData::lastExtentSize( OperationContext* txn ) const { + return _details->lastExtentSize; + } + + void NamespaceDetailsRSV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) { + if ( _details->lastExtentSize == newMax ) + return; + txn->recoveryUnit()->writingInt(_details->lastExtentSize) = newMax; + } + + long long NamespaceDetailsRSV1MetaData::maxCappedDocs() const { + invariant( _details->isCapped ); + if ( _details->maxDocsInCapped == 0x7fffffff ) + return numeric_limits<long long>::max(); + return _details->maxDocsInCapped; + } + + double NamespaceDetailsRSV1MetaData::paddingFactor() const { + return _details->paddingFactor; + } + + void NamespaceDetailsRSV1MetaData::setPaddingFactor( OperationContext* txn, double paddingFactor ) { + if ( paddingFactor == _details->paddingFactor ) + return; + + if ( _details->isCapped ) + return; + + 
*txn->recoveryUnit()->writing(&_details->paddingFactor) = paddingFactor; + } + + void NamespaceDetailsRSV1MetaData::_syncUserFlags( OperationContext* txn ) { + if ( !_namespaceRecordStore ) + return; + + scoped_ptr<RecordIterator> iterator( _namespaceRecordStore->getIterator( txn, + DiskLoc(), + false, + CollectionScanParams::FORWARD ) ); + while ( !iterator->isEOF() ) { + DiskLoc loc = iterator->getNext(); + + BSONObj oldEntry = iterator->dataFor( loc ).toBson(); + BSONElement e = oldEntry["name"]; + if ( e.type() != String ) + continue; + + if ( e.String() != _ns ) + continue; + + BSONObj newEntry = applyUpdateOperators( oldEntry, + BSON( "$set" << BSON( "options.flags" << userFlags() ) ) ); + + StatusWith<DiskLoc> result = _namespaceRecordStore->updateRecord( txn, + loc, + newEntry.objdata(), + newEntry.objsize(), + -1, + NULL ); + fassert( 17486, result.isOK() ); + return; + } + + fassertFailed( 17488 ); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h new file mode 100644 index 00000000000..9f933d003e5 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h @@ -0,0 +1,111 @@ +// namespace_details_rsv1_metadata.h + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <string> + +#include "mongo/base/string_data.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class RecordStore; + + /* + * NOTE: NamespaceDetails will become a struct + * all dur, etc... 
will move here + */ + class NamespaceDetailsRSV1MetaData : public RecordStoreV1MetaData { + public: + explicit NamespaceDetailsRSV1MetaData( const StringData& ns, + NamespaceDetails* details, + RecordStore* namespaceRecordStore ); + + virtual ~NamespaceDetailsRSV1MetaData(){} + + virtual const DiskLoc& capExtent() const; + virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& capFirstNewRecord() const; + virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ); + + virtual bool capLooped() const; + + virtual long long dataSize() const; + virtual long long numRecords() const; + + virtual void incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ); + + virtual void setStats( OperationContext* txn, + long long dataSize, + long long numRecords ); + + virtual const DiskLoc& deletedListEntry( int bucket ) const; + virtual void setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ); + virtual void orphanDeletedList(OperationContext* txn); + + virtual const DiskLoc& firstExtent( OperationContext* txn ) const; + virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& lastExtent( OperationContext* txn ) const; + virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual bool isCapped() const; + + virtual bool isUserFlagSet( int flag ) const; + virtual int userFlags() const; + virtual bool setUserFlag( OperationContext* txn, int flag ); + virtual bool clearUserFlag( OperationContext* txn, int flag ); + virtual bool replaceUserFlags( OperationContext* txn, int flags ); + + virtual int lastExtentSize( OperationContext* txn ) const; + virtual void setLastExtentSize( OperationContext* txn, int newMax ); + + virtual long long maxCappedDocs() const; + + virtual double paddingFactor() const; + virtual void setPaddingFactor( OperationContext* txn, double paddingFactor ); + + 
private: + + void _syncUserFlags( OperationContext* txn ); + + std::string _ns; + NamespaceDetails* _details; + RecordStore* _namespaceRecordStore; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp new file mode 100644 index 00000000000..9bbf8ef6303 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp @@ -0,0 +1,205 @@ +// namespace_index.cpp + +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" + +#include <boost/filesystem/operations.hpp> + +#include "mongo/db/d_concurrency.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" +#include "mongo/util/exit.h" +#include "mongo/util/log.h" + +namespace mongo { + + MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kIndexing); + + NamespaceDetails* NamespaceIndex::details(const StringData& ns) { + Namespace n(ns); + return details(n); + } + + NamespaceDetails* NamespaceIndex::details(const Namespace& ns) { + if ( !_ht.get() ) + return 0; + return _ht->get(ns); + } + + void NamespaceIndex::add_ns( OperationContext* txn, + const StringData& ns, const DiskLoc& loc, bool capped) { + NamespaceDetails details( loc, capped ); + add_ns( txn, ns, &details ); + } + + void NamespaceIndex::add_ns( OperationContext* txn, + const StringData& ns, const NamespaceDetails* details ) { + Namespace n(ns); + add_ns( txn, n, details ); + } + + void NamespaceIndex::add_ns( OperationContext* txn, + const Namespace& ns, const NamespaceDetails* details ) { + string nsString = ns.toString(); + txn->lockState()->assertWriteLocked( nsString ); + massert( 17315, "no . in ns", nsString.find( '.' ) != string::npos ); + init( txn ); + uassert( 10081, "too many namespaces/collections", _ht->put(txn, ns, *details)); + } + + void NamespaceIndex::kill_ns( OperationContext* txn, const StringData& ns) { + txn->lockState()->assertWriteLocked(ns); + if ( !_ht.get() ) + return; + Namespace n(ns); + _ht->kill(txn, n); + + if (ns.size() <= Namespace::MaxNsColletionLen) { + // Larger namespace names don't have room for $extras so they can't exist. The code + // below would cause an "$extra: ns too large" error and stacktrace to be printed to the + // log even though everything is fine. 
+ for( int i = 0; i<=1; i++ ) { + try { + Namespace extra(n.extraName(i)); + _ht->kill(txn, extra); + } + catch(DBException&) { + LOG(3) << "caught exception in kill_ns" << endl; + } + } + } + } + + bool NamespaceIndex::pathExists() const { + return boost::filesystem::exists(path()); + } + + boost::filesystem::path NamespaceIndex::path() const { + boost::filesystem::path ret( _dir ); + if (storageGlobalParams.directoryperdb) + ret /= _database; + ret /= ( _database + ".ns" ); + return ret; + } + + static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , list<string>* l ) { + if ( ! k.hasDollarSign() || k == "local.oplog.$main" ) { + // we call out local.oplog.$main specifically as its the only "normal" + // collection that has a $, so we make sure it gets added + l->push_back( k.toString() ); + } + } + + void NamespaceIndex::getCollectionNamespaces( list<string>* tofill ) const { + if ( _ht.get() ) + _ht->iterAll( stdx::bind( namespaceGetNamespacesCallback, + stdx::placeholders::_1, stdx::placeholders::_2, tofill) ); + } + + void NamespaceIndex::maybeMkdir() const { + if (!storageGlobalParams.directoryperdb) + return; + boost::filesystem::path dir( _dir ); + dir /= _database; + if ( !boost::filesystem::exists( dir ) ) + MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " ); + } + + NOINLINE_DECL void NamespaceIndex::_init( OperationContext* txn ) { + verify( !_ht.get() ); + + txn->lockState()->assertWriteLocked(_database); + + /* if someone manually deleted the datafiles for a database, + we need to be sure to clear any cached info for the database in + local.*. 
+ */ + /* + if ( "local" != _database ) { + DBInfo i(_database.c_str()); + i.dbDropped(); + } + */ + + unsigned long long len = 0; + boost::filesystem::path nsPath = path(); + string pathString = nsPath.string(); + void *p = 0; + if ( boost::filesystem::exists(nsPath) ) { + if( _f.open(pathString, true) ) { + len = _f.length(); + if ( len % (1024*1024) != 0 ) { + log() << "bad .ns file: " << pathString << endl; + uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 ); + } + p = _f.getView(); + } + } + else { + // use storageGlobalParams.lenForNewNsFiles, we are making a new database + massert(10343, "bad storageGlobalParams.lenForNewNsFiles", + storageGlobalParams.lenForNewNsFiles >= 1024*1024); + maybeMkdir(); + unsigned long long l = storageGlobalParams.lenForNewNsFiles; + if ( _f.create(pathString, l, true) ) { + // The writes done in this function must not be rolled back. If the containing + // UnitOfWork rolls back it should roll back to the state *after* these writes. This + // will leave the file empty, but available for future use. That is why we go + // directly to the global dur dirty list rather than going through the + // OperationContext. + getDur().createdFile(pathString, l); // always a new file + len = l; + verify(len == storageGlobalParams.lenForNewNsFiles); + p = _f.getView(); + + if ( p ) { + // we do this so the durability system isn't mad at us for + // only initiating file and not doing a write + // grep for 17388 + getDur().writingPtr( p, 5 ); // throw away + } + } + } + + if ( p == 0 ) { + /** TODO: this shouldn't terminate? 
*/ + log() << "error couldn't open file " << pathString << " terminating" << endl; + dbexit( EXIT_FS ); + } + + + verify( len <= 0x7fffffff ); + _ht.reset(new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index")); + } + + +} + diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h new file mode 100644 index 00000000000..3ce2c2e0194 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h @@ -0,0 +1,94 @@ +// namespace_index.h + +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#pragma once + +#include <list> +#include <string> + +#include "mongo/base/disallow_copying.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/mmap_v1/catalog/hashtab.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" + +namespace mongo { + + class NamespaceDetails; + class OperationContext; + + /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog" + if you will: at least the core parts. (Additional info in system.* collections.) + */ + class NamespaceIndex { + MONGO_DISALLOW_COPYING(NamespaceIndex); + public: + NamespaceIndex(const std::string &dir, const std::string &database) : + _ht( 0 ), _dir( dir ), _database( database ) {} + + /* returns true if the file represented by this file exists on disk */ + bool pathExists() const; + + void init( OperationContext* txn ) { + if ( !_ht.get() ) + _init( txn ); + } + + void add_ns( OperationContext* txn, + const StringData& ns, const DiskLoc& loc, bool capped); + void add_ns( OperationContext* txn, + const StringData& ns, const NamespaceDetails* details ); + void add_ns( OperationContext* txn, + const Namespace& ns, const NamespaceDetails* details ); + + NamespaceDetails* details(const StringData& ns); + NamespaceDetails* details(const Namespace& ns); + + void kill_ns( OperationContext* txn, + const StringData& ns); + + bool allocated() const { return _ht.get() != 0; } + + void getCollectionNamespaces( std::list<std::string>* tofill ) const; + + boost::filesystem::path path() const; + + unsigned long long fileLength() const { return _f.length(); } + + private: + void _init( OperationContext* txn ); + void maybeMkdir() const; + + DurableMappedFile _f; + scoped_ptr<HashTable<Namespace,NamespaceDetails> > _ht; + std::string _dir; + std::string _database; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp new file mode 100644 index 00000000000..7c50b86a5bf --- /dev/null +++ 
b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp @@ -0,0 +1,67 @@ +// namespace_test.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#include "mongo/unittest/unittest.h" + +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" + +namespace mongo { + + TEST( NamespaceTest, Basics ) { + Namespace foo( "foo.bar" ); + Namespace bar( "bar.foo" ); + + ASSERT_EQUALS( foo.toString(), foo.toString() ); + ASSERT_EQUALS( foo.hash(), foo.hash() ); + + ASSERT_NOT_EQUALS( foo.hash(), bar.hash() ); + + ASSERT( foo == foo ); + ASSERT( !( foo != foo ) ); + ASSERT( foo != bar ); + ASSERT( !( foo == bar ) ); + } + + TEST( NamespaceTest, ExtraName ) { + Namespace foo( "foo.bar" ); + ASSERT_FALSE( foo.isExtra() ); + + string str0 = foo.extraName( 0 ); + ASSERT_EQUALS( "foo.bar$extra", str0 ); + Namespace ex0( str0 ); + ASSERT_TRUE( ex0.isExtra() ); + + string str1 = foo.extraName( 1 ); + ASSERT_EQUALS( "foo.bar$extrb", str1 ); + Namespace ex1( str1 ); + ASSERT_TRUE( ex1.isExtra() ); + + } +} diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.cpp b/src/mongo/db/storage/mmap_v1/dur_recover.cpp index 9d4e679808a..52836e7977f 100644 --- a/src/mongo/db/storage/mmap_v1/dur_recover.cpp +++ b/src/mongo/db/storage/mmap_v1/dur_recover.cpp @@ -40,6 +40,7 @@ #include "mongo/db/catalog/database.h" #include "mongo/db/db.h" #include "mongo/db/storage/storage_engine.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" #include "mongo/db/storage/mmap_v1/dur.h" #include "mongo/db/storage/mmap_v1/dur_commitjob.h" #include "mongo/db/storage/mmap_v1/dur_journal.h" diff --git a/src/mongo/db/storage/mmap_v1/extent.h b/src/mongo/db/storage/mmap_v1/extent.h index 8a27e271c04..f009e283380 100644 --- a/src/mongo/db/storage/mmap_v1/extent.h +++ b/src/mongo/db/storage/mmap_v1/extent.h @@ -34,7 +34,7 @@ #include <vector> #include "mongo/db/diskloc.h" -#include "mongo/db/structure/catalog/namespace.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace.h" namespace mongo { diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp index 
303ac49e507..f8ca6265c5f 100644 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp +++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp @@ -42,11 +42,12 @@ #include "mongo/db/pdfile_version.h" #include "mongo/db/server_parameters.h" #include "mongo/db/storage/mmap_v1/data_file.h" -#include "mongo/db/structure/catalog/namespace_details.h" -#include "mongo/db/structure/catalog/namespace_details_collection_entry.h" -#include "mongo/db/structure/catalog/namespace_details_rsv1_metadata.h" -#include "mongo/db/structure/record_store_v1_capped.h" -#include "mongo/db/structure/record_store_v1_simple.h" +#include "mongo/db/storage/mmap_v1/btree/btree_interface.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" namespace mongo { @@ -444,7 +445,7 @@ namespace mongo { void MMAPV1DatabaseCatalogEntry::_lazyInit( OperationContext* txn ) { // this is sort of insane - // it's because the whole structure is highly recursive + // it's because the whole storage/mmap_v1 is highly recursive _namespaceIndex.init( txn ); @@ -682,13 +683,13 @@ namespace mongo { rs = entry->recordStore.get(); } - std::auto_ptr<BtreeInterface> btree( - BtreeInterface::getInterface(entry->headManager(), - rs, - entry->ordering(), - entry->descriptor()->indexNamespace(), - entry->descriptor()->version(), - &BtreeBasedAccessMethod::invalidateCursors)); + std::auto_ptr<SortedDataInterface> btree( + getMMAPV1Interface(entry->headManager(), + rs, + entry->ordering(), + entry->descriptor()->indexNamespace(), + entry->descriptor()->version(), + &BtreeBasedAccessMethod::invalidateCursors)); if (IndexNames::HASHED == type) return new HashAccessMethod( entry, btree.release() ); diff 
--git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h index 16a88b84ede..fa5a5874061 100644 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h +++ b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h @@ -33,8 +33,8 @@ #include "mongo/base/status.h" #include "mongo/base/string_data.h" #include "mongo/db/catalog/database_catalog_entry.h" +#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" #include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h" -#include "mongo/db/structure/catalog/namespace_index.h" namespace mongo { diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp new file mode 100644 index 00000000000..3a1bed72dd9 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp @@ -0,0 +1,974 @@ +// record_store_v1_base.cpp + +/** + * Copyright (C) 2013-2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. 
You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/concurrency/lock_mgr.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h" +#include "mongo/util/progress_meter.h" +#include "mongo/util/timer.h" +#include "mongo/util/touch_pages.h" + +namespace mongo { + + const int RecordStoreV1Base::Buckets = 19; + const int RecordStoreV1Base::MaxBucket = 18; + + /* Deleted list buckets are used to quickly locate free space based on size. Each bucket + contains records up to that size. All records >= 4mb are placed into the 16mb bucket. 
+ */ + const int RecordStoreV1Base::bucketSizes[] = { + 0x20, 0x40, 0x80, 0x100, // 32, 64, 128, 256 + 0x200, 0x400, 0x800, 0x1000, // 512, 1K, 2K, 4K + 0x2000, 0x4000, 0x8000, 0x10000, // 8K, 16K, 32K, 64K + 0x20000, 0x40000, 0x80000, 0x100000, // 128K, 256K, 512K, 1M + 0x200000, 0x400000, 0x1000000, // 2M, 4M, 16M (see above) + }; + + + RecordStoreV1Base::RecordStoreV1Base( const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ) + : RecordStore( ns ), + _details( details ), + _extentManager( em ), + _isSystemIndexes( isSystemIndexes ) { + } + + RecordStoreV1Base::~RecordStoreV1Base() { + } + + + int64_t RecordStoreV1Base::storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo, + int level ) const { + BSONArrayBuilder extentInfo; + + int64_t total = 0; + int n = 0; + + DiskLoc cur = _details->firstExtent(txn); + + while ( !cur.isNull() ) { + Extent* e = _extentManager->getExtent( cur ); + + total += e->length; + n++; + + if ( extraInfo && level > 0 ) { + extentInfo.append( BSON( "len" << e->length << "loc: " << e->myLoc.toBSONObj() ) ); + } + cur = e->xnext; + } + + if ( extraInfo ) { + extraInfo->append( "numExtents", n ); + if ( level > 0 ) + extraInfo->append( "extents", extentInfo.arr() ); + } + + return total; + } + + RecordData RecordStoreV1Base::dataFor( const DiskLoc& loc ) const { + return recordFor(loc)->toRecordData(); + } + + Record* RecordStoreV1Base::recordFor( const DiskLoc& loc ) const { + return _extentManager->recordForV1( loc ); + } + + const DeletedRecord* RecordStoreV1Base::deletedRecordFor( const DiskLoc& loc ) const { + invariant( loc.a() != -1 ); + return reinterpret_cast<const DeletedRecord*>( recordFor( loc ) ); + } + + DeletedRecord* RecordStoreV1Base::drec( const DiskLoc& loc ) const { + invariant( loc.a() != -1 ); + return reinterpret_cast<DeletedRecord*>( recordFor( loc ) ); + } + + Extent* RecordStoreV1Base::_getExtent( OperationContext* txn, const DiskLoc& loc ) const { + return 
_extentManager->getExtent( loc ); + } + + DiskLoc RecordStoreV1Base::_getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const { + return _extentManager->extentLocForV1( loc ); + } + + + DiskLoc RecordStoreV1Base::getNextRecord( OperationContext* txn, const DiskLoc& loc ) const { + DiskLoc next = getNextRecordInExtent( txn, loc ); + if ( !next.isNull() ) { + return next; + } + + // now traverse extents + + Extent* e = _getExtent( txn, _getExtentLocForRecord(txn, loc) ); + while ( 1 ) { + if ( e->xnext.isNull() ) + return DiskLoc(); // end of collection + e = _getExtent( txn, e->xnext ); + if ( !e->firstRecord.isNull() ) + break; + // entire extent could be empty, keep looking + } + return e->firstRecord; + } + + DiskLoc RecordStoreV1Base::getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const { + DiskLoc prev = getPrevRecordInExtent( txn, loc ); + if ( !prev.isNull() ) { + return prev; + } + + // now traverse extents + + Extent *e = _getExtent(txn, _getExtentLocForRecord(txn, loc)); + while ( 1 ) { + if ( e->xprev.isNull() ) + return DiskLoc(); // end of collection + e = _getExtent( txn, e->xprev ); + if ( !e->firstRecord.isNull() ) + break; + // entire extent could be empty, keep looking + } + return e->lastRecord; + + } + + DiskLoc RecordStoreV1Base::_findFirstSpot( OperationContext* txn, + const DiskLoc& extDiskLoc, + Extent* e ) { + DiskLoc emptyLoc = extDiskLoc; + emptyLoc.inc( Extent::HeaderSize() ); + int delRecLength = e->length - Extent::HeaderSize(); + if ( delRecLength >= 32*1024 && _ns.find('$') != string::npos && !isCapped() ) { + // probably an index. 
so skip forward to keep its records page aligned + int& ofs = emptyLoc.GETOFS(); + int newOfs = (ofs + 0xfff) & ~0xfff; + delRecLength -= (newOfs-ofs); + dassert( delRecLength > 0 ); + ofs = newOfs; + } + + DeletedRecord* empty = txn->recoveryUnit()->writing(drec(emptyLoc)); + empty->lengthWithHeaders() = delRecLength; + empty->extentOfs() = e->myLoc.getOfs(); + empty->nextDeleted().Null(); + return emptyLoc; + + } + + DiskLoc RecordStoreV1Base::getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const { + int nextOffset = recordFor( loc )->nextOfs(); + + if ( nextOffset == DiskLoc::NullOfs ) + return DiskLoc(); + + fassert( 17441, abs(nextOffset) >= 8 ); // defensive + DiskLoc result( loc.a(), nextOffset ); + return result; + } + + DiskLoc RecordStoreV1Base::getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const { + int prevOffset = recordFor( loc )->prevOfs(); + + if ( prevOffset == DiskLoc::NullOfs ) + return DiskLoc(); + + fassert( 17442, abs(prevOffset) >= 8 ); // defensive + DiskLoc result( loc.a(), prevOffset ); + return result; + } + + + StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ) { + int docSize = doc->documentSize(); + if ( docSize < 4 ) { + return StatusWith<DiskLoc>( ErrorCodes::InvalidLength, + "record has to be >= 4 bytes" ); + } + int lenWHdr = docSize + Record::HeaderSize; + if ( doc->addPadding() ) + lenWHdr = getRecordAllocationSize( lenWHdr ); + + StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota ); + if ( !loc.isOK() ) + return loc; + + Record *r = recordFor( loc.getValue() ); + fassert( 17319, r->lengthWithHeaders() >= lenWHdr ); + + r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) ); + doc->writeDocument( r->data() ); + + _addRecordToRecListInExtent(txn, r, loc.getValue()); + + _details->incrementStats( txn, r->netLength(), 1 ); + + _paddingFits( txn ); + + return loc; + } + + + 
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ) { + if ( len < 4 ) { + return StatusWith<DiskLoc>( ErrorCodes::InvalidLength, + "record has to be >= 4 bytes" ); + } + + StatusWith<DiskLoc> status = _insertRecord( txn, data, len, enforceQuota ); + if ( status.isOK() ) + _paddingFits( txn ); + + return status; + } + + StatusWith<DiskLoc> RecordStoreV1Base::_insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ) { + + int lenWHdr = getRecordAllocationSize( len + Record::HeaderSize ); + fassert( 17208, lenWHdr >= ( len + Record::HeaderSize ) ); + + StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota ); + if ( !loc.isOK() ) + return loc; + + Record *r = recordFor( loc.getValue() ); + fassert( 17210, r->lengthWithHeaders() >= lenWHdr ); + + // copy the data + r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) ); + memcpy( r->data(), data, len ); + + _addRecordToRecListInExtent(txn, r, loc.getValue()); + + _details->incrementStats( txn, r->netLength(), 1 ); + + return loc; + } + + StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int dataSize, + bool enforceQuota, + UpdateMoveNotifier* notifier ) { + Record* oldRecord = recordFor( oldLocation ); + if ( oldRecord->netLength() >= dataSize ) { + // we fit + _paddingFits( txn ); + memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize ); + return StatusWith<DiskLoc>( oldLocation ); + } + + if ( isCapped() ) + return StatusWith<DiskLoc>( ErrorCodes::InternalError, + "failing update: objects in a capped ns cannot grow", + 10003 ); + + // we have to move + + _paddingTooSmall( txn ); + + StatusWith<DiskLoc> newLocation = _insertRecord( txn, data, dataSize, enforceQuota ); + if ( !newLocation.isOK() ) + return newLocation; + + // insert worked, so we delete old record + 
if ( notifier ) { + Status moveStatus = notifier->recordStoreGoingToMove( txn, + oldLocation, + oldRecord->data(), + oldRecord->netLength() ); + if ( !moveStatus.isOK() ) + return StatusWith<DiskLoc>( moveStatus ); + } + + deleteRecord( txn, oldLocation ); + + return newLocation; + } + + + Status RecordStoreV1Base::updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damageSource, + const mutablebson::DamageVector& damages ) { + _paddingFits( txn ); + + Record* rec = recordFor( loc ); + char* root = rec->data(); + + // All updates were in place. Apply them via durability and writing pointer. + mutablebson::DamageVector::const_iterator where = damages.begin(); + const mutablebson::DamageVector::const_iterator end = damages.end(); + for( ; where != end; ++where ) { + const char* sourcePtr = damageSource + where->sourceOffset; + void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size); + std::memcpy(targetPtr, sourcePtr, where->size); + } + + return Status::OK(); + } + + void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) { + + Record* todelete = recordFor( dl ); + invariant( todelete->netLength() >= 4 ); // this is required for defensive code + + /* remove ourself from the record next/prev chain */ + { + if ( todelete->prevOfs() != DiskLoc::NullOfs ) { + DiskLoc prev = getPrevRecordInExtent( txn, dl ); + Record* prevRecord = recordFor( prev ); + txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs(); + } + + if ( todelete->nextOfs() != DiskLoc::NullOfs ) { + DiskLoc next = getNextRecord( txn, dl ); + Record* nextRecord = recordFor( next ); + txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs(); + } + } + + /* remove ourself from extent pointers */ + { + DiskLoc extentLoc = todelete->myExtentLoc(dl); + Extent *e = _getExtent( txn, extentLoc ); + if ( e->firstRecord == dl ) { + txn->recoveryUnit()->writing(&e->firstRecord); + if ( 
todelete->nextOfs() == DiskLoc::NullOfs ) + e->firstRecord.Null(); + else + e->firstRecord.set(dl.a(), todelete->nextOfs() ); + } + if ( e->lastRecord == dl ) { + txn->recoveryUnit()->writing(&e->lastRecord); + if ( todelete->prevOfs() == DiskLoc::NullOfs ) + e->lastRecord.Null(); + else + e->lastRecord.set(dl.a(), todelete->prevOfs() ); + } + } + + /* add to the free list */ + { + _details->incrementStats( txn, -1 * todelete->netLength(), -1 ); + + if ( _isSystemIndexes ) { + /* temp: if in system.indexes, don't reuse, and zero out: we want to be + careful until validated more, as IndexDetails has pointers + to this disk location. so an incorrectly done remove would cause + a lot of problems. + */ + memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ), + 0, todelete->lengthWithHeaders() ); + } + else { + // this is defensive so we can detect if we are still using a location + // that was deleted + memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4); + addDeletedRec(txn, dl); + } + } + + } + + RecordIterator* RecordStoreV1Base::getIteratorForRepair(OperationContext* txn) const { + return new RecordStoreV1RepairIterator(txn, this); + } + + void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn, + Record *r, + DiskLoc loc) { + dassert( recordFor(loc) == r ); + DiskLoc extentLoc = _getExtentLocForRecord( txn, loc ); + Extent *e = _getExtent( txn, extentLoc ); + if ( e->lastRecord.isNull() ) { + *txn->recoveryUnit()->writing(&e->firstRecord) = loc; + *txn->recoveryUnit()->writing(&e->lastRecord) = loc; + r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs; + } + else { + Record *oldlast = recordFor(e->lastRecord); + r->prevOfs() = e->lastRecord.getOfs(); + r->nextOfs() = DiskLoc::NullOfs; + txn->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs(); + *txn->recoveryUnit()->writing(&e->lastRecord) = loc; + } + } + + void RecordStoreV1Base::increaseStorageSize( OperationContext* txn, + int size, + bool 
enforceQuota ) { + DiskLoc eloc = _extentManager->allocateExtent( txn, + isCapped(), + size, + enforceQuota ); + Extent *e = _extentManager->getExtent( eloc ); + invariant( e ); + + *txn->recoveryUnit()->writing( &e->nsDiagnostic ) = _ns; + + txn->recoveryUnit()->writing( &e->xnext )->Null(); + txn->recoveryUnit()->writing( &e->xprev )->Null(); + txn->recoveryUnit()->writing( &e->firstRecord )->Null(); + txn->recoveryUnit()->writing( &e->lastRecord )->Null(); + + DiskLoc emptyLoc = _findFirstSpot( txn, eloc, e ); + + if ( _details->lastExtent(txn).isNull() ) { + invariant( _details->firstExtent(txn).isNull() ); + _details->setFirstExtent( txn, eloc ); + _details->setLastExtent( txn, eloc ); + _details->setCapExtent( txn, eloc ); + invariant( e->xprev.isNull() ); + invariant( e->xnext.isNull() ); + } + else { + invariant( !_details->firstExtent(txn).isNull() ); + *txn->recoveryUnit()->writing(&e->xprev) = _details->lastExtent(txn); + *txn->recoveryUnit()->writing(&_extentManager->getExtent(_details->lastExtent(txn))->xnext) = eloc; + _details->setLastExtent( txn, eloc ); + } + + _details->setLastExtentSize( txn, e->length ); + + addDeletedRec(txn, emptyLoc); + } + + Status RecordStoreV1Base::validate( OperationContext* txn, + bool full, bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const { + + // 1) basic status that require no iteration + // 2) extent level info + // 3) check extent start and end + // 4) check each non-deleted record + // 5) check deleted list + + // ------------- + + // 1111111111111111111 + if ( isCapped() ){ + output->appendBool("capped", true); + output->appendNumber("max", _details->maxCappedDocs()); + } + + output->appendNumber("datasize", _details->dataSize()); + output->appendNumber("nrecords", _details->numRecords()); + output->appendNumber("lastExtentSize", _details->lastExtentSize(txn)); + output->appendNumber("padding", _details->paddingFactor()); + + if ( 
_details->firstExtent(txn).isNull() ) + output->append( "firstExtent", "null" ); + else + output->append( "firstExtent", + str::stream() << _details->firstExtent(txn).toString() + << " ns:" + << _getExtent( txn, _details->firstExtent(txn) )->nsDiagnostic.toString()); + if ( _details->lastExtent(txn).isNull() ) + output->append( "lastExtent", "null" ); + else + output->append( "lastExtent", str::stream() << _details->lastExtent(txn).toString() + << " ns:" + << _getExtent( txn, _details->lastExtent(txn) )->nsDiagnostic.toString()); + + // 22222222222222222222222222 + { // validate extent basics + BSONArrayBuilder extentData; + int extentCount = 0; + DiskLoc extentDiskLoc; + try { + if ( !_details->firstExtent(txn).isNull() ) { + _getExtent( txn, _details->firstExtent(txn) )->assertOk(); + _getExtent( txn, _details->lastExtent(txn) )->assertOk(); + } + + extentDiskLoc = _details->firstExtent(txn); + while (!extentDiskLoc.isNull()) { + Extent* thisExtent = _getExtent( txn, extentDiskLoc ); + if (full) { + extentData << thisExtent->dump(); + } + if (!thisExtent->validates(extentDiskLoc, &results->errors)) { + results->valid = false; + } + DiskLoc nextDiskLoc = thisExtent->xnext; + + if (extentCount > 0 && !nextDiskLoc.isNull() + && _getExtent( txn, nextDiskLoc )->xprev != extentDiskLoc) { + StringBuilder sb; + sb << "'xprev' pointer " << _getExtent( txn, nextDiskLoc )->xprev.toString() + << " in extent " << nextDiskLoc.toString() + << " does not point to extent " << extentDiskLoc.toString(); + results->errors.push_back( sb.str() ); + results->valid = false; + } + if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) { + StringBuilder sb; + sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString() + << " does not point to last extent in list " << extentDiskLoc.toString(); + results->errors.push_back( sb.str() ); + results->valid = false; + } + extentDiskLoc = nextDiskLoc; + extentCount++; + txn->checkForInterrupt(); + } + } + catch (const 
DBException& e) { + StringBuilder sb; + sb << "exception validating extent " << extentCount + << ": " << e.what(); + results->errors.push_back( sb.str() ); + results->valid = false; + return Status::OK(); + } + output->append("extentCount", extentCount); + + if ( full ) + output->appendArray( "extents" , extentData.arr() ); + + } + + try { + // 333333333333333333333333333 + bool testingLastExtent = false; + try { + DiskLoc firstExtentLoc = _details->firstExtent(txn); + if (firstExtentLoc.isNull()) { + // this is ok + } + else { + output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump()); + if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) { + StringBuilder sb; + sb << "'xprev' pointer in 'firstExtent' " << _details->firstExtent(txn).toString() + << " is " << _getExtent(txn, firstExtentLoc)->xprev.toString() + << ", should be null"; + results->errors.push_back( sb.str() ); + results->valid = false; + } + } + testingLastExtent = true; + DiskLoc lastExtentLoc = _details->lastExtent(txn); + if (lastExtentLoc.isNull()) { + // this is ok + } + else { + if (firstExtentLoc != lastExtentLoc) { + output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump()); + if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) { + StringBuilder sb; + sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString() + << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString() + << ", should be null"; + results->errors.push_back( sb.str() ); + results->valid = false; + } + } + } + } + catch (const DBException& e) { + StringBuilder sb; + sb << "exception processing '" + << (testingLastExtent ? 
"lastExtent" : "firstExtent") + << "': " << e.what(); + results->errors.push_back( sb.str() ); + results->valid = false; + } + + // 4444444444444444444444444 + + set<DiskLoc> recs; + if( scanData ) { + int n = 0; + int nInvalid = 0; + long long nQuantizedSize = 0; + long long nPowerOf2QuantizedSize = 0; + long long len = 0; + long long nlen = 0; + long long bsonLen = 0; + int outOfOrder = 0; + DiskLoc cl_last; + + scoped_ptr<RecordIterator> iterator( getIterator( txn, + DiskLoc(), + false, + CollectionScanParams::FORWARD ) ); + DiskLoc cl; + while ( !( cl = iterator->getNext() ).isNull() ) { + n++; + + if ( n < 1000000 ) + recs.insert(cl); + if ( isCapped() ) { + if ( cl < cl_last ) + outOfOrder++; + cl_last = cl; + } + + Record *r = recordFor(cl); + len += r->lengthWithHeaders(); + nlen += r->netLength(); + + if ( r->lengthWithHeaders() == + quantizeAllocationSpace( r->lengthWithHeaders() ) ) { + // Count the number of records having a size consistent with + // the quantizeAllocationSpace quantization implementation. + ++nQuantizedSize; + } + + if ( r->lengthWithHeaders() == + quantizePowerOf2AllocationSpace( r->lengthWithHeaders() ) ) { + // Count the number of records having a size consistent with the + // quantizePowerOf2AllocationSpace quantization implementation. 
+ ++nPowerOf2QuantizedSize; + } + + if (full){ + size_t dataSize = 0; + const Status status = adaptor->validate( r->toRecordData(), &dataSize ); + if (!status.isOK()) { + results->valid = false; + if (nInvalid == 0) // only log once; + results->errors.push_back( "invalid object detected (see logs)" ); + + nInvalid++; + log() << "Invalid object detected in " << _ns + << ": " << status.reason(); + } + else { + bsonLen += dataSize; + } + } + } + + if ( isCapped() && !_details->capLooped() ) { + output->append("cappedOutOfOrder", outOfOrder); + if ( outOfOrder > 1 ) { + results->valid = false; + results->errors.push_back( "too many out of order records" ); + } + } + output->append("objectsFound", n); + + if (full) { + output->append("invalidObjects", nInvalid); + } + + output->appendNumber("nQuantizedSize", nQuantizedSize); + output->appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize); + output->appendNumber("bytesWithHeaders", len); + output->appendNumber("bytesWithoutHeaders", nlen); + + if (full) { + output->appendNumber("bytesBson", bsonLen); + } + } // end scanData + + // 55555555555555555555555555 + BSONArrayBuilder deletedListArray; + for ( int i = 0; i < Buckets; i++ ) { + deletedListArray << _details->deletedListEntry(i).isNull(); + } + + int ndel = 0; + long long delSize = 0; + BSONArrayBuilder delBucketSizes; + int incorrect = 0; + for ( int i = 0; i < Buckets; i++ ) { + DiskLoc loc = _details->deletedListEntry(i); + try { + int k = 0; + while ( !loc.isNull() ) { + if ( recs.count(loc) ) + incorrect++; + ndel++; + + if ( loc.questionable() ) { + if( isCapped() && !loc.isValid() && i == 1 ) { + /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid + see comments in namespace.h + */ + break; + } + + string err( str::stream() << "bad pointer in deleted record list: " + << loc.toString() + << " bucket: " << i + << " k: " << k ); + results->errors.push_back( err ); + results->valid = false; + break; + } + + const 
DeletedRecord* d = deletedRecordFor(loc); + delSize += d->lengthWithHeaders(); + loc = d->nextDeleted(); + k++; + txn->checkForInterrupt(); + } + delBucketSizes << k; + } + catch (...) { + results->errors.push_back( (string)"exception in deleted chain for bucket " + + BSONObjBuilder::numStr(i) ); + results->valid = false; + } + } + output->appendNumber("deletedCount", ndel); + output->appendNumber("deletedSize", delSize); + if ( full ) { + output->append( "delBucketSizes", delBucketSizes.arr() ); + } + + if ( incorrect ) { + results->errors.push_back( BSONObjBuilder::numStr(incorrect) + + " records from datafile are in deleted list" ); + results->valid = false; + } + + } + catch (AssertionException) { + results->errors.push_back( "exception during validate" ); + results->valid = false; + } + + return Status::OK(); + } + + void RecordStoreV1Base::appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const { + result->append( "lastExtentSize", _details->lastExtentSize(txn) / scale ); + result->append( "paddingFactor", _details->paddingFactor() ); + result->append( "userFlags", _details->userFlags() ); + + if ( isCapped() ) { + result->appendBool( "capped", true ); + result->appendNumber( "max", _details->maxCappedDocs() ); + } + } + + + namespace { + struct touch_location { + const char* root; + size_t length; + }; + } + + Status RecordStoreV1Base::touch( OperationContext* txn, BSONObjBuilder* output ) const { + Timer t; + + std::vector<touch_location> ranges; + { + DiskLoc nextLoc = _details->firstExtent(txn); + Extent* ext = _getExtent( txn, nextLoc ); + while ( ext ) { + touch_location tl; + tl.root = reinterpret_cast<const char*>(ext); + tl.length = ext->length; + ranges.push_back(tl); + + nextLoc = ext->xnext; + if ( nextLoc.isNull() ) + ext = NULL; + else + ext = _getExtent( txn, nextLoc ); + } + } + + std::string progress_msg = "touch " + std::string(txn->getNS()) + " extents"; + ProgressMeterHolder 
        pm(*txn->setMessage(progress_msg.c_str(),
                            "Touch Progress",
                            ranges.size()));

        // Pre-fault every extent into memory, one progress tick per extent; honor
        // interrupts between extents so a long touch can be killed.
        for ( std::vector<touch_location>::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
            touch_pages( it->root, it->length );
            pm.hit();
            txn->checkForInterrupt();
        }
        pm.finished();

        if ( output ) {
            output->append( "numRanges", static_cast<int>( ranges.size() ) );
            output->append( "millis", t.millis() );
        }

        return Status::OK();
    }

    /**
     * Returns the on-disk allocation size for a record needing at least
     * 'minRecordSize' bytes, after applying this store's padding/quantization policy.
     * The result is always >= minRecordSize.
     */
    int RecordStoreV1Base::getRecordAllocationSize( int minRecordSize ) const {

        // Capped collections never pad: space is reclaimed by cycling old documents,
        // not by growing documents in place.
        if ( isCapped() )
            return minRecordSize;

        invariant( _details->paddingFactor() >= 1 );

        if ( _details->isUserFlagSet( Flag_UsePowerOf2Sizes ) ) {
            // quantize to the nearest bucketSize (or nearest 1mb boundary for large sizes).
            return quantizePowerOf2AllocationSpace(minRecordSize);
        }

        // adjust for padding factor
        return static_cast<int>(minRecordSize * _details->paddingFactor());
    }

    /**
     * Advance within the current extent by following the record's next/prev offset
     * (direction chosen by _forward). Returns the pre-advance position; a null
     * DiskLoc signals EOF.
     */
    DiskLoc RecordStoreV1Base::IntraExtentIterator::getNext() {
        if (_curr.isNull())
            return DiskLoc();

        const DiskLoc out = _curr; // we always return where we were, not where we will be.
        const Record* rec = recordFor(_curr);
        const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs();
        _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs));
        return out;
    }

    // If the record being invalidated is the one we are positioned on, step past it
    // so the iterator never dereferences a deleted location.
    void RecordStoreV1Base::IntraExtentIterator::invalidate(const DiskLoc& dl) {
        if (dl == _curr) {
            getNext();
        }
    }

    /* @return the size for an allocated record quantized to 1/16th of the BucketSize
       @param allocSize requested size to allocate
    */
    int RecordStoreV1Base::quantizeAllocationSpace(int allocSize) {
        const int bucketIdx = bucket(allocSize);
        int bucketSize = bucketSizes[bucketIdx];
        int quantizeUnit = bucketSize / 16;
        if (allocSize >= (1 << 22)) // 4mb
            // all allocations >= 4mb result in 4mb/16 quantization units, even if >= 8mb.
            // idea is to reduce quantization overhead of large records at the cost of
            // increasing the DeletedRecord size distribution in the largest bucket by
            // factor of 4.
            quantizeUnit = (1 << 18); // 256k
        if (allocSize % quantizeUnit == 0)
            // size is already quantized
            return allocSize;
        // Round up to the next multiple of quantizeUnit (quantizeUnit is a power of 2,
        // so OR-ing in the low bits and adding 1 rounds up).
        const int quantizedSpace = (allocSize | (quantizeUnit - 1)) + 1;
        fassert(16484, quantizedSpace >= allocSize);
        return quantizedSpace;
    }

    /**
     * Quantize 'allocSize' up to the smallest bucket size that can hold it.
     * Requests larger than the biggest non-16MB bucket are rounded up to the next
     * 1MB boundary instead.
     */
    int RecordStoreV1Base::quantizePowerOf2AllocationSpace(int allocSize) {
        for ( int i = 0; i < MaxBucket; i++ ) { // skips the largest (16MB) bucket
            if ( bucketSizes[i] >= allocSize ) {
                // Return the size of the first bucket sized >= the requested size.
                return bucketSizes[i];
            }
        }

        // if we get here, it means we're allocating more than 4mb, so round up
        // to the nearest megabyte >= allocSize
        const int MB = 1024*1024;
        invariant(allocSize > 4*MB);
        return (allocSize + (MB - 1)) & ~(MB - 1); // round up to MB alignment
    }

    /* return which "deleted bucket" for this size object */
    int RecordStoreV1Base::bucket(int size) {
        for ( int i = 0; i < Buckets; i++ ) {
            if ( bucketSizes[i] > size ) {
                // Return the first bucket sized _larger_ than the requested size.
                return i;
            }
        }
        return MaxBucket;
    }

    // Called after an update fit in place: slowly decay the padding factor toward 1.0.
    void RecordStoreV1Base::_paddingFits( OperationContext* txn ) {
        MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
            double x = max(1.0, _details->paddingFactor() - 0.001 );
            _details->setPaddingFactor( txn, x );
        }
    }

    // Called when an update forced a document move: bump the padding factor (capped
    // at 2.0) so future allocations leave more slack.
    void RecordStoreV1Base::_paddingTooSmall( OperationContext* txn ) {
        MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less
            /* the more indexes we have, the higher the cost of a move. so we take that into
               account herein. note on a move that insert() calls paddingFits(), thus
               here for example with no inserts and nIndexes = 1 we have
               .001*4-.001 or a 3:1 ratio to non moves -> 75% nonmoves. insert heavy
               can push this down considerably.
               further tweaking will be a good idea but
               this should be an adequate starting point.
            */
            double N = 4; // magic
            double x = min(2.0,_details->paddingFactor() + (0.001 * N));
            _details->setPaddingFactor( txn, x );
        }
    }

    /**
     * Handles collMod-style custom options. Currently only "usePowerOf2Sizes" is
     * recognized; when the flag actually changes, the old and new values are
     * recorded in 'info'. Unknown options return InvalidOptions.
     */
    Status RecordStoreV1Base::setCustomOption( OperationContext* txn,
                                               const BSONElement& option,
                                               BSONObjBuilder* info ) {
        if ( str::equals( "usePowerOf2Sizes", option.fieldName() ) ) {
            bool oldPowerOf2 = _details->isUserFlagSet( Flag_UsePowerOf2Sizes );
            bool newPowerOf2 = option.trueValue();

            if ( oldPowerOf2 != newPowerOf2 ) {
                // change userFlags
                info->appendBool( "usePowerOf2Sizes_old", oldPowerOf2 );

                if ( newPowerOf2 )
                    _details->setUserFlag( txn, Flag_UsePowerOf2Sizes );
                else
                    _details->clearUserFlag( txn, Flag_UsePowerOf2Sizes );

                info->appendBool( "usePowerOf2Sizes_new", newPowerOf2 );
            }

            return Status::OK();
        }

        return Status( ErrorCodes::InvalidOptions,
                       str::stream() << "no such option: " << option.fieldName() );
    }
}
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
new file mode 100644
index 00000000000..72466c2b645
--- /dev/null
+++ b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h
@@ -0,0 +1,303 @@
+// record_store_v1_base.h
+
+/**
+* Copyright (C) 2013-2014 MongoDB Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class DeletedRecord; + class DocWriter; + class ExtentManager; + class Record; + class OperationContext; + + struct Extent; + + class RecordStoreV1MetaData { + public: + virtual ~RecordStoreV1MetaData(){} + + virtual const DiskLoc& capExtent() const = 0; + virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ) = 0; + + virtual const DiskLoc& capFirstNewRecord() const = 0; + virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ) = 0; + + bool capLooped() const { return capFirstNewRecord().isValid(); } + + virtual long long dataSize() const = 0; + virtual long long numRecords() const = 0; + + virtual void incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) = 0; + + virtual void setStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) = 0; + + virtual const DiskLoc& deletedListEntry( int bucket ) const = 0; + virtual void setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ) = 0; + virtual void 
orphanDeletedList(OperationContext* txn) = 0; + + virtual const DiskLoc& firstExtent( OperationContext* txn ) const = 0; + virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ) = 0; + + virtual const DiskLoc& lastExtent( OperationContext* txn ) const = 0; + virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ) = 0; + + virtual bool isCapped() const = 0; + + virtual bool isUserFlagSet( int flag ) const = 0; + virtual int userFlags() const = 0; + virtual bool setUserFlag( OperationContext* txn, int flag ) = 0; + virtual bool clearUserFlag( OperationContext* txn, int flag ) = 0; + virtual bool replaceUserFlags( OperationContext* txn, int flags ) = 0; + + virtual int lastExtentSize( OperationContext* txn) const = 0; + virtual void setLastExtentSize( OperationContext* txn, int newMax ) = 0; + + virtual long long maxCappedDocs() const = 0; + + virtual double paddingFactor() const = 0; + + virtual void setPaddingFactor( OperationContext* txn, double paddingFactor ) = 0; + + }; + + class RecordStoreV1Base : public RecordStore { + public: + + static const int Buckets; + static const int MaxBucket; + + static const int bucketSizes[]; + + enum UserFlags { + Flag_UsePowerOf2Sizes = 1 << 0 + }; + + // ------------ + + class IntraExtentIterator; + + /** + * @param details - takes ownership + * @param em - does NOT take ownership + */ + RecordStoreV1Base( const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ); + + virtual ~RecordStoreV1Base(); + + virtual long long dataSize() const { return _details->dataSize(); } + virtual long long numRecords() const { return _details->numRecords(); } + + virtual int64_t storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo = NULL, + int level = 0 ) const; + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + void deleteRecord( OperationContext* txn, + const DiskLoc& dl ); + + StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const 
char* data, + int len, + bool enforceQuota ); + + StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ); + + virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ); + + virtual Status updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ); + + virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const; + + void increaseStorageSize( OperationContext* txn, int size, bool enforceQuota ); + + virtual Status validate( OperationContext* txn, + bool full, bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const; + + virtual void appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const; + + virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const; + + const RecordStoreV1MetaData* details() const { return _details.get(); } + + /** + * @return the actual size to create + * will be >= oldRecordSize + * based on padding and any other flags + */ + int getRecordAllocationSize( int minRecordSize ) const; + + DiskLoc getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc getNextRecord( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + + /* @return the size for an allocated record quantized to 1/16th of the BucketSize. + @param allocSize requested size to allocate + The returned size will be greater than or equal to 'allocSize'. 
+ */ + static int quantizeAllocationSpace(int allocSize); + + /** + * Quantize 'allocSize' to the nearest bucketSize (or nearest 1mb boundary for large sizes). + */ + static int quantizePowerOf2AllocationSpace(int allocSize); + + /* return which "deleted bucket" for this size object */ + static int bucket(int size); + + virtual Status setCustomOption( OperationContext* txn, + const BSONElement& option, + BSONObjBuilder* info = NULL ); + protected: + + virtual Record* recordFor( const DiskLoc& loc ) const; + + const DeletedRecord* deletedRecordFor( const DiskLoc& loc ) const; + + virtual bool isCapped() const = 0; + + virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ) = 0; + + // TODO: document, remove, what have you + virtual void addDeletedRec( OperationContext* txn, const DiskLoc& dloc) = 0; + + // TODO: another sad one + virtual DeletedRecord* drec( const DiskLoc& loc ) const; + + // just a wrapper for _extentManager->getExtent( loc ); + Extent* _getExtent( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc _getExtentLocForRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc _getNextRecord( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc _getPrevRecord( OperationContext* txn, const DiskLoc& loc ) const; + + DiskLoc _getNextRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + DiskLoc _getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const; + + /** + * finds the first suitable DiskLoc for data + * will return the DiskLoc of a newly created DeletedRecord + */ + DiskLoc _findFirstSpot( OperationContext* txn, const DiskLoc& extDiskLoc, Extent* e ); + + /** add a record to the end of the linked list chain within this extent. + require: you must have already declared write intent for the record header. 
+ */ + void _addRecordToRecListInExtent(OperationContext* txn, Record* r, DiskLoc loc); + + void _paddingTooSmall( OperationContext* txn ); + void _paddingFits( OperationContext* txn ); + + /** + * internal + * doesn't check inputs or change padding + */ + StatusWith<DiskLoc> _insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ); + + scoped_ptr<RecordStoreV1MetaData> _details; + ExtentManager* _extentManager; + bool _isSystemIndexes; + + friend class RecordStoreV1RepairIterator; + }; + + /** + * Iterates over all records within a single extent. + * + * EOF at end of extent, even if there are more extents. + */ + class RecordStoreV1Base::IntraExtentIterator : public RecordIterator { + public: + IntraExtentIterator(OperationContext* txn, + DiskLoc start, + const RecordStoreV1Base* rs, + bool forward = true) + : _txn(txn), _curr(start), _rs(rs), _forward(forward) {} + + virtual bool isEOF() { return _curr.isNull(); } + + virtual DiskLoc curr() { return _curr; } + + virtual DiskLoc getNext( ); + + virtual void invalidate(const DiskLoc& dl); + + virtual void prepareToYield() {} + + virtual bool recoverFromYield() { return true; } + + virtual RecordData dataFor( const DiskLoc& loc ) const { return _rs->dataFor(loc); } + + private: + virtual const Record* recordFor( const DiskLoc& loc ) const { return _rs->recordFor(loc); } + OperationContext* _txn; + DiskLoc _curr; + const RecordStoreV1Base* _rs; + bool _forward; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp new file mode 100644 index 00000000000..c8524c76e22 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp @@ -0,0 +1,717 @@ +// record_store_v1_capped.cpp + +/** + * Copyright (C) 2013 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" + +#include "mongo/db/operation_context_impl.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" +#include "mongo/util/mmap.h" +#include "mongo/util/mongoutils/str.h" + +/* + capped collection layout + + d's below won't exist if things align perfectly: + + extent1 -> extent2 -> extent3 + ------------------- ----------------------- --------------------- + d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d + ^ ^ + oldest newest + + ^cappedFirstDeletedInCurExtent() + ^cappedLastDelRecLastExtent() + ^cappedListOfAllDeletedRecords() +*/ + +#define DDD(x) + +namespace mongo { + + CappedRecordStoreV1::CappedRecordStoreV1( OperationContext* txn, + CappedDocumentDeleteCallback* collection, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ) + : RecordStoreV1Base( ns, details, em, isSystemIndexes ), + _deleteCallback( collection ) { + + DiskLoc extentLoc = details->firstExtent(txn); + while ( !extentLoc.isNull() ) { + _extentAdvice.push_back( _extentManager->cacheHint( extentLoc, + ExtentManager::Sequential ) ); + Extent* extent = em->getExtent( extentLoc ); + extentLoc = extent->xnext; + } + + // this is for VERY VERY old versions of capped collections + cappedCheckMigrate(txn); + } + + CappedRecordStoreV1::~CappedRecordStoreV1() { + } + + StatusWith<DiskLoc> CappedRecordStoreV1::allocRecord( OperationContext* txn, + int lenToAlloc, + bool enforceQuota ) { + { + // align very slightly. 
            lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; // round up to a 4-byte boundary
        }

        if ( lenToAlloc > theCapExtent()->length ) {
            // the extent check is a way to try and improve performance
            // since we have to iterate all the extents (for now) to get
            // storage size
            if ( lenToAlloc > storageSize(txn) ) {
                return StatusWith<DiskLoc>( ErrorCodes::BadValue,
                                            mongoutils::str::stream()
                                            << "document is larger than capped size "
                                            << lenToAlloc << " > " << storageSize(txn),
                                            16328 );
            }

        }
        DiskLoc loc;
        { // do allocation

            // signal done allocating new extents.
            if ( !cappedLastDelRecLastExtent().isValid() )
                setLastDelRecLastExtent( txn, DiskLoc() );

            invariant( lenToAlloc < 400000000 );
            int passes = 0;
            // Bound the deletion loop below so a bug cannot spin forever.
            int maxPasses = ( lenToAlloc / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
            if ( maxPasses < 5000 ) {
                // this is for backwards safety since 5000 was the old value
                maxPasses = 5000;
            }

            // delete records until we have room and the max # objects limit achieved.

            /* this fails on a rename -- that is ok but must keep commented out */
            //invariant( theCapExtent()->ns == ns );

            theCapExtent()->assertOk();
            DiskLoc firstEmptyExtent;
            while ( 1 ) {
                // Only try to allocate while under the document-count cap.
                if ( _details->numRecords() < _details->maxCappedDocs() ) {
                    loc = __capAlloc( txn, lenToAlloc );
                    if ( !loc.isNull() )
                        break;
                }

                // If on first iteration through extents, don't delete anything.
                if ( !_details->capFirstNewRecord().isValid() ) {
                    advanceCapExtent( txn, _ns );

                    if ( _details->capExtent() != _details->firstExtent(txn) )
                        _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
                    // else signal done with first iteration through extents.
                    continue;
                }

                if ( !_details->capFirstNewRecord().isNull() &&
                     theCapExtent()->firstRecord == _details->capFirstNewRecord() ) {
                    // We've deleted all records that were allocated on the previous
                    // iteration through this extent.
+ advanceCapExtent( txn, _ns ); + continue; + } + + if ( theCapExtent()->firstRecord.isNull() ) { + if ( firstEmptyExtent.isNull() ) + firstEmptyExtent = _details->capExtent(); + advanceCapExtent( txn, _ns ); + if ( firstEmptyExtent == _details->capExtent() ) { + _maybeComplain( txn, lenToAlloc ); + return StatusWith<DiskLoc>( ErrorCodes::InternalError, + "no space in capped collection" ); + } + continue; + } + + DiskLoc fr = theCapExtent()->firstRecord; + Status status = _deleteCallback->aboutToDeleteCapped( txn, fr ); + if ( !status.isOK() ) + return StatusWith<DiskLoc>( status ); + deleteRecord( txn, fr ); + + compact(txn); + if( ++passes > maxPasses ) { + StringBuilder sb; + sb << "passes >= maxPasses in CappedRecordStoreV1::cappedAlloc: ns: " << _ns + << ", lenToAlloc: " << lenToAlloc + << ", maxPasses: " << maxPasses + << ", _maxDocsInCapped: " << _details->maxCappedDocs() + << ", nrecords: " << _details->numRecords() + << ", datasize: " << _details->dataSize(); + + return StatusWith<DiskLoc>( ErrorCodes::InternalError, sb.str() ); + } + } + + // Remember first record allocated on this iteration through capExtent. + if ( _details->capFirstNewRecord().isValid() && _details->capFirstNewRecord().isNull() ) + _details->setCapFirstNewRecord( txn, loc ); + } + + invariant( !loc.isNull() ); + + // possibly slice up if we've allocated too much space + + DeletedRecord *r = drec( loc ); + + /* note we want to grab from the front so our next pointers on disk tend + to go in a forward direction which is important for performance. */ + int regionlen = r->lengthWithHeaders(); + invariant( r->extentOfs() < loc.getOfs() ); + + int left = regionlen - lenToAlloc; + + /* split off some for further use. 
*/ + txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; + DiskLoc newDelLoc = loc; + newDelLoc.inc(lenToAlloc); + DeletedRecord* newDel = drec( newDelLoc ); + DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel); + newDelW->extentOfs() = r->extentOfs(); + newDelW->lengthWithHeaders() = left; + newDelW->nextDeleted().Null(); + + addDeletedRec(txn, newDelLoc); + + return StatusWith<DiskLoc>( loc ); + } + + Status CappedRecordStoreV1::truncate(OperationContext* txn) { + setLastDelRecLastExtent( txn, DiskLoc() ); + setListOfAllDeletedRecords( txn, DiskLoc() ); + + // preserve firstExtent/lastExtent + _details->setCapExtent( txn, _details->firstExtent(txn) ); + _details->setStats( txn, 0, 0 ); + // preserve lastExtentSize + // nIndexes preserve 0 + // capped preserve true + // max preserve + _details->setPaddingFactor( txn, 1.0 ); + _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() ); + setLastDelRecLastExtent( txn, DiskLoc().setInvalid() ); + // dataFileVersion preserve + // indexFileVersion preserve + + // Reset all existing extents and recreate the deleted list. + Extent* ext; + for( DiskLoc extLoc = _details->firstExtent(txn); + !extLoc.isNull(); + extLoc = ext->xnext ) { + ext = _extentManager->getExtent(extLoc); + + txn->recoveryUnit()->writing( &ext->firstRecord )->Null(); + txn->recoveryUnit()->writing( &ext->lastRecord )->Null(); + + addDeletedRec( txn, _findFirstSpot( txn, extLoc, ext ) ); + } + + return Status::OK(); + } + + void CappedRecordStoreV1::temp_cappedTruncateAfter( OperationContext* txn, + DiskLoc end, + bool inclusive ) { + cappedTruncateAfter( txn, _ns.c_str(), end, inclusive ); + } + + /* combine adjacent deleted records *for the current extent* of the capped collection + + this is O(n^2) but we call it for capped tables where typically n==1 or 2! + (or 3...there will be a little unused sliver at the end of the extent.) 
     */
    // Coalesce adjacent deleted records *in the current cap extent only*. Quadratic,
    // but for capped collections the per-extent deleted-record count is tiny (1-3).
    void CappedRecordStoreV1::compact(OperationContext* txn) {
        DDD( "CappedRecordStoreV1::compact enter" );

        vector<DiskLoc> drecs;

        // Pull out capExtent's DRs from deletedList
        DiskLoc i = cappedFirstDeletedInCurExtent();
        for (; !i.isNull() && inCapExtent( i ); i = deletedRecordFor( i )->nextDeleted() ) {
            DDD( "\t" << i );
            drecs.push_back( i );
        }

        // 'i' is now the first deleted record outside the cap extent (or null);
        // relink the list head past the records we pulled out.
        setFirstDeletedInCurExtent( txn, i );

        // Sort by disk location so physically adjacent records become neighbors.
        std::sort( drecs.begin(), drecs.end() );
        DDD( "\t drecs.size(): " << drecs.size() );

        vector<DiskLoc>::const_iterator j = drecs.begin();
        invariant( j != drecs.end() );
        DiskLoc a = *j;
        while ( 1 ) {
            j++;
            if ( j == drecs.end() ) {
                DDD( "\t compact adddelrec" );
                addDeletedRec(txn, a);
                break;
            }
            DiskLoc b = *j;
            // Merge every run of physically contiguous records (same file, and b
            // starts exactly where a ends) into a single larger deleted record.
            while ( a.a() == b.a() &&
                    a.getOfs() + drec( a )->lengthWithHeaders() == b.getOfs() ) {

                // a & b are adjacent. merge.
                txn->recoveryUnit()->writingInt( drec(a)->lengthWithHeaders() ) += drec(b)->lengthWithHeaders();
                j++;
                if ( j == drecs.end() ) {
                    DDD( "\t compact adddelrec2" );
                    addDeletedRec(txn, a);
                    return;
                }
                b = *j;
            }
            DDD( "\t compact adddelrec3" );
            addDeletedRec(txn, a);
            a = b;
        }

    }

    // First deleted record belonging to the current cap extent: the head of the
    // all-deleted-records list, or the record just past the last-extent marker.
    const DiskLoc &CappedRecordStoreV1::cappedFirstDeletedInCurExtent() const {
        if ( cappedLastDelRecLastExtent().isNull() )
            return cappedListOfAllDeletedRecords();
        else
            return drec(cappedLastDelRecLastExtent())->nextDeleted();
    }

    // Counterpart setter: store 'loc' either as the global list head or as the
    // successor of the last-extent marker, mirroring the getter above.
    void CappedRecordStoreV1::setFirstDeletedInCurExtent( OperationContext* txn,
                                                          const DiskLoc& loc ) {
        if ( cappedLastDelRecLastExtent().isNull() )
            setListOfAllDeletedRecords( txn, loc );
        else
            *txn->recoveryUnit()->writing( &drec(cappedLastDelRecLastExtent())->nextDeleted() ) = loc;
    }

    // One-time migration for VERY old capped-collection metadata layouts (detected
    // by an all-zero capExtent).
    void CappedRecordStoreV1::cappedCheckMigrate(OperationContext* txn) {
        // migrate old RecordStoreV1MetaData format
        if ( _details->capExtent().a() == 0 && _details->capExtent().getOfs() == 0 ) {
            _details->setCapFirstNewRecord( txn, DiskLoc().setInvalid() );
            // put all the
DeletedRecords in cappedListOfAllDeletedRecords() + for ( int i = 1; i < Buckets; ++i ) { + DiskLoc first = _details->deletedListEntry( i ); + if ( first.isNull() ) + continue; + DiskLoc last = first; + for (; !drec(last)->nextDeleted().isNull(); last = drec(last)->nextDeleted() ); + *txn->recoveryUnit()->writing(&drec(last)->nextDeleted()) = cappedListOfAllDeletedRecords(); + setListOfAllDeletedRecords( txn, first ); + _details->setDeletedListEntry(txn, i, DiskLoc()); + } + // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above + + // Last, in case we're killed before getting here + _details->setCapExtent( txn, _details->firstExtent(txn) ); + } + } + + bool CappedRecordStoreV1::inCapExtent( const DiskLoc &dl ) const { + invariant( !dl.isNull() ); + + if ( dl.a() != _details->capExtent().a() ) + return false; + + if ( dl.getOfs() < _details->capExtent().getOfs() ) + return false; + + const Extent* e = theCapExtent(); + int end = _details->capExtent().getOfs() + e->length; + return dl.getOfs() <= end; + } + + bool CappedRecordStoreV1::nextIsInCapExtent( const DiskLoc &dl ) const { + invariant( !dl.isNull() ); + DiskLoc next = drec(dl)->nextDeleted(); + if ( next.isNull() ) + return false; + return inCapExtent( next ); + } + + void CappedRecordStoreV1::advanceCapExtent( OperationContext* txn, const StringData& ns ) { + // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent + // (or DiskLoc() if new capExtent == firstExtent) + if ( _details->capExtent() == _details->lastExtent(txn) ) + setLastDelRecLastExtent( txn, DiskLoc() ); + else { + DiskLoc i = cappedFirstDeletedInCurExtent(); + for (; !i.isNull() && nextIsInCapExtent( i ); i = drec(i)->nextDeleted() ); + setLastDelRecLastExtent( txn, i ); + } + + _details->setCapExtent( txn, + theCapExtent()->xnext.isNull() ? 
_details->firstExtent(txn) + : theCapExtent()->xnext ); + + /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */ + //dassert( theCapExtent()->ns == ns ); + + theCapExtent()->assertOk(); + _details->setCapFirstNewRecord( txn, DiskLoc() ); + } + + DiskLoc CappedRecordStoreV1::__capAlloc( OperationContext* txn, int len ) { + DiskLoc prev = cappedLastDelRecLastExtent(); + DiskLoc i = cappedFirstDeletedInCurExtent(); + DiskLoc ret; + for (; !i.isNull() && inCapExtent( i ); prev = i, i = drec(i)->nextDeleted() ) { + // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(), + // so make sure there's space to create a DR at the end. + if ( drec(i)->lengthWithHeaders() >= len + 24 ) { + ret = i; + break; + } + } + + /* unlink ourself from the deleted list */ + if ( !ret.isNull() ) { + if ( prev.isNull() ) + setListOfAllDeletedRecords( txn, drec(ret)->nextDeleted() ); + else + *txn->recoveryUnit()->writing(&drec(prev)->nextDeleted()) = drec(ret)->nextDeleted(); + *txn->recoveryUnit()->writing(&drec(ret)->nextDeleted()) = DiskLoc().setInvalid(); // defensive. + invariant( drec(ret)->extentOfs() < ret.getOfs() ); + } + + return ret; + } + + void CappedRecordStoreV1::cappedTruncateLastDelUpdate(OperationContext* txn) { + if ( _details->capExtent() == _details->firstExtent(txn) ) { + // Only one extent of the collection is in use, so there + // is no deleted record in a previous extent, so nullify + // cappedLastDelRecLastExtent(). + setLastDelRecLastExtent( txn, DiskLoc() ); + } + else { + // Scan through all deleted records in the collection + // until the last deleted record for the extent prior + // to the new capExtent is found. Then set + // cappedLastDelRecLastExtent() to that deleted record. 
+ DiskLoc i = cappedListOfAllDeletedRecords(); + for( ; + !drec(i)->nextDeleted().isNull() && + !inCapExtent( drec(i)->nextDeleted() ); + i = drec(i)->nextDeleted() ); + // In our capped storage model, every extent must have at least one + // deleted record. Here we check that 'i' is not the last deleted + // record. (We expect that there will be deleted records in the new + // capExtent as well.) + invariant( !drec(i)->nextDeleted().isNull() ); + setLastDelRecLastExtent( txn, i ); + } + } + + void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* txn, + const char* ns, + DiskLoc end, + bool inclusive) { + invariant( cappedLastDelRecLastExtent().isValid() ); + + // We iteratively remove the newest document until the newest document + // is 'end', then we remove 'end' if requested. + bool foundLast = false; + while( 1 ) { + if ( foundLast ) { + // 'end' has been found and removed, so break. + break; + } + txn->recoveryUnit()->commitIfNeeded(); + // 'curr' will point to the newest document in the collection. + DiskLoc curr = theCapExtent()->lastRecord; + invariant( !curr.isNull() ); + if ( curr == end ) { + if ( inclusive ) { + // 'end' has been found, so break next iteration. + foundLast = true; + } + else { + // 'end' has been found, so break. + break; + } + } + + // TODO The algorithm used in this function cannot generate an + // empty collection, but we could call emptyCappedCollection() in + // this case instead of asserting. + uassert( 13415, "emptying the collection is not allowed", _details->numRecords() > 1 ); + + // Delete the newest record, and coalesce the new deleted + // record with existing deleted records. 
+ Status status = _deleteCallback->aboutToDeleteCapped( txn, curr ); + uassertStatusOK( status ); + deleteRecord( txn, curr ); + compact(txn); + + // This is the case where we have not yet had to remove any + // documents to make room for other documents, and we are allocating + // documents from free space in fresh extents instead of reusing + // space from familiar extents. + if ( !_details->capLooped() ) { + + // We just removed the last record from the 'capExtent', and + // the 'capExtent' can't be empty, so we set 'capExtent' to + // capExtent's prev extent. + if ( theCapExtent()->lastRecord.isNull() ) { + invariant( !theCapExtent()->xprev.isNull() ); + // NOTE Because we didn't delete the last document, and + // capLooped() is false, capExtent is not the first extent + // so xprev will be nonnull. + _details->setCapExtent( txn, theCapExtent()->xprev ); + theCapExtent()->assertOk(); + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(txn); + } + continue; + } + + // This is the case where capLooped() is true, and we just deleted + // from capExtent, and we just deleted capFirstNewRecord, which was + // the last record on the fresh side of capExtent. + // NOTE In this comparison, curr and potentially capFirstNewRecord + // may point to invalid data, but we can still compare the + // references themselves. + if ( curr == _details->capFirstNewRecord() ) { + + // Set 'capExtent' to the first nonempty extent prior to the + // initial capExtent. There must be such an extent because we + // have not deleted the last document in the collection. It is + // possible that all extents other than the capExtent are empty. + // In this case we will keep the initial capExtent and specify + // that all records contained within are on the fresh rather than + // stale side of the extent. + DiskLoc newCapExtent = _details->capExtent(); + do { + // Find the previous extent, looping if necessary. + newCapExtent = ( newCapExtent == _details->firstExtent(txn) ) ? 
+ _details->lastExtent(txn) : + _extentManager->getExtent(newCapExtent)->xprev; + _extentManager->getExtent(newCapExtent)->assertOk(); + } + while ( _extentManager->getExtent(newCapExtent)->firstRecord.isNull() ); + _details->setCapExtent( txn, newCapExtent ); + + // Place all documents in the new capExtent on the fresh side + // of the capExtent by setting capFirstNewRecord to the first + // document in the new capExtent. + _details->setCapFirstNewRecord( txn, theCapExtent()->firstRecord ); + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(txn); + } + } + } + + const DiskLoc& CappedRecordStoreV1::cappedListOfAllDeletedRecords() const { + return _details->deletedListEntry(0); + } + + void CappedRecordStoreV1::setListOfAllDeletedRecords( OperationContext* txn, + const DiskLoc& loc ) { + return _details->setDeletedListEntry(txn, 0, loc); + } + + const DiskLoc& CappedRecordStoreV1::cappedLastDelRecLastExtent() const { + return _details->deletedListEntry(1); + } + + void CappedRecordStoreV1::setLastDelRecLastExtent( OperationContext* txn, + const DiskLoc& loc ) { + return _details->setDeletedListEntry(txn, 1, loc); + } + + Extent* CappedRecordStoreV1::theCapExtent() const { + return _extentManager->getExtent(_details->capExtent()); + } + + void CappedRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) { + DeletedRecord* d = txn->recoveryUnit()->writing( drec( dloc ) ); + + DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl; + if ( !cappedLastDelRecLastExtent().isValid() ) { + // Initial extent allocation. Insert at end. 
+ d->nextDeleted() = DiskLoc(); + if ( cappedListOfAllDeletedRecords().isNull() ) + setListOfAllDeletedRecords( txn, dloc ); + else { + DiskLoc i = cappedListOfAllDeletedRecords(); + for (; !drec(i)->nextDeleted().isNull(); i = drec(i)->nextDeleted() ) + ; + *txn->recoveryUnit()->writing(&drec(i)->nextDeleted()) = dloc; + } + } + else { + d->nextDeleted() = cappedFirstDeletedInCurExtent(); + setFirstDeletedInCurExtent( txn, dloc ); + // always compact() after this so order doesn't matter + } + } + + RecordIterator* CappedRecordStoreV1::getIterator( OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const { + return new CappedRecordStoreV1Iterator( txn, this, start, tailable, dir ); + } + + vector<RecordIterator*> CappedRecordStoreV1::getManyIterators( OperationContext* txn ) const { + OwnedPointerVector<RecordIterator> iterators; + + if (!_details->capLooped()) { + // if we haven't looped yet, just spit out all extents (same as non-capped impl) + const Extent* ext; + for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) { + ext = _getExtent(txn, extLoc); + if (ext->firstRecord.isNull()) + continue; + + iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn, + ext->firstRecord, + this)); + } + } + else { + // if we've looped we need to iterate the extents, starting and ending with the + // capExtent + const DiskLoc capExtent = details()->capExtent(); + invariant(!capExtent.isNull()); + invariant(capExtent.isValid()); + + // First do the "old" portion of capExtent if there is any + DiskLoc extLoc = capExtent; + { + const Extent* ext = _getExtent(txn, extLoc); + if (ext->firstRecord != details()->capFirstNewRecord()) { + // this means there is old data in capExtent + iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn, + ext->firstRecord, + this)); + } + + extLoc = ext->xnext.isNull() ? 
details()->firstExtent(txn) : ext->xnext; + } + + // Next handle all the other extents + while (extLoc != capExtent) { + const Extent* ext = _getExtent(txn, extLoc); + iterators.push_back(new RecordStoreV1Base::IntraExtentIterator(txn, + ext->firstRecord, + this)); + + extLoc = ext->xnext.isNull() ? details()->firstExtent(txn) : ext->xnext; + } + + // Finally handle the "new" data in the capExtent + iterators.push_back( + new RecordStoreV1Base::IntraExtentIterator(txn, + details()->capFirstNewRecord(), + this)); + } + + return iterators.release(); + } + + Status CappedRecordStoreV1::compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ) { + invariant(false); + } + + void CappedRecordStoreV1::_maybeComplain( OperationContext* txn, int len ) const { + RARELY { + std::stringstream buf; + buf << "couldn't make room for record len: " << len << " in capped ns " << _ns << '\n'; + buf << "numRecords: " << numRecords() << '\n'; + int i = 0; + for ( DiskLoc e = _details->firstExtent(txn); + !e.isNull(); + e = _extentManager->getExtent( e )->xnext, ++i ) { + buf << " Extent " << i; + if ( e == _details->capExtent() ) + buf << " (capExtent)"; + buf << ' ' << e; + buf << '\n'; + + buf << " magic: " << hex << _extentManager->getExtent( e )->magic << dec + << " extent->ns: " << _extentManager->getExtent( e )->nsDiagnostic.toString() + << '\n'; + buf << " fr: " << _extentManager->getExtent( e )->firstRecord.toString() + << " lr: " << _extentManager->getExtent( e )->lastRecord.toString() + << " extent->len: " << _extentManager->getExtent( e )->length << '\n'; + } + + warning() << buf.str(); + + // assume it is unusually large record; if not, something is broken + fassert( 17438, len * 5 > _details->lastExtentSize(txn) ); + } + } + + DiskLoc CappedRecordStoreV1::firstRecord( OperationContext* txn, + const DiskLoc &startExtent ) const { + for (DiskLoc i = startExtent.isNull() ? 
_details->firstExtent(txn) : startExtent; + !i.isNull(); + i = _extentManager->getExtent( i )->xnext ) { + + Extent* e = _extentManager->getExtent( i ); + + if ( !e->firstRecord.isNull() ) + return e->firstRecord; + } + return DiskLoc(); + } + + DiskLoc CappedRecordStoreV1::lastRecord( OperationContext* txn, + const DiskLoc &startExtent ) const { + for (DiskLoc i = startExtent.isNull() ? _details->lastExtent(txn) : startExtent; + !i.isNull(); + i = _extentManager->getExtent( i )->xprev ) { + + Extent* e = _extentManager->getExtent( i ); + if ( !e->lastRecord.isNull() ) + return e->lastRecord; + } + return DiskLoc(); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h new file mode 100644 index 00000000000..4422b5d451b --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h @@ -0,0 +1,139 @@ +// record_store_v1_capped.h + +/** +* Copyright (C) 2013 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. 
You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/base/owned_pointer_vector.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/capped_callback.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class CappedRecordStoreV1 : public RecordStoreV1Base { + public: + CappedRecordStoreV1( OperationContext* txn, + CappedDocumentDeleteCallback* collection, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ); + + virtual ~CappedRecordStoreV1(); + + const char* name() const { return "CappedRecordStoreV1"; } + + virtual Status truncate(OperationContext* txn); + + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. 
+ * @param inclusive - Truncate 'end' as well iff true + * XXX: this will go away soon, just needed to move for now + */ + virtual void temp_cappedTruncateAfter( OperationContext* txn, DiskLoc end, bool inclusive ); + + virtual RecordIterator* getIterator( OperationContext* txn, + const DiskLoc& start, bool tailable, + const CollectionScanParams::Direction& dir) const; + + virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const; + + virtual bool compactSupported() const { return false; } + + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ); + + // Start from firstExtent by default. + DiskLoc firstRecord( OperationContext* txn, + const DiskLoc &startExtent = DiskLoc() ) const; + // Start from lastExtent by default. + DiskLoc lastRecord( OperationContext* txn, + const DiskLoc &startExtent = DiskLoc() ) const; + + protected: + + virtual bool isCapped() const { return true; } + + virtual void setCappedDeleteCallback( CappedDocumentDeleteCallback* cb ) { + _deleteCallback = cb; + } + + virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ); + + virtual void addDeletedRec(OperationContext* txn, const DiskLoc& dloc); + + private: + // -- start copy from cap.cpp -- + void compact(OperationContext* txn); + const DiskLoc& cappedFirstDeletedInCurExtent() const; + void setFirstDeletedInCurExtent( OperationContext* txn, const DiskLoc& loc ); + void cappedCheckMigrate(OperationContext* txn); + DiskLoc __capAlloc( OperationContext* txn, int len ); + bool inCapExtent( const DiskLoc &dl ) const; + const DiskLoc& cappedListOfAllDeletedRecords() const; + const DiskLoc& cappedLastDelRecLastExtent() const; + void setListOfAllDeletedRecords( OperationContext* txn, const DiskLoc& loc ); + void setLastDelRecLastExtent( OperationContext* txn, const DiskLoc& loc ); + Extent *theCapExtent() const; + bool 
nextIsInCapExtent( const DiskLoc &dl ) const; + void advanceCapExtent( OperationContext* txn, const StringData& ns ); + void cappedTruncateLastDelUpdate(OperationContext* txn); + + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. + * @param inclusive - Truncate 'end' as well iff true + */ + void cappedTruncateAfter(OperationContext* txn, + const char* ns, + DiskLoc end, + bool inclusive); + + void _maybeComplain( OperationContext* txn, int len ) const; + + // -- end copy from cap.cpp -- + + CappedDocumentDeleteCallback* _deleteCallback; + + OwnedPointerVector<ExtentManager::CacheHint> _extentAdvice; + + friend class CappedRecordStoreV1Iterator; + }; + + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp new file mode 100644 index 00000000000..11f7894fe77 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp @@ -0,0 +1,237 @@ +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" + +namespace mongo { + + + // + // Capped collection traversal + // + CappedRecordStoreV1Iterator::CappedRecordStoreV1Iterator( OperationContext* txn, + const CappedRecordStoreV1* collection, + const DiskLoc& start, bool tailable, + const CollectionScanParams::Direction& dir) + : _txn(txn), _recordStore(collection), _curr(start), _tailable(tailable), + _direction(dir), _killedByInvalidate(false) { + + if (_curr.isNull()) { + + const RecordStoreV1MetaData* nsd = _recordStore->details(); + + // If a start position isn't specified, we fill one out from the start of the + // collection. + if (CollectionScanParams::FORWARD == _direction) { + // Going forwards. + if (!nsd->capLooped()) { + // If our capped collection doesn't loop around, the first record is easy. + _curr = collection->firstRecord(_txn); + } + else { + // Our capped collection has "looped' around. 
+ // Copied verbatim from ForwardCappedCursor::init. + // TODO ELABORATE + _curr = _getExtent( nsd->capExtent() )->firstRecord; + if (!_curr.isNull() && _curr == nsd->capFirstNewRecord()) { + _curr = _getExtent( nsd->capExtent() )->lastRecord; + _curr = nextLoop(_curr); + } + } + } + else { + // Going backwards + if (!nsd->capLooped()) { + // Start at the end. + _curr = collection->lastRecord(_txn); + } + else { + _curr = _getExtent( nsd->capExtent() )->lastRecord; + } + } + } + } + + bool CappedRecordStoreV1Iterator::isEOF() { return _curr.isNull(); } + + DiskLoc CappedRecordStoreV1Iterator::curr() { return _curr; } + + DiskLoc CappedRecordStoreV1Iterator::getNext() { + DiskLoc ret = _curr; + + // Move to the next thing. + if (!isEOF()) { + _prev = _curr; + _curr = getNextCapped(_curr); + } + else if (_tailable && !_prev.isNull()) { + // If we're tailable, there COULD have been something inserted even though we were + // previously EOF. Look at the next thing from 'prev' and see. + DiskLoc newCurr = getNextCapped(_prev); + + if (!newCurr.isNull()) { + // There's something new to return. _curr always points to the next thing to + // return. Update it, and move _prev to the thing we just returned. + _prev = ret = newCurr; + _curr = getNextCapped(_prev); + } + } + + return ret; + } + + void CappedRecordStoreV1Iterator::invalidate(const DiskLoc& dl) { + if ((_tailable && _curr.isNull() && dl == _prev) || (dl == _curr)) { + // In the _tailable case, we're about to kill the DiskLoc that we're tailing. Nothing + // that we can possibly do to survive that. + // + // In the _curr case, we *could* move to the next thing, since there is actually a next + // thing, but according to clientcursor.cpp: + // "note we cannot advance here. if this condition occurs, writes to the oplog + // have "caught" the reader. skipping ahead, the reader would miss postentially + // important data." 
+ _curr = _prev = DiskLoc(); + _killedByInvalidate = true; + } + } + + void CappedRecordStoreV1Iterator::prepareToYield() { + } + + bool CappedRecordStoreV1Iterator::recoverFromYield() { + // If invalidate invalidated the DiskLoc we relied on, give up now. + if (_killedByInvalidate) { + _recordStore = NULL; + return false; + } + + return true; + } + + DiskLoc CappedRecordStoreV1Iterator::getNextCapped(const DiskLoc& dl) { + invariant(!dl.isNull()); + const RecordStoreV1MetaData* details = _recordStore->details(); + + if (CollectionScanParams::FORWARD == _direction) { + // If it's not looped, it's easy. + if (!_recordStore->details()->capLooped()) { + return _getNextRecord( dl ); + } + + // TODO ELABORATE + // EOF. + if (dl == _getExtent( details->capExtent() )->lastRecord) { + return DiskLoc(); + } + + DiskLoc ret = nextLoop(dl); + + // If we become capFirstNewRecord from same extent, advance to next extent. + if (ret == details->capFirstNewRecord() && ret != _getExtent( details->capExtent() )->firstRecord) { + ret = nextLoop(_getExtent( details->capExtent() )->lastRecord); + } + + // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord + if (ret == _getExtent( details->capExtent() )->firstRecord) { ret = details->capFirstNewRecord(); } + + return ret; + } + else { + if (!details->capLooped()) { return _getPrevRecord( dl ); } + + // TODO ELABORATE + // Last record + if (details->capFirstNewRecord() == _getExtent( details->capExtent() )->firstRecord) { + if (dl == nextLoop(_getExtent( details->capExtent() )->lastRecord)) { + return DiskLoc(); + } + } + else { + if (dl == _getExtent( details->capExtent() )->firstRecord) { return DiskLoc(); } + } + + DiskLoc ret; + // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev. 
+ if (dl == details->capFirstNewRecord()) { + ret = prevLoop(_getExtent( details->capExtent() )->firstRecord); + } + else { + ret = prevLoop(dl); + } + + // If we just became last in cap extent, advance past capFirstNewRecord + // (We know ext(capExtent)->firstRecord != capFirstNewRecord, since would + // have returned DiskLoc() earlier otherwise.) + if (ret == _getExtent( details->capExtent() )->lastRecord) { + ret = _getPrevRecord( details->capFirstNewRecord() ); + } + + return ret; + } + } + + DiskLoc CappedRecordStoreV1Iterator::nextLoop(const DiskLoc& prev) { + // TODO ELABORATE + DiskLoc next = _getNextRecord( prev ); + if (!next.isNull()) { + return next; + } + return _recordStore->firstRecord(_txn); + } + + DiskLoc CappedRecordStoreV1Iterator::prevLoop(const DiskLoc& curr) { + // TODO ELABORATE + DiskLoc prev = _getPrevRecord( curr ); + if (!prev.isNull()) { + return prev; + } + return _recordStore->lastRecord(_txn); + } + + RecordData CappedRecordStoreV1Iterator::dataFor( const DiskLoc& loc ) const { + return _recordStore->dataFor( loc ); + } + + Extent* CappedRecordStoreV1Iterator::_getExtent( const DiskLoc& loc ) { + return _recordStore->_extentManager->getExtent( loc ); + } + + DiskLoc CappedRecordStoreV1Iterator::_getNextRecord( const DiskLoc& loc ) { + return _recordStore->getNextRecord( _txn, loc ); + } + + DiskLoc CappedRecordStoreV1Iterator::_getPrevRecord( const DiskLoc& loc ) { + return _recordStore->getPrevRecord( _txn, loc ); + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h new file mode 100644 index 00000000000..501986d98fa --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h @@ -0,0 +1,100 @@ +/** + * Copyright (C) 2013 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class CappedRecordStoreV1; + + struct Extent; + + /** + * This class iterates over a capped collection identified by 'ns'. + * The collection must exist when the constructor is called. + * + * If start is not DiskLoc(), the iteration begins at that DiskLoc. + * + * If tailable is true, getNext() can be called after isEOF. It will use the last valid + * returned DiskLoc and try to find the next record from that. 
+ */ + class CappedRecordStoreV1Iterator : public RecordIterator { + public: + CappedRecordStoreV1Iterator( OperationContext* txn, + const CappedRecordStoreV1* collection, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir ); + virtual ~CappedRecordStoreV1Iterator() { } + + // If this is a tailable cursor, isEOF could change its mind after a call to getNext(). + virtual bool isEOF(); + virtual DiskLoc getNext(); + virtual DiskLoc curr(); + + virtual void invalidate(const DiskLoc& dl); + virtual void prepareToYield(); + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + private: + /** + * Internal collection navigation helper methods. + */ + DiskLoc getNextCapped(const DiskLoc& dl); + DiskLoc prevLoop(const DiskLoc& curr); + DiskLoc nextLoop(const DiskLoc& prev); + + // some helpers - these move to RecordStore probably + Extent* _getExtent( const DiskLoc& loc ); + DiskLoc _getNextRecord( const DiskLoc& loc ); + DiskLoc _getPrevRecord( const DiskLoc& loc ); + + // transactional context for read locks. Not owned by us + OperationContext* _txn; + + // The collection we're iterating over. + const CappedRecordStoreV1* _recordStore; + + // The result returned on the next call to getNext(). + DiskLoc _curr; + + // If we're tailable, we try to progress from the last valid result when we hit the end. + DiskLoc _prev; + bool _tailable; + + CollectionScanParams::Direction _direction; + + // If invalidate kills the DiskLoc we need to move forward, we kill the iterator. See the + // comment in the body of invalidate(...). 
+ bool _killedByInvalidate; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp new file mode 100644 index 00000000000..6e423b9e073 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp @@ -0,0 +1,558 @@ +// record_store_v1_capped_test.cpp + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" + +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" +#include "mongo/unittest/unittest.h" + +using namespace mongo; + +namespace { + + // Provides data to be inserted. Must be large enough for largest possible record. + // Should be in BSS so unused portions should be free. + char zeros[20*1024*1024] = {}; + + class DummyCappedDocumentDeleteCallback : public CappedDocumentDeleteCallback { + public: + Status aboutToDeleteCapped( OperationContext* txn, const DiskLoc& loc ) { + deleted.push_back( loc ); + return Status::OK(); + } + vector<DiskLoc> deleted; + }; + + void simpleInsertTest( const char* buf, int size ) { + + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + + string myns = "test.simple1"; + CappedRecordStoreV1 rs( &txn, &cb, myns, md, &em, false ); + + rs.increaseStorageSize( &txn, 1024, -1 ); + + ASSERT_NOT_OK( rs.insertRecord( &txn, buf, 3, 1000 ).getStatus() ); + + rs.insertRecord( &txn, buf, size, 10000 ); + + { + BSONObjBuilder b; + int64_t storageSize = rs.storageSize( &txn, &b ); + BSONObj obj = b.obj(); + ASSERT_EQUALS( 1, obj["numExtents"].numberInt() ); + ASSERT_EQUALS( storageSize, em.quantizeExtentSize( 1024 ) ); + } + + for ( int i = 0; i < 1000; i++ ) { + ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() ); + } + + long long start = md->numRecords(); + for ( int i = 0; i < 1000; i++ ) { + ASSERT_OK( rs.insertRecord( &txn, buf, size, 10000 ).getStatus() ); + } + ASSERT_EQUALS( start, md->numRecords() ); + ASSERT_GREATER_THAN( start, 100 ); + ASSERT_LESS_THAN( start, 1000 ); + } + + TEST(CappedRecordStoreV1, SimpleInsertSize4) { + simpleInsertTest("abcd", 4); + } + TEST(CappedRecordStoreV1, SimpleInsertSize8) { + 
simpleInsertTest("abcdefgh", 8); + } + + TEST(CappedRecordStoreV1, EmptySingleExtent) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 900}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped + } + } + + TEST(CappedRecordStoreV1, FirstLoopWithSingleExtentExactSize) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 50}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1200), 100}, // first old record + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, // last old record + {DiskLoc(0, 1000), 100}, // first new record + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 100}, // gap 
after newest record XXX this is probably a bug + {DiskLoc(0, 1500), 50}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + TEST(CappedRecordStoreV1, NonFirstLoopWithSingleExtentExactSize) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 50}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1200), 100}, // first old record + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, // last old record + {DiskLoc(0, 1000), 100}, // first new record + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug + {DiskLoc(0, 1500), 50}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + /** + * Current code always tries to leave 24 bytes to create a DeletedRecord. 
+ */ + TEST(CappedRecordStoreV1, WillLoopWithout24SpareBytes) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 123}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1200), 100}, // first old record + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, // last old record + {DiskLoc(0, 1000), 100}, // first new record + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1100), 100}, // gap after newest record + {DiskLoc(0, 1500), 123}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + TEST(CappedRecordStoreV1, WontLoopWith24SpareBytes) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + LocAndSize records[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1500), 124}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - 
Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1200), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {DiskLoc(0, 1500), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1600), 24}, // gap at end of extent + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + TEST(CappedRecordStoreV1, MoveToSecondExtentUnLooped) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Two extents, each with 1000 bytes. + LocAndSize records[] = { + {DiskLoc(0, 1000), 500}, + {DiskLoc(0, 1500), 300}, + {DiskLoc(0, 1800), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1900), 100}, + {DiskLoc(1, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 500}, + {DiskLoc(0, 1500), 300}, + {DiskLoc(0, 1800), 100}, + + {DiskLoc(1, 1000), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1900), 100}, + {DiskLoc(1, 1100), 900}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped + } + } + + TEST(CappedRecordStoreV1, MoveToSecondExtentLooped) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Two extents, each with 
1000 bytes. + LocAndSize records[] = { + {DiskLoc(0, 1800), 100}, // old + {DiskLoc(0, 1000), 500}, // first new + {DiskLoc(0, 1500), 400}, + + {DiskLoc(1, 1000), 300}, + {DiskLoc(1, 1300), 600}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1900), 100}, + {DiskLoc(1, 1900), 100}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc(0, 1000)); + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 500}, + {DiskLoc(0, 1500), 400}, + + {DiskLoc(1, 1300), 600}, // old + {DiskLoc(1, 1000), 200}, // first new + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1800), 200}, + {DiskLoc(1, 1200), 100}, + {DiskLoc(1, 1900), 100}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(1, 1000)); + } + } + + // + // XXX The CappedRecordStoreV1Scrambler suite of tests describe existing behavior that is less + // than ideal. Any improved implementation will need to be able to handle a collection that has + // been scrambled like this. + // + + /** + * This is a minimal example that shows the current allocator laying out records out-of-order. + */ + TEST(CappedRecordStoreV1Scrambler, Minimal) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Starting with a single empty 1000 byte extent. 
+ LocAndSize records[] = { + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped + initializeV1RS(&txn, records, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 500 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 300 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 400 - Record::HeaderSize, false); // won't fit at end so wraps + rs.insertRecord(&txn, zeros, 120 - Record::HeaderSize, false); // fits at end + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); // fits in earlier hole + + { + LocAndSize recs[] = { + {DiskLoc(0, 1500), 300}, // 2nd insert + {DiskLoc(0, 1000), 400}, // 3rd (1st new) + {DiskLoc(0, 1800), 120}, // 4th + {DiskLoc(0, 1400), 60}, // 5th + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1460), 40}, + {DiskLoc(0, 1920), 80}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } + + /** + * This tests a specially crafted set of inserts that scrambles a capped collection in a way + * that leaves 4 deleted records in a single extent. + */ + TEST(CappedRecordStoreV1Scrambler, FourDeletedRecordsInSingleExtent) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( true, 0 ); + DummyCappedDocumentDeleteCallback cb; + CappedRecordStoreV1 rs(&txn, &cb, "test.foo", md, &em, false); + + { + // Starting with a single empty 1000 byte extent. + LocAndSize records[] = { + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 1000}, + {} + }; + md->setCapExtent(&txn, DiskLoc(0, 0)); + md->setCapFirstNewRecord(&txn, DiskLoc().setInvalid()); // unlooped + initializeV1RS(&txn, records, drecs, &em, md); + } + + // This list of sizes was empirically generated to achieve this outcome. 
Don't think too + // much about them. + rs.insertRecord(&txn, zeros, 500 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 300 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 304 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 76 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 76 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 56 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 104 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 146 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 146 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 40 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 40 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 36 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 96 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 200 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 60 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 64 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1148), 148}, + {DiskLoc(0, 1936), 40}, + {DiskLoc(0, 1712), 40}, + {DiskLoc(0, 1296), 36}, + {DiskLoc(0, 1752), 
100}, + {DiskLoc(0, 1332), 96}, + {DiskLoc(0, 1428), 200}, + {DiskLoc(0, 1852), 60}, + {DiskLoc(0, 1000), 64}, // (1st new) + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1064), 84}, + {DiskLoc(0, 1976), 24}, + {DiskLoc(0, 1912), 24}, + {DiskLoc(0, 1628), 84}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); + ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); + } + } +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp new file mode 100644 index 00000000000..a210c0dc0f3 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp @@ -0,0 +1,192 @@ +/** + * Copyright (C) 2014 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. 
If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +namespace mongo { + + RecordStoreV1RepairIterator::RecordStoreV1RepairIterator(OperationContext* txn, + const RecordStoreV1Base* recordStore) + : _txn(txn), _recordStore(recordStore), _stage(FORWARD_SCAN) { + + // Position the iterator at the first record + // + getNext(); + } + + bool RecordStoreV1RepairIterator::isEOF() { + return _currRecord.isNull(); + } + + DiskLoc RecordStoreV1RepairIterator::curr() { return _currRecord; } + + DiskLoc RecordStoreV1RepairIterator::getNext() { + DiskLoc retVal = _currRecord; + + const ExtentManager* em = _recordStore->_extentManager; + + while (true) { + if (_currRecord.isNull()) { + + if (!_advanceToNextValidExtent()) { + return retVal; + } + + _seenInCurrentExtent.clear(); + + // Otherwise _advanceToNextValidExtent would have returned false + // + invariant(!_currExtent.isNull()); + + const Extent* e = em->getExtent(_currExtent, false); + _currRecord = (FORWARD_SCAN == _stage ? 
e->firstRecord : e->lastRecord); + } + else { + switch (_stage) { + case FORWARD_SCAN: + _currRecord = _recordStore->getNextRecordInExtent(_txn, _currRecord); + break; + case BACKWARD_SCAN: + _currRecord = _recordStore->getPrevRecordInExtent(_txn, _currRecord); + break; + default: + invariant(!"This should never be reached."); + break; + } + } + + if (_currRecord.isNull()) { + continue; + } + + // Validate the contents of the record's disk location and deduplicate + // + if (!_seenInCurrentExtent.insert(_currRecord).second) { + error() << "infinite loop in extent, seen: " << _currRecord << " before" << endl; + _currRecord = DiskLoc(); + continue; + } + + if (_currRecord.getOfs() <= 0){ + error() << "offset is 0 for record which should be impossible" << endl; + _currRecord = DiskLoc(); + continue; + } + + return retVal; + } + } + + bool RecordStoreV1RepairIterator::_advanceToNextValidExtent() { + const ExtentManager* em = _recordStore->_extentManager; + + while (true) { + if (_currExtent.isNull()) { + switch (_stage) { + case FORWARD_SCAN: + _currExtent = _recordStore->details()->firstExtent(_txn); + break; + case BACKWARD_SCAN: + _currExtent = _recordStore->details()->lastExtent(_txn); + break; + default: + invariant(DONE == _stage); + return false; + } + } + else { + // If _currExtent is not NULL, then it must point to a valid extent, so no extra + // checks here. + // + const Extent* e = em->getExtent(_currExtent, false); + _currExtent = (FORWARD_SCAN == _stage ? 
e->xnext : e->xprev); + } + + bool hasNextExtent = !_currExtent.isNull(); + + // Sanity checks for the extent's disk location + // + if (hasNextExtent && (!_currExtent.isValid() || (_currExtent.getOfs() <= 0))) { + error() << "Invalid extent location: " << _currExtent << endl; + + // Switch the direction of scan + // + hasNextExtent = false; + } + + if (hasNextExtent) { + break; + } + + // Swap the direction of scan and loop again + // + switch (_stage) { + case FORWARD_SCAN: + _stage = BACKWARD_SCAN; + break; + case BACKWARD_SCAN: + _stage = DONE; + break; + default: + invariant(!"This should never be reached."); + break; + } + + _currExtent = DiskLoc(); + } + + + // Check _currExtent's contents for validity, but do not count is as failure if they + // don't check out. + // + const Extent* e = em->getExtent(_currExtent, false); + if (!e->isOk()){ + warning() << "Extent not ok magic: " << e->magic << " going to try to continue" + << endl; + } + + log() << (FORWARD_SCAN == _stage ? "FORWARD" : "BACKWARD") << " Extent loc: " + << _currExtent << ", length: " << e->length << endl; + + return true; + } + + void RecordStoreV1RepairIterator::invalidate(const DiskLoc& dl) { + verify(!"Invalidate is not supported for RecordStoreV1RepairIterator."); + } + + RecordData RecordStoreV1RepairIterator::dataFor(const DiskLoc& loc) const { + return _recordStore->dataFor( loc ); + } + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h new file mode 100644 index 00000000000..c75c1c790c1 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h @@ -0,0 +1,96 @@ +/** + * Copyright (C) 2014 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <set> + +#include "mongo/db/storage/record_store.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + /** + * This iterator will go over the collection twice - once going forward (first extent -> last + * extent) and once backwards in an attempt to salvage potentially corrupted or unreachable + * records. It is used by the mongodump --repair option. 
+ */ + class RecordStoreV1RepairIterator : public RecordIterator { + public: + RecordStoreV1RepairIterator(OperationContext* txn, + const RecordStoreV1Base* recordStore); + virtual ~RecordStoreV1RepairIterator() { } + + virtual bool isEOF(); + virtual DiskLoc getNext(); + virtual DiskLoc curr(); + + virtual void invalidate(const DiskLoc& dl); + virtual void prepareToYield() { } + virtual bool recoverFromYield() { + return true; + } + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + + /** + * Based on the direction of scan, finds the next valid (un-corrupted) extent in the chain + * and sets _currExtent to point to that. + * + * @return true if valid extent was found (_currExtent will not be null) + * false otherwise and _currExtent will be null + */ + bool _advanceToNextValidExtent(); + + // transactional context for read locks. Not owned by us + OperationContext* _txn; + + // Reference to the owning RecordStore. The store must not be deleted while there are + // active iterators on it. + // + const RecordStoreV1Base* _recordStore; + + DiskLoc _currExtent; + DiskLoc _currRecord; + + enum Stage { + FORWARD_SCAN = 0, + BACKWARD_SCAN = 1, + DONE = 2 + }; + + Stage _stage; + + // Used to find cycles within an extent. Cleared after each extent has been processed. + // + std::set<DiskLoc> _seenInCurrentExtent; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp new file mode 100644 index 00000000000..7a9d17974eb --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp @@ -0,0 +1,505 @@ +// record_store_v1_simple.cpp + +/** + * Copyright (C) 2013-2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +#include "mongo/base/counter.h" +#include "mongo/db/catalog/collection.h" +#include "mongo/db/curop.h" +#include "mongo/db/commands/server_status_metric.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h" +#include "mongo/util/log.h" +#include "mongo/util/progress_meter.h" +#include "mongo/util/timer.h" +#include "mongo/util/touch_pages.h" + +namespace mongo { + + MONGO_LOG_DEFAULT_COMPONENT_FILE(::mongo::logger::LogComponent::kStorage); + + static Counter64 freelistAllocs; + static Counter64 freelistBucketExhausted; + static Counter64 freelistIterations; + + static ServerStatusMetricField<Counter64> dFreelist1( "storage.freelist.search.requests", + &freelistAllocs ); + + static ServerStatusMetricField<Counter64> dFreelist2( "storage.freelist.search.bucketExhausted", + &freelistBucketExhausted ); + + static ServerStatusMetricField<Counter64> dFreelist3( "storage.freelist.search.scanned", + &freelistIterations ); + + SimpleRecordStoreV1::SimpleRecordStoreV1( OperationContext* txn, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ) + : RecordStoreV1Base( ns, details, em, isSystemIndexes ) { + + invariant( !details->isCapped() ); + _normalCollection = NamespaceString::normal( ns ); + if ( _details->paddingFactor() == 0 ) { + warning() << "implicit updgrade of paddingFactor of very old collection" << endl; + _details->setPaddingFactor(txn, 1.0); + } + + } + + SimpleRecordStoreV1::~SimpleRecordStoreV1() { + } + + DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn, + int lenToAlloc ) { + // align size up to a multiple of 4 + lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1); + + 
freelistAllocs.increment(); + DiskLoc loc; + { + DiskLoc *prev = 0; + DiskLoc *bestprev = 0; + DiskLoc bestmatch; + int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough + int b = bucket(lenToAlloc); + DiskLoc cur = _details->deletedListEntry(b); + + int extra = 5; // look for a better fit, a little. + int chain = 0; + while ( 1 ) { + { // defensive check + int fileNumber = cur.a(); + int fileOffset = cur.getOfs(); + if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) { + StringBuilder sb; + sb << "Deleted record list corrupted in collection " << _ns + << ", bucket " << b + << ", link number " << chain + << ", invalid link is " << cur.toString() + << ", throwing Fatal Assertion"; + log() << sb.str() << endl; + fassertFailed(16469); + } + } + if ( cur.isNull() ) { + // move to next bucket. if we were doing "extra", just break + if ( bestmatchlen < INT_MAX ) + break; + + if ( chain > 0 ) { + // if we looked at things in the right bucket, but they were not suitable + freelistBucketExhausted.increment(); + } + + b++; + if ( b > MaxBucket ) { + // out of space. alloc a new extent. 
+ freelistIterations.increment( 1 + chain ); + return DiskLoc(); + } + cur = _details->deletedListEntry(b); + prev = 0; + continue; + } + DeletedRecord *r = drec(cur); + if ( r->lengthWithHeaders() >= lenToAlloc && + r->lengthWithHeaders() < bestmatchlen ) { + bestmatchlen = r->lengthWithHeaders(); + bestmatch = cur; + bestprev = prev; + if (r->lengthWithHeaders() == lenToAlloc) + // exact match, stop searching + break; + } + if ( bestmatchlen < INT_MAX && --extra <= 0 ) + break; + if ( ++chain > 30 && b <= MaxBucket ) { + // too slow, force move to next bucket to grab a big chunk + //b++; + freelistIterations.increment( chain ); + chain = 0; + cur.Null(); + } + else { + cur = r->nextDeleted(); + prev = &r->nextDeleted(); + } + } + + // unlink ourself from the deleted list + DeletedRecord *bmr = drec(bestmatch); + if ( bestprev ) { + *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted(); + } + else { + // should be the front of a free-list + int myBucket = bucket(bmr->lengthWithHeaders()); + invariant( _details->deletedListEntry(myBucket) == bestmatch ); + _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted()); + } + *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive. + invariant(bmr->extentOfs() < bestmatch.getOfs()); + + freelistIterations.increment( 1 + chain ); + loc = bestmatch; + } + + if ( loc.isNull() ) + return loc; + + // determine if we should chop up + + DeletedRecord *r = drec(loc); + + /* note we want to grab from the front so our next pointers on disk tend + to go in a forward direction which is important for performance. */ + int regionlen = r->lengthWithHeaders(); + invariant( r->extentOfs() < loc.getOfs() ); + + int left = regionlen - lenToAlloc; + if ( left < 24 || left < (lenToAlloc / 8) ) { + // you get the whole thing. 
+ return loc; + } + + // don't quantize: + // - $ collections (indexes) as we already have those aligned the way we want SERVER-8425 + if ( _normalCollection ) { + // we quantize here so that it only impacts newly sized records + // this prevents oddities with older records and space re-use SERVER-8435 + lenToAlloc = std::min( r->lengthWithHeaders(), + quantizeAllocationSpace( lenToAlloc ) ); + left = regionlen - lenToAlloc; + + if ( left < 24 ) { + // you get the whole thing. + return loc; + } + } + + /* split off some for further use. */ + txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; + DiskLoc newDelLoc = loc; + newDelLoc.inc(lenToAlloc); + DeletedRecord* newDel = drec(newDelLoc); + DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel); + newDelW->extentOfs() = r->extentOfs(); + newDelW->lengthWithHeaders() = left; + newDelW->nextDeleted().Null(); + + addDeletedRec( txn, newDelLoc ); + return loc; + } + + StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ) { + DiskLoc loc = _allocFromExistingExtents( txn, lengthWithHeaders ); + if ( !loc.isNull() ) + return StatusWith<DiskLoc>( loc ); + + LOG(1) << "allocating new extent"; + + increaseStorageSize( txn, + _extentManager->followupSize( lengthWithHeaders, + _details->lastExtentSize(txn)), + enforceQuota ); + + loc = _allocFromExistingExtents( txn, lengthWithHeaders ); + if ( !loc.isNull() ) { + // got on first try + return StatusWith<DiskLoc>( loc ); + } + + log() << "warning: alloc() failed after allocating new extent. 
" + << "lengthWithHeaders: " << lengthWithHeaders << " last extent size:" + << _details->lastExtentSize(txn) << "; trying again"; + + for ( int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(txn); z++ ) { + log() << "try #" << z << endl; + + increaseStorageSize( txn, + _extentManager->followupSize( lengthWithHeaders, + _details->lastExtentSize(txn)), + enforceQuota ); + + loc = _allocFromExistingExtents( txn, lengthWithHeaders ); + if ( ! loc.isNull() ) + return StatusWith<DiskLoc>( loc ); + } + + return StatusWith<DiskLoc>( ErrorCodes::InternalError, "cannot allocate space" ); + } + + Status SimpleRecordStoreV1::truncate(OperationContext* txn) { + return Status( ErrorCodes::InternalError, + "SimpleRecordStoreV1::truncate not implemented" ); + } + + void SimpleRecordStoreV1::addDeletedRec( OperationContext* txn, const DiskLoc& dloc ) { + DeletedRecord* d = drec( dloc ); + + DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs() << endl; + + int b = bucket(d->lengthWithHeaders()); + *txn->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b); + _details->setDeletedListEntry(txn, b, dloc); + } + + RecordIterator* SimpleRecordStoreV1::getIterator( OperationContext* txn, + const DiskLoc& start, + bool tailable, + const CollectionScanParams::Direction& dir) const { + return new SimpleRecordStoreV1Iterator( txn, this, start, dir ); + } + + vector<RecordIterator*> SimpleRecordStoreV1::getManyIterators( OperationContext* txn ) const { + OwnedPointerVector<RecordIterator> iterators; + const Extent* ext; + for (DiskLoc extLoc = details()->firstExtent(txn); !extLoc.isNull(); extLoc = ext->xnext) { + ext = _getExtent(txn, extLoc); + if (ext->firstRecord.isNull()) + continue; + iterators.push_back( + new RecordStoreV1Base::IntraExtentIterator(txn, ext->firstRecord, this)); + } + + return iterators.release(); + } + + class CompactDocWriter : public DocWriter { + public: + /** + * param allocationSize - 
allocation size WITH header + */ + CompactDocWriter( const Record* rec, unsigned dataSize, size_t allocationSize ) + : _rec( rec ), + _dataSize( dataSize ), + _allocationSize( allocationSize ) { + } + + virtual ~CompactDocWriter() {} + + virtual void writeDocument( char* buf ) const { + memcpy( buf, _rec->data(), _dataSize ); + } + + virtual size_t documentSize() const { + return _allocationSize - Record::HeaderSize; + } + + virtual bool addPadding() const { + return false; + } + + private: + const Record* _rec; + size_t _dataSize; + size_t _allocationSize; + }; + + void SimpleRecordStoreV1::_compactExtent(OperationContext* txn, + const DiskLoc diskloc, + int extentNumber, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* compactOptions, + CompactStats* stats ) { + + log() << "compact begin extent #" << extentNumber + << " for namespace " << _ns << " " << diskloc; + + unsigned oldObjSize = 0; // we'll report what the old padding was + unsigned oldObjSizeWithPadding = 0; + + Extent *e = _extentManager->getExtent( diskloc ); + e->assertOk(); + fassert( 17437, e->validates(diskloc) ); + + { + // the next/prev pointers within the extent might not be in order so we first + // page the whole thing in sequentially + log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; + Timer t; + size_t length = e->length; + + touch_pages( reinterpret_cast<const char*>(e), length ); + int ms = t.millis(); + if( ms > 1000 ) + log() << "compact end paging in " << ms << "ms " + << e->length/1000000.0/t.seconds() << "MB/sec" << endl; + } + + { + log() << "compact copying records" << endl; + long long datasize = 0; + long long nrecords = 0; + DiskLoc L = e->firstRecord; + if( !L.isNull() ) { + while( 1 ) { + Record *recOld = recordFor(L); + RecordData oldData = recOld->toRecordData(); + L = getNextRecordInExtent(txn, L); + + if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) { + // object is corrupt! 
+ log() << "compact skipping corrupt document!"; + stats->corruptDocuments++; + } + else { + unsigned dataSize = adaptor->dataSize( oldData ); + unsigned docSize = dataSize; + + nrecords++; + oldObjSize += docSize; + oldObjSizeWithPadding += recOld->netLength(); + + unsigned lenWHdr = docSize + Record::HeaderSize; + unsigned lenWPadding = lenWHdr; + + switch( compactOptions->paddingMode ) { + case CompactOptions::NONE: + if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) ) + lenWPadding = quantizePowerOf2AllocationSpace(lenWPadding); + break; + case CompactOptions::PRESERVE: + // if we are preserving the padding, the record should not change size + lenWPadding = recOld->lengthWithHeaders(); + break; + case CompactOptions::MANUAL: + lenWPadding = compactOptions->computeRecordSize(lenWPadding); + if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { + lenWPadding = lenWHdr; + } + break; + } + + CompactDocWriter writer( recOld, dataSize, lenWPadding ); + StatusWith<DiskLoc> status = insertRecord( txn, &writer, false ); + uassertStatusOK( status.getStatus() ); + datasize += recordFor( status.getValue() )->netLength(); + + adaptor->inserted( dataFor( status.getValue() ), status.getValue() ); + } + + if( L.isNull() ) { + // we just did the very last record from the old extent. 
it's still pointed to + // by the old extent ext, but that will be fixed below after this loop + break; + } + + // remove the old records (orphan them) periodically so our commit block doesn't get too large + bool stopping = false; + RARELY stopping = !txn->checkForInterruptNoAssert().isOK(); + if( stopping || txn->recoveryUnit()->isCommitNeeded() ) { + *txn->recoveryUnit()->writing(&e->firstRecord) = L; + Record *r = recordFor(L); + txn->recoveryUnit()->writingInt(r->prevOfs()) = DiskLoc::NullOfs; + txn->recoveryUnit()->commitIfNeeded(); + txn->checkForInterrupt(); + } + } + } // if !L.isNull() + + invariant( _details->firstExtent(txn) == diskloc ); + invariant( _details->lastExtent(txn) != diskloc ); + DiskLoc newFirst = e->xnext; + _details->setFirstExtent( txn, newFirst ); + *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc(); + _extentManager->freeExtent( txn, diskloc ); + + txn->recoveryUnit()->commitIfNeeded(); + + { + double op = 1.0; + if( oldObjSize ) + op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; + log() << "compact finished extent #" << extentNumber << " containing " << nrecords + << " documents (" << datasize/1000000.0 << "MB)" + << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100; + } + } + + } + + Status SimpleRecordStoreV1::compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ) { + + // this is a big job, so might as well make things tidy before we start just to be nice. 
+ txn->recoveryUnit()->commitIfNeeded(); + + list<DiskLoc> extents; + for( DiskLoc extLocation = _details->firstExtent(txn); + !extLocation.isNull(); + extLocation = _extentManager->getExtent( extLocation )->xnext ) { + extents.push_back( extLocation ); + } + log() << "compact " << extents.size() << " extents"; + + log() << "compact orphan deleted lists" << endl; + _details->orphanDeletedList(txn); + + // Start over from scratch with our extent sizing and growth + _details->setLastExtentSize( txn, 0 ); + + // create a new extent so new records go there + increaseStorageSize( txn, _details->lastExtentSize(txn), true ); + + // reset data size and record counts to 0 for this namespace + // as we're about to tally them up again for each new extent + _details->setStats( txn, 0, 0 ); + + ProgressMeterHolder pm(*txn->setMessage("compact extent", + "Extent Compacting Progress", + extents.size())); + + int extentNumber = 0; + for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { + _compactExtent(txn, *i, extentNumber++, adaptor, options, stats ); + pm.hit(); + } + + invariant( _extentManager->getExtent( _details->firstExtent(txn) )->xprev.isNull() ); + invariant( _extentManager->getExtent( _details->lastExtent(txn) )->xnext.isNull() ); + + // indexes will do their own progress meter + pm.finished(); + + return Status::OK(); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h new file mode 100644 index 00000000000..abc6b11b928 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h @@ -0,0 +1,95 @@ +// record_store_v1_simple.h + +/** +* Copyright (C) 2013-2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#pragma once + +#include "mongo/db/diskloc.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class SimpleRecordStoreV1Iterator; + + // used by index and original collections + class SimpleRecordStoreV1 : public RecordStoreV1Base { + public: + SimpleRecordStoreV1( OperationContext* txn, + const StringData& ns, + RecordStoreV1MetaData* details, + ExtentManager* em, + bool isSystemIndexes ); + + virtual ~SimpleRecordStoreV1(); + + const char* name() const { return "SimpleRecordStoreV1"; } + + virtual RecordIterator* getIterator( OperationContext* txn, const DiskLoc& start, bool tailable, + const CollectionScanParams::Direction& dir) const; + + virtual std::vector<RecordIterator*> getManyIterators(OperationContext* txn) const; + + virtual Status truncate(OperationContext* txn); + + virtual void temp_cappedTruncateAfter(OperationContext* txn, DiskLoc end, bool inclusive) { + invariant(!"cappedTruncateAfter not supported"); + } + + virtual bool compactSupported() const { return true; } + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ); + + protected: + virtual bool isCapped() const { return false; } + + virtual StatusWith<DiskLoc> allocRecord( OperationContext* txn, + int lengthWithHeaders, + bool enforceQuota ); + + virtual void addDeletedRec(OperationContext* txn, + const DiskLoc& dloc); + private: + DiskLoc _allocFromExistingExtents( OperationContext* txn, + int lengthWithHeaders ); + + void _compactExtent(OperationContext* txn, + const DiskLoc diskloc, + int extentNumber, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* compactOptions, + CompactStats* stats ); + + bool _normalCollection; + + friend class SimpleRecordStoreV1Iterator; + }; + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp new file mode 100644 index 
00000000000..803b1494920 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp @@ -0,0 +1,130 @@ +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. 
+ */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h" + +#include "mongo/db/catalog/collection.h" +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +namespace mongo { + + // + // Regular / non-capped collection traversal + // + + SimpleRecordStoreV1Iterator::SimpleRecordStoreV1Iterator(OperationContext* txn, + const SimpleRecordStoreV1* collection, + const DiskLoc& start, + const CollectionScanParams::Direction& dir) + : _txn(txn), _curr(start), _recordStore(collection), _direction(dir) { + + if (_curr.isNull()) { + + const ExtentManager* em = _recordStore->_extentManager; + + if ( _recordStore->details()->firstExtent(txn).isNull() ) { + // nothing in the collection + verify( _recordStore->details()->lastExtent(txn).isNull() ); + } + else if (CollectionScanParams::FORWARD == _direction) { + + // Find a non-empty extent and start with the first record in it. + Extent* e = em->getExtent( _recordStore->details()->firstExtent(txn) ); + + while (e->firstRecord.isNull() && !e->xnext.isNull()) { + e = em->getExtent( e->xnext ); + } + + // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no + // valid e->xnext + _curr = e->firstRecord; + } + else { + // Walk backwards, skipping empty extents, and use the last record in the first + // non-empty extent we see. + Extent* e = em->getExtent( _recordStore->details()->lastExtent(txn) ); + + // TODO ELABORATE + // Does one of e->lastRecord.isNull(), e.firstRecord.isNull() imply the other? 
+ while (e->lastRecord.isNull() && !e->xprev.isNull()) { + e = em->getExtent( e->xprev ); + } + + // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no + // valid e->xprev + _curr = e->lastRecord; + } + } + } + + bool SimpleRecordStoreV1Iterator::isEOF() { + return _curr.isNull(); + } + + DiskLoc SimpleRecordStoreV1Iterator::curr() { return _curr; } + + DiskLoc SimpleRecordStoreV1Iterator::getNext() { + DiskLoc ret = _curr; + + // Move to the next thing. + if (!isEOF()) { + if (CollectionScanParams::FORWARD == _direction) { + _curr = _recordStore->getNextRecord( _txn, _curr ); + } + else { + _curr = _recordStore->getPrevRecord( _txn, _curr ); + } + } + + return ret; + } + + void SimpleRecordStoreV1Iterator::invalidate(const DiskLoc& dl) { + // Just move past the thing being deleted. + if (dl == _curr) { + // We don't care about the return of getNext so much as the side effect of moving _curr + // to the 'next' thing. + getNext(); + } + } + + void SimpleRecordStoreV1Iterator::prepareToYield() { + } + + bool SimpleRecordStoreV1Iterator::recoverFromYield() { + // if the collection is dropped, then the cursor should be destroyed + return true; + } + + RecordData SimpleRecordStoreV1Iterator::dataFor( const DiskLoc& loc ) const { + return _recordStore->dataFor( loc ); + } + +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h new file mode 100644 index 00000000000..ded30a3ee1d --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h @@ -0,0 +1,73 @@ +/** + * Copyright (C) 2013 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/storage/record_store.h" + +namespace mongo { + + class SimpleRecordStoreV1; + + /** + * This class iterates over a non-capped collection identified by 'ns'. + * The collection must exist when the constructor is called. + * + * If start is not DiskLoc(), the iteration begins at that DiskLoc. 
+ */ + class SimpleRecordStoreV1Iterator : public RecordIterator { + public: + SimpleRecordStoreV1Iterator( OperationContext* txn, + const SimpleRecordStoreV1* records, + const DiskLoc& start, + const CollectionScanParams::Direction& dir ); + virtual ~SimpleRecordStoreV1Iterator() { } + + virtual bool isEOF(); + virtual DiskLoc getNext(); + virtual DiskLoc curr(); + + virtual void invalidate(const DiskLoc& dl); + virtual void prepareToYield(); + virtual bool recoverFromYield(); + + virtual RecordData dataFor( const DiskLoc& loc ) const; + + private: + // for getNext, not owned + OperationContext* _txn; + + // The result returned on the next call to getNext(). + DiskLoc _curr; + + const SimpleRecordStoreV1* _recordStore; + + CollectionScanParams::Direction _direction; + }; + +} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp new file mode 100644 index 00000000000..31f17f42b28 --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp @@ -0,0 +1,775 @@ +// record_store_v1_simple_test.cpp + +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" + +#include "mongo/db/operation_context_noop.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" +#include "mongo/unittest/unittest.h" + +using namespace mongo; + +namespace { + + // Provides data to be inserted. Must be large enough for largest possible record. + // Should be in BSS so unused portions should be free. 
+ char zeros[20*1024*1024] = {}; + + TEST( SimpleRecordStoreV1, quantizeAllocationSpaceSimple ) { + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(33), 36); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000), 1024); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10001), 10240); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(100000), 106496); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000001), 1048576); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10000000), 10223616); + } + + TEST( SimpleRecordStoreV1, quantizeAllocationMinMaxBound ) { + const int maxSize = 16 * 1024 * 1024; + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1), 2); + ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(maxSize), maxSize); + } + + /** + * Test Quantize record allocation on every boundary, as well as boundary-1 + * @see NamespaceDetails::quantizeAllocationSpace() + */ + TEST( SimpleRecordStoreV1, quantizeAllocationBoundary ) { + for (int iBucket = 0; iBucket <= RecordStoreV1Base::MaxBucket; ++iBucket) { + // for each bucket in range [min, max) + const int bucketSize = RecordStoreV1Base::bucketSizes[iBucket]; + const int prevBucketSize = + (iBucket - 1 >= 0) ? 
RecordStoreV1Base::bucketSizes[iBucket - 1] : 0; + const int intervalSize = bucketSize / 16; + for (int iBoundary = prevBucketSize; + iBoundary < bucketSize; + iBoundary += intervalSize) { + // for each quantization boundary within the bucket + for (int iSize = iBoundary - 1; iSize <= iBoundary; ++iSize) { + // test the quantization boundary - 1, and the boundary itself + const int quantized = + RecordStoreV1Base::quantizeAllocationSpace(iSize); + // assert quantized size is greater than or equal to requested size + ASSERT(quantized >= iSize); + // assert quantized size is within one quantization interval of + // the requested size + ASSERT(quantized - iSize <= intervalSize); + // assert quantization is an idempotent operation + ASSERT(quantized == + RecordStoreV1Base::quantizeAllocationSpace(quantized)); + } + } + } + } + + /** + * For buckets up to 4MB powerOf2 allocation should round up to next power of 2. It should be + * return the input unmodified if it is already a power of 2. + */ + TEST( SimpleRecordStoreV1, quantizePowerOf2Small ) { + // only tests buckets <= 4MB. Higher buckets quatize to 1MB even with powerOf2 + for (int bucket = 0; bucket < RecordStoreV1Base::MaxBucket; bucket++) { + const int size = RecordStoreV1Base::bucketSizes[bucket]; + const int nextSize = RecordStoreV1Base::bucketSizes[bucket + 1]; + + // size - 1 is quantized to size. + ASSERT_EQUALS( size, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( size - 1 ) ); + + // size is quantized to size. + ASSERT_EQUALS( size, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( size ) ); + + // size + 1 is quantized to nextSize (unless > 4MB which is covered by next test) + if (size < 4*1024*1024) { + ASSERT_EQUALS( nextSize, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( size + 1 ) ); + } + } + } + + /** + * Within the largest bucket, quantizePowerOf2AllocationSpace quantizes to the nearest + * megabyte boundary. 
+ */ + TEST( SimpleRecordStoreV1, SimpleRecordLargePowerOf2ToMegabyteBoundary ) { + // Iterate iSize over all 1mb boundaries from the size of the next to largest bucket + // to the size of the largest bucket + 1mb. + for( int iSize = RecordStoreV1Base::bucketSizes[ RecordStoreV1Base::MaxBucket - 1 ]; + iSize <= RecordStoreV1Base::bucketSizes[ RecordStoreV1Base::MaxBucket ] + 0x100000; + iSize += 0x100000 ) { + + // iSize - 1 is quantized to iSize. + ASSERT_EQUALS( iSize, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize - 1 ) ); + + // iSize is quantized to iSize. + ASSERT_EQUALS( iSize, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize ) ); + + // iSize + 1 is quantized to iSize + 1mb. + ASSERT_EQUALS( iSize + 0x100000, + RecordStoreV1Base::quantizePowerOf2AllocationSpace( iSize + 1 ) ); + } + } + + BSONObj docForRecordSize( int size ) { + BSONObjBuilder b; + b.append( "_id", 5 ); + b.append( "x", string( size - Record::HeaderSize - 22, 'x' ) ); + BSONObj x = b.obj(); + ASSERT_EQUALS( Record::HeaderSize + x.objsize(), size ); + return x; + } + + /** alloc() quantizes the requested size using quantizeAllocationSpace() rules. */ + TEST(SimpleRecordStoreV1, AllocQuantized) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + + string myns = "test.AllocQuantized"; + SimpleRecordStoreV1 rs( &txn, myns, md, &em, false ); + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false); + ASSERT( result.isOK() ); + + // The length of the allocated record is quantized. + ASSERT_EQUALS( 320, rs.dataFor( result.getValue() ).size() + Record::HeaderSize ); + } + + /** + * alloc() does not quantize records in index collections using quantizeAllocationSpace() + * rules. 
+ */ + TEST(SimpleRecordStoreV1, AllocIndexNamespaceNotQuantized) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + + string myns = "test.AllocIndexNamespaceNotQuantized"; + SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false ); + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> result = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT( result.isOK() ); + + // The length of the allocated record is not quantized. + ASSERT_EQUALS( 300, rs.dataFor( result.getValue() ).size() + Record::HeaderSize ); + + } + + /** alloc() quantizes records in index collections to the nearest multiple of 4. */ + TEST(SimpleRecordStoreV1, AllocIndexNamespaceSlightlyQuantized) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + + string myns = "test.AllocIndexNamespaceNotQuantized"; + SimpleRecordStoreV1 rs( &txn, myns + "$x", md, &em, false ); + + BSONObj obj = docForRecordSize( 298 ); + StatusWith<DiskLoc> result = rs.insertRecord( &txn, obj.objdata(), obj.objsize(), false); + ASSERT( result.isOK() ); + + ASSERT_EQUALS( 300, rs.dataFor( result.getValue() ).size() + Record::HeaderSize ); + } + + /** alloc() returns a non quantized record larger than the requested size. 
*/ + TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecord) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 310}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 310}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** alloc() returns a non quantized record equal to the requested size. */ + TEST(SimpleRecordStoreV1, AllocExactSizeNonQuantizedDeletedRecord) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 300}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 300}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * alloc() returns a non quantized record equal to the quantized size plus some extra space + * too small to make a DeletedRecord. 
+ */ + TEST(SimpleRecordStoreV1, AllocQuantizedWithExtra) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 343}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 343}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * alloc() returns a quantized record when the extra space in the reclaimed deleted record + * is large enough to form a new deleted record. + */ + TEST(SimpleRecordStoreV1, AllocQuantizedWithoutExtra) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 344}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + + BSONObj obj = docForRecordSize( 300 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + { + LocAndSize recs[] = { + // The returned record is quantized from 300 to 320. + {DiskLoc(0, 1000), 320}, + {} + }; + LocAndSize drecs[] = { + // A new 24 byte deleted record is split off. + {DiskLoc(0, 1320), 24}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * A non quantized deleted record within 1/8 of the requested size is returned as is, even + * if a quantized portion of the deleted record could be used instead. 
+ */ + TEST(SimpleRecordStoreV1, AllocNotQuantizedNearDeletedSize) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 344}, + {} + }; + initializeV1RS(&txn, NULL, drecs, &em, md); + } + + BSONObj obj = docForRecordSize( 319 ); + StatusWith<DiskLoc> actualLocation = rs.insertRecord(&txn, obj.objdata(), obj.objsize(), false); + ASSERT_OK( actualLocation.getStatus() ); + + // Even though 319 would be quantized to 320 and 344 - 320 == 24 could become a new + // deleted record, the entire deleted record is returned because + // ( 344 - 320 ) < ( 320 / 8 ). + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 344}, + {} + }; + LocAndSize drecs[] = { + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** getRecordAllocationSize() returns its argument when the padding factor is 1.0. */ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizeNoPadding) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + ASSERT_EQUALS( 1.0, md->paddingFactor() ); + ASSERT_EQUALS( 300, rs.getRecordAllocationSize( 300 ) ); + } + + /** getRecordAllocationSize() multiplies by a padding factor > 1.0. 
*/ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizeWithPadding) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + double paddingFactor = 1.2; + md->setPaddingFactor( &txn, paddingFactor ); + ASSERT_EQUALS( paddingFactor, md->paddingFactor() ); + ASSERT_EQUALS( int(300 * paddingFactor), rs.getRecordAllocationSize( 300 ) ); + } + + /** + * getRecordAllocationSize() quantizes to the nearest power of 2 when Flag_UsePowerOf2Sizes + * is set. + */ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizePowerOf2) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( + false, + RecordStoreV1Base::Flag_UsePowerOf2Sizes ); + + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + ASSERT_EQUALS( 512, rs.getRecordAllocationSize( 300 ) ); + } + + /** + * getRecordAllocationSize() quantizes to the nearest power of 2 when Flag_UsePowerOf2Sizes + * is set, ignoring the padding factor. 
+ */ + TEST(SimpleRecordStoreV1, GetRecordAllocationSizePowerOf2PaddingIgnored) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( + false, + RecordStoreV1Base::Flag_UsePowerOf2Sizes ); + + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + md->setPaddingFactor( &txn, 2.0 ); + ASSERT_EQUALS( 2.0, md->paddingFactor() ); + ASSERT_EQUALS( 512, rs.getRecordAllocationSize( 300 ) ); + } + + + // ----------------- + + TEST( SimpleRecordStoreV1, FullSimple1 ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, + "test.foo", + md, + &em, + false ); + + + ASSERT_EQUALS( 0, md->numRecords() ); + StatusWith<DiskLoc> result = rs.insertRecord( &txn, "abc", 4, 1000 ); + ASSERT_TRUE( result.isOK() ); + ASSERT_EQUALS( 1, md->numRecords() ); + RecordData recordData = rs.dataFor( result.getValue() ); + ASSERT_EQUALS( string("abc"), string(recordData.data()) ); + } + + // ---------------- + + /** + * Inserts take the first deleted record with the correct size. 
+ */ + TEST( SimpleRecordStoreV1, InsertTakesFirstDeletedWithExactSize ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(2, 1100), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1200), 100}, // this one will be used + {DiskLoc(2, 1000), 100}, + {DiskLoc(1, 1000), 1000}, + {} + }; + + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 100 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1000), 100}, + {DiskLoc(0, 1100), 100}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1200), 100}, // this is the new record + {DiskLoc(2, 1100), 100}, + {} + }; + LocAndSize drecs[] = { + {DiskLoc(2, 1000), 100}, + {DiskLoc(1, 1000), 1000}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * Test that we keep looking for better matches for 5 links once we find a non-exact match. + * This "extra" scanning does not proceed into bigger buckets. + * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents. + */ + TEST( SimpleRecordStoreV1, InsertLooksForBetterMatchUpTo5Links ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {} + }; + LocAndSize drecs[] = { + // This intentionally leaves gaps to keep locs readable. 
+ {DiskLoc(0, 1000), 75}, // too small + {DiskLoc(0, 1100), 100}, // 1st big enough: will be first record + {DiskLoc(0, 1200), 100}, // 2nd: will be third record + {DiskLoc(0, 1300), 100}, // 3rd + {DiskLoc(0, 1400), 100}, // 4th + {DiskLoc(0, 1500), 100}, // 5th: first and third will stop once they look here + {DiskLoc(0, 1600), 80}, // 6th: second will make it here and use this + {DiskLoc(0, 1700), 999}, // bigger bucket. Should never look here + {} + }; + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 1100), 100}, // 1st insert + {DiskLoc(0, 1600), 80}, // 2nd insert + {DiskLoc(0, 1200), 100}, // 3rd insert + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 75}, + {DiskLoc(0, 1300), 100}, + {DiskLoc(0, 1400), 100}, + {DiskLoc(0, 1500), 100}, + {DiskLoc(0, 1700), 999}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * Test that we stop looking in a bucket once we see 31 too small drecs. + * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents. + */ + TEST( SimpleRecordStoreV1, InsertLooksForMatchUpTo31Links ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {} + }; + LocAndSize drecs[] = { + // This intentionally leaves gaps to keep locs readable. 
+ {DiskLoc(0, 1000), 50}, // different bucket + + {DiskLoc(0, 1100), 75}, // 1st too small in correct bucket + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, // 10th too small + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, // 20th too small + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, + {DiskLoc(0, 4000), 75}, // 30th too small + {DiskLoc(0, 4100), 75}, // 31st too small + + {DiskLoc(0, 8000), 80}, // big enough but wont be seen until we take an earlier one + {DiskLoc(0, 9000), 140}, // bigger bucket. 
jumps here after seeing 31 drecs + {} + }; + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); // takes from bigger bucket + rs.insertRecord(&txn, zeros, 70 - Record::HeaderSize, false); // removes a 75-sized drec + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); // now sees big-enough drec + + { + LocAndSize recs[] = { + {DiskLoc(0, 9000), 80}, // 1st insert went here + {DiskLoc(0, 1100), 75}, // 2nd here + {DiskLoc(0, 8000), 80}, // 3rd here + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 9000 + 80), 140 - 80}, // split off during first insert + {DiskLoc(0, 1000), 50}, + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, + {DiskLoc(0, 4000), 75}, + {DiskLoc(0, 4100), 75}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } + + /** + * Test that we stop looking in a bucket once we see 31 drecs, or look 4-past the first + * too-large match, whichever comes first. This is a combination of + * InsertLooksForBetterMatchUpTo5Links and InsertLooksForMatchUpTo31Links. + * + * WARNING: this test depends on magic numbers inside RSV1Simple::_allocFromExistingExtents. 
+ */ + TEST( SimpleRecordStoreV1, InsertLooksForMatchUpTo31LinksEvenIfFoundOversizedFit ) { + OperationContextNoop txn; + DummyExtentManager em; + DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData( false, 0 ); + SimpleRecordStoreV1 rs( &txn, "test.foo", md, &em, false ); + + { + LocAndSize recs[] = { + {} + }; + LocAndSize drecs[] = { + // This intentionally leaves gaps to keep locs readable. + {DiskLoc(0, 1000), 50}, // different bucket + + {DiskLoc(0, 1100), 75}, // 1st too small in correct bucket + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, // 10th too small + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, // 20th too small + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, // 27th too small + + {DiskLoc(0, 7000), 95}, // 1st insert takes this + {DiskLoc(0, 7100), 95}, // 3rd insert takes this + + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, // 29th too small (31st overall) + + {DiskLoc(0, 8000), 80}, // exact match. taken by 2nd insert + + {DiskLoc(0, 9000), 140}, // bigger bucket. 
Should never get here + {} + }; + initializeV1RS(&txn, recs, drecs, &em, md); + } + + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + rs.insertRecord(&txn, zeros, 80 - Record::HeaderSize, false); + + { + LocAndSize recs[] = { + {DiskLoc(0, 7000), 95}, // 1st insert went here + {DiskLoc(0, 8000), 80}, // 2nd here + {DiskLoc(0, 7100), 95}, // 3rd here + {} + }; + LocAndSize drecs[] = { + {DiskLoc(0, 1000), 50}, + {DiskLoc(0, 1100), 75}, + {DiskLoc(0, 1200), 75}, + {DiskLoc(0, 1300), 75}, + {DiskLoc(0, 1400), 75}, + {DiskLoc(0, 1500), 75}, + {DiskLoc(0, 1600), 75}, + {DiskLoc(0, 1700), 75}, + {DiskLoc(0, 1800), 75}, + {DiskLoc(0, 1900), 75}, + {DiskLoc(0, 2000), 75}, + {DiskLoc(0, 2100), 75}, + {DiskLoc(0, 2200), 75}, + {DiskLoc(0, 2300), 75}, + {DiskLoc(0, 2400), 75}, + {DiskLoc(0, 2500), 75}, + {DiskLoc(0, 2600), 75}, + {DiskLoc(0, 2700), 75}, + {DiskLoc(0, 2800), 75}, + {DiskLoc(0, 2900), 75}, + {DiskLoc(0, 3000), 75}, + {DiskLoc(0, 3100), 75}, + {DiskLoc(0, 3200), 75}, + {DiskLoc(0, 3300), 75}, + {DiskLoc(0, 3400), 75}, + {DiskLoc(0, 3500), 75}, + {DiskLoc(0, 3600), 75}, + {DiskLoc(0, 3700), 75}, + {DiskLoc(0, 3800), 75}, + {DiskLoc(0, 3900), 75}, + {DiskLoc(0, 9000), 140}, + {} + }; + assertStateV1RS(&txn, recs, drecs, &em, md); + } + } +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp new file mode 100644 index 00000000000..3ea4298332f --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp @@ -0,0 +1,608 @@ +// record_store_v1_test_help.cpp + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" + +#include <algorithm> +#include <map> +#include <set> +#include <vector> + +#include "mongo/db/storage/mmap_v1/extent.h" +#include "mongo/db/storage/mmap_v1/record.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + DummyRecordStoreV1MetaData::DummyRecordStoreV1MetaData( bool capped, int userFlags ) { + _dataSize = 0; + _numRecords = 0; + _capped = capped; + _userFlags = userFlags; + _lastExtentSize = 0; + _paddingFactor = 1; + _maxCappedDocs = numeric_limits<long long>::max(); + _capFirstNewRecord.setInvalid(); + if ( _capped ) { + // copied from NamespaceDetails::NamespaceDetails() + setDeletedListEntry( NULL, 1, DiskLoc().setInvalid() ); + } + } + + const DiskLoc& DummyRecordStoreV1MetaData::capExtent() const { + return _capExtent; + } + + void DummyRecordStoreV1MetaData::setCapExtent( OperationContext* txn, + const DiskLoc& loc ) { + _capExtent = loc; + } + + const DiskLoc& DummyRecordStoreV1MetaData::capFirstNewRecord() const { + return _capFirstNewRecord; + } + + void DummyRecordStoreV1MetaData::setCapFirstNewRecord( OperationContext* txn, + const DiskLoc& loc ) { + _capFirstNewRecord = loc; + } + + long long DummyRecordStoreV1MetaData::dataSize() const { + return _dataSize; + } + + long long DummyRecordStoreV1MetaData::numRecords() const { + return _numRecords; + } + + void DummyRecordStoreV1MetaData::incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) { + _dataSize += dataSizeIncrement; + _numRecords += numRecordsIncrement; + } + + void DummyRecordStoreV1MetaData::setStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ) { + _dataSize = dataSizeIncrement; + _numRecords = numRecordsIncrement; + } + + namespace { + DiskLoc myNull; + } + + const DiskLoc& DummyRecordStoreV1MetaData::deletedListEntry( int bucket ) const { + invariant( bucket >= 0 ); + if ( static_cast<size_t>( bucket ) 
>= _deletedLists.size() ) + return myNull; + return _deletedLists[bucket]; + } + + void DummyRecordStoreV1MetaData::setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ) { + invariant( bucket >= 0 ); + invariant( bucket < 1000 ); + while ( static_cast<size_t>( bucket ) >= _deletedLists.size() ) + _deletedLists.push_back( DiskLoc() ); + _deletedLists[bucket] = loc; + } + + void DummyRecordStoreV1MetaData::orphanDeletedList(OperationContext* txn) { + invariant( false ); + } + + const DiskLoc& DummyRecordStoreV1MetaData::firstExtent(OperationContext* txn) const { + return _firstExtent; + } + + void DummyRecordStoreV1MetaData::setFirstExtent( OperationContext* txn, + const DiskLoc& loc ) { + _firstExtent = loc; + } + + const DiskLoc& DummyRecordStoreV1MetaData::lastExtent(OperationContext* txn) const { + return _lastExtent; + } + + void DummyRecordStoreV1MetaData::setLastExtent( OperationContext* txn, + const DiskLoc& loc ) { + _lastExtent = loc; + } + + bool DummyRecordStoreV1MetaData::isCapped() const { + return _capped; + } + + bool DummyRecordStoreV1MetaData::isUserFlagSet( int flag ) const { + return _userFlags & flag; + } + + bool DummyRecordStoreV1MetaData::setUserFlag( OperationContext* txn, int flag ) { + if ( ( _userFlags & flag ) == flag ) + return false; + + _userFlags |= flag; + return true; + + } + bool DummyRecordStoreV1MetaData::clearUserFlag( OperationContext* txn, int flag ) { + if ( ( _userFlags & flag ) == 0 ) + return false; + + _userFlags &= ~flag; + return true; + + } + bool DummyRecordStoreV1MetaData::replaceUserFlags( OperationContext* txn, int flags ) { + if ( _userFlags == flags ) + return false; + _userFlags = flags; + return true; + } + + + int DummyRecordStoreV1MetaData::lastExtentSize(OperationContext* txn) const { + return _lastExtentSize; + } + + void DummyRecordStoreV1MetaData::setLastExtentSize( OperationContext* txn, int newMax ) { + _lastExtentSize = newMax; + } + + long long 
DummyRecordStoreV1MetaData::maxCappedDocs() const {
+        return _maxCappedDocs;
+    }
+
+    double DummyRecordStoreV1MetaData::paddingFactor() const {
+        return _paddingFactor;
+    }
+
+    void DummyRecordStoreV1MetaData::setPaddingFactor( OperationContext* txn,
+                                                       double paddingFactor ) {
+        _paddingFactor = paddingFactor;
+    }
+
+    // -----------------------------------------
+
+    // Frees the malloc'd buffer backing every extent this manager handed out via
+    // allocateExtent().
+    DummyExtentManager::~DummyExtentManager() {
+        for ( size_t i = 0; i < _extents.size(); i++ ) {
+            if ( _extents[i].data )
+                free( _extents[i].data );
+        }
+    }
+
+    Status DummyExtentManager::init(OperationContext* txn) {
+        return Status::OK();
+    }
+
+    int DummyExtentManager::numFiles() const {
+        return static_cast<int>( _extents.size() );
+    }
+
+    // Not supported by the dummy; tests must not call this.
+    long long DummyExtentManager::fileSize() const {
+        invariant( false );
+        return -1;
+    }
+
+    // Allocates a malloc-backed in-memory extent and initializes its header.
+    // Each extent gets its own "file" number: loc.a() is its index in _extents and
+    // the offset is always 0. initializeV1RS() depends on both details.
+    DiskLoc DummyExtentManager::allocateExtent( OperationContext* txn,
+                                                bool capped,
+                                                int size,
+                                                bool enforceQuota ) {
+        size = quantizeExtentSize( size );
+
+        ExtentInfo info;
+        info.data = static_cast<char*>( malloc( size ) );
+        info.length = size;
+
+        DiskLoc loc( _extents.size(), 0 );
+        _extents.push_back( info );
+
+        Extent* e = getExtent( loc, false );
+        e->magic = Extent::extentSignature;
+        e->myLoc = loc;
+        e->xnext.Null();
+        e->xprev.Null();
+        e->length = size;
+        e->firstRecord.Null();
+        e->lastRecord.Null();
+
+        return loc;
+
+    }
+
+    // No-op: the dummy never recycles extents; memory is reclaimed in the destructor.
+    void DummyExtentManager::freeExtents( OperationContext* txn,
+                                          DiskLoc firstExt, DiskLoc lastExt ) {
+        // XXX
+    }
+
+    // No-op: see freeExtents().
+    void DummyExtentManager::freeExtent( OperationContext* txn, DiskLoc extent ) {
+        // XXX
+    }
+    // Not supported by the dummy; tests must not call this.
+    void DummyExtentManager::freeListStats( int* numExtents, int64_t* totalFreeSize ) const {
+        invariant( false );
+    }
+
+    // Translates a DiskLoc into a raw pointer inside the owning extent's buffer,
+    // asserting that both the extent number and the offset are in range.
+    Record* DummyExtentManager::recordForV1( const DiskLoc& loc ) const {
+        invariant( static_cast<size_t>( loc.a() ) < _extents.size() );
+        invariant( static_cast<size_t>( loc.getOfs() ) < _extents[loc.a()].length );
+        char* root = _extents[loc.a()].data;
+        return reinterpret_cast<Record*>( root + loc.getOfs() );
+ } + + Extent* DummyExtentManager::extentForV1( const DiskLoc& loc ) const { + invariant( false ); + } + + DiskLoc DummyExtentManager::extentLocForV1( const DiskLoc& loc ) const { + return DiskLoc( loc.a(), 0 ); + } + + Extent* DummyExtentManager::getExtent( const DiskLoc& loc, bool doSanityCheck ) const { + invariant( !loc.isNull() ); + invariant( static_cast<size_t>( loc.a() ) < _extents.size() ); + invariant( loc.getOfs() == 0 ); + Extent* ext = reinterpret_cast<Extent*>( _extents[loc.a()].data ); + if (doSanityCheck) + ext->assertOk(); + return ext; + } + + int DummyExtentManager::maxSize() const { + return 1024 * 1024 * 64; + } + + DummyExtentManager::CacheHint* DummyExtentManager::cacheHint( const DiskLoc& extentLoc, const HintType& hint ) { + return new CacheHint(); + } + +namespace { + void accumulateExtentSizeRequirements(const LocAndSize* las, std::map<int, size_t>* sizes) { + if (!las) + return; + + while (!las->loc.isNull()) { + // We require passed in offsets to be > 1000 to leave room for Extent headers. + invariant(Extent::HeaderSize() < 1000); + invariant(las->loc.getOfs() >= 1000); + + const size_t end = las->loc.getOfs() + las->size; + size_t& sizeNeeded = (*sizes)[las->loc.a()]; + sizeNeeded = std::max(sizeNeeded, end); + las++; + } + } + + void printRecList(OperationContext* txn, + const ExtentManager* em, + const RecordStoreV1MetaData* md) { + log() << " *** BEGIN ACTUAL RECORD LIST *** "; + DiskLoc extLoc = md->firstExtent(txn); + std::set<DiskLoc> seenLocs; + while (!extLoc.isNull()) { + Extent* ext = em->getExtent(extLoc, true); + DiskLoc actualLoc = ext->firstRecord; + while (!actualLoc.isNull()) { + const Record* actualRec = em->recordForV1(actualLoc); + const int actualSize = actualRec->lengthWithHeaders(); + + log() << "loc: " << actualLoc // <--hex + << " (" << actualLoc.getOfs() << ")" + << " size: " << actualSize + << " prev: " << actualRec->prevOfs() + << " next: " << actualRec->nextOfs() + << (actualLoc == md->capFirstNewRecord() ? 
" (CAP_FIRST_NEW)" : "") + ; + + const bool foundCycle = !seenLocs.insert(actualLoc).second; + invariant(!foundCycle); + + const int nextOfs = actualRec->nextOfs(); + actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc() + : DiskLoc(actualLoc.a(), nextOfs)); + } + extLoc = ext->xnext; + } + log() << " *** END ACTUAL RECORD LIST *** "; + } + + void printDRecList(const ExtentManager* em, const RecordStoreV1MetaData* md) { + log() << " *** BEGIN ACTUAL DELETED RECORD LIST *** "; + std::set<DiskLoc> seenLocs; + for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) { + DiskLoc actualLoc = md->deletedListEntry(bucketIdx); + while (!actualLoc.isNull()) { + const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); + const int actualSize = actualDrec->lengthWithHeaders(); + + log() << "loc: " << actualLoc // <--hex + << " (" << actualLoc.getOfs() << ")" + << " size: " << actualSize + << " bucket: " << bucketIdx + << " next: " << actualDrec->nextDeleted(); + + const bool foundCycle = !seenLocs.insert(actualLoc).second; + invariant(!foundCycle); + + actualLoc = actualDrec->nextDeleted(); + } + + // Only print bucket 0 in capped collections since it contains all deleted records + if (md->isCapped()) + break; + } + log() << " *** END ACTUAL DELETED RECORD LIST *** "; + } +} + + void initializeV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + DummyExtentManager* em, + DummyRecordStoreV1MetaData* md) { + invariant(records || drecs); // if both are NULL nothing is being created... 
+ + // Need to start with a blank slate + invariant(em->numFiles() == 0); + invariant(md->firstExtent(txn).isNull()); + + // pre-allocate extents (even extents that aren't part of this RS) + { + typedef std::map<int, size_t> ExtentSizes; + ExtentSizes extentSizes; + accumulateExtentSizeRequirements(records, &extentSizes); + accumulateExtentSizeRequirements(drecs, &extentSizes); + invariant(!extentSizes.empty()); + + const int maxExtent = extentSizes.rbegin()->first; + for (int i = 0; i <= maxExtent; i++) { + const size_t size = extentSizes.count(i) ? extentSizes[i] : 0; + const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, 0); + + // This function and assertState depend on these details of DummyExtentManager + invariant(loc.a() == i); + invariant(loc.getOfs() == 0); + } + + // link together extents that should be part of this RS + md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0)); + md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0)); + for (ExtentSizes::iterator it = extentSizes.begin(); + boost::next(it) != extentSizes.end(); /* ++it */ ) { + const int a = it->first; + ++it; + const int b = it->first; + em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0); + em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0); + } + + // This signals "done allocating new extents". 
+            if (md->isCapped())
+                md->setDeletedListEntry(txn, 1, DiskLoc());
+        }
+
+        // Build the record chains: walk each extent and thread the records that
+        // belong to it into a doubly-linked list via prevOfs/nextOfs.
+        if (records && !records[0].loc.isNull()) {
+            int recIdx = 0;
+            DiskLoc extLoc = md->firstExtent(txn);
+            while (!extLoc.isNull()) {
+                Extent* ext = em->getExtent(extLoc);
+                int prevOfs = DiskLoc::NullOfs;
+                while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
+                    const DiskLoc loc = records[recIdx].loc;
+                    const int size = records[recIdx].size;
+                    invariant(size >= Record::HeaderSize);
+
+                    md->incrementStats(txn, size - Record::HeaderSize, 1);
+
+                    if (ext->firstRecord.isNull())
+                        ext->firstRecord = loc;
+
+                    Record* rec = em->recordForV1(loc);
+                    rec->lengthWithHeaders() = size;
+                    rec->extentOfs() = 0;
+
+                    rec->prevOfs() = prevOfs;
+                    prevOfs = loc.getOfs();
+
+                    // Safe: the array is terminated by a Null-loc sentinel, so
+                    // recIdx + 1 is always a valid index here.
+                    const DiskLoc nextLoc = records[recIdx + 1].loc;
+                    if (nextLoc.a() == loc.a()) { // if next is in same extent
+                        rec->nextOfs() = nextLoc.getOfs();
+                    }
+                    else {
+                        rec->nextOfs() = DiskLoc::NullOfs;
+                        ext->lastRecord = loc;
+                    }
+
+                    recIdx++;
+                }
+                extLoc = ext->xnext;
+            }
+            invariant(records[recIdx].loc.isNull());
+        }
+
+        // Build the deleted-record chains. Simple collections use one list per size
+        // bucket; capped collections thread everything into bucket 0.
+        if (drecs && !drecs[0].loc.isNull()) {
+            int drecIdx = 0;
+            DiskLoc* prevNextPtr = NULL;
+            int lastBucket = -1;
+            while (!drecs[drecIdx].loc.isNull()) {
+                const DiskLoc loc = drecs[drecIdx].loc;
+                const int size = drecs[drecIdx].size;
+                invariant(size >= Record::HeaderSize);
+                const int bucket = RecordStoreV1Base::bucket(size);
+
+                if (md->isCapped()) {
+                    // All drecs form a single list in bucket 0
+                    if (prevNextPtr == NULL) {
+                        md->setDeletedListEntry(txn, 0, loc);
+                    }
+                    else {
+                        *prevNextPtr = loc;
+                    }
+
+                    if (loc.a() < md->capExtent().a()
+                            && drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
+                        // Bucket 1 is known as cappedLastDelRecLastExtent
+                        md->setDeletedListEntry(txn, 1, loc);
+                    }
+                }
+                else if (bucket != lastBucket) {
+                    invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
+                    md->setDeletedListEntry(txn, bucket, loc);
+                    lastBucket = bucket;
+                }
+                else {
*prevNextPtr = loc; + } + + DeletedRecord* drec = &em->recordForV1(loc)->asDeleted(); + drec->lengthWithHeaders() = size; + drec->extentOfs() = 0; + drec->nextDeleted() = DiskLoc(); + prevNextPtr = &drec->nextDeleted(); + + drecIdx++; + } + } + + // Make sure we set everything up as requested. + assertStateV1RS(txn, records, drecs, em, md); + } + + void assertStateV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + const ExtentManager* em, + const DummyRecordStoreV1MetaData* md) { + invariant(records || drecs); // if both are NULL nothing is being asserted... + + try { + if (records) { + long long dataSize = 0; + long long numRecs = 0; + + int recIdx = 0; + + DiskLoc extLoc = md->firstExtent(txn); + while (!extLoc.isNull()) { // for each Extent + Extent* ext = em->getExtent(extLoc, true); + int expectedPrevOfs = DiskLoc::NullOfs; + DiskLoc actualLoc = ext->firstRecord; + while (!actualLoc.isNull()) { // for each Record in this Extent + const Record* actualRec = em->recordForV1(actualLoc); + const int actualSize = actualRec->lengthWithHeaders(); + + dataSize += actualSize - Record::HeaderSize; + numRecs += 1; + + ASSERT_EQUALS(actualLoc, records[recIdx].loc); + ASSERT_EQUALS(actualSize, records[recIdx].size); + + ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs()); + ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs); + expectedPrevOfs = actualLoc.getOfs(); + + recIdx++; + const int nextOfs = actualRec->nextOfs(); + actualLoc = (nextOfs == DiskLoc::NullOfs ? 
DiskLoc() + : DiskLoc(actualLoc.a(), nextOfs)); + } + + if (ext->xnext.isNull()) { + ASSERT_EQUALS(md->lastExtent(txn), extLoc); + } + + extLoc = ext->xnext; + } + + // both the expected and actual record lists must be done at this point + ASSERT_EQUALS(records[recIdx].loc, DiskLoc()); + + ASSERT_EQUALS(dataSize, md->dataSize()); + ASSERT_EQUALS(numRecs, md->numRecords()); + } + + if (drecs) { + int drecIdx = 0; + for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) { + DiskLoc actualLoc = md->deletedListEntry(bucketIdx); + + if (md->isCapped() && bucketIdx == 1) { + // In capped collections, the 2nd bucket (index 1) points to the drec before + // the first drec in the capExtent. If the capExtent is the first Extent, + // it should be Null. + + if (md->capExtent() == md->firstExtent(txn)) { + ASSERT_EQUALS(actualLoc, DiskLoc()); + } + else { + ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a()); + const DeletedRecord* actualDrec = + &em->recordForV1(actualLoc)->asDeleted(); + ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a()); + } + + // Don't do normal checking of bucket 1 in capped collections. Checking + // other buckets to verify that they are Null. + continue; + } + + while (!actualLoc.isNull()) { + const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); + const int actualSize = actualDrec->lengthWithHeaders(); + + ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc); + ASSERT_EQUALS(actualSize, drecs[drecIdx].size); + + // Make sure the drec is correct + ASSERT_EQUALS(actualDrec->extentOfs(), 0); + + // in capped collections all drecs are linked into a single list in bucket 0 + ASSERT_EQUALS(bucketIdx, md->isCapped() + ? 0 + : RecordStoreV1Base::bucket(actualSize)); + + drecIdx++; + actualLoc = actualDrec->nextDeleted(); + } + } + // both the expected and actual deleted lists must be done at this point + ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc()); + } + } + catch (...) 
{ + // If a test fails, provide extra info to make debugging easier + printRecList(txn, em, md); + printDRecList(em, md); + throw; + } + } +} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h new file mode 100644 index 00000000000..87ddc078b6d --- /dev/null +++ b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h @@ -0,0 +1,198 @@ +// record_store_v1_test_help.h + +/** +* Copyright (C) 2014 MongoDB Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. 
+*/ + +#pragma once + +#include <vector> + +#include "mongo/db/storage/mmap_v1/extent_manager.h" +#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" + +namespace mongo { + + class DummyRecordStoreV1MetaData : public RecordStoreV1MetaData { + public: + DummyRecordStoreV1MetaData( bool capped, int userFlags ); + virtual ~DummyRecordStoreV1MetaData(){} + + virtual const DiskLoc& capExtent() const; + virtual void setCapExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& capFirstNewRecord() const; + virtual void setCapFirstNewRecord( OperationContext* txn, const DiskLoc& loc ); + + virtual long long dataSize() const; + virtual long long numRecords() const; + + virtual void incrementStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ); + + virtual void setStats( OperationContext* txn, + long long dataSizeIncrement, + long long numRecordsIncrement ); + + virtual const DiskLoc& deletedListEntry( int bucket ) const; + virtual void setDeletedListEntry( OperationContext* txn, + int bucket, + const DiskLoc& loc ); + virtual void orphanDeletedList(OperationContext* txn); + + virtual const DiskLoc& firstExtent( OperationContext* txn ) const; + virtual void setFirstExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual const DiskLoc& lastExtent( OperationContext* txn ) const; + virtual void setLastExtent( OperationContext* txn, const DiskLoc& loc ); + + virtual bool isCapped() const; + + virtual bool isUserFlagSet( int flag ) const; + virtual int userFlags() const { return _userFlags; } + virtual bool setUserFlag( OperationContext* txn, int flag ); + virtual bool clearUserFlag( OperationContext* txn, int flag ); + virtual bool replaceUserFlags( OperationContext* txn, int flags ); + + + virtual int lastExtentSize( OperationContext* txn ) const; + virtual void setLastExtentSize( OperationContext* txn, int newMax ); + + virtual long long maxCappedDocs() const; + + virtual double paddingFactor() 
const;
+
+        virtual void setPaddingFactor( OperationContext* txn, double paddingFactor );
+
+    protected:
+
+        DiskLoc _capExtent;
+        DiskLoc _capFirstNewRecord;
+
+        long long _dataSize;
+        long long _numRecords;
+
+        DiskLoc _firstExtent;
+        DiskLoc _lastExtent;
+
+        bool _capped;
+        int _userFlags;
+        long long _maxCappedDocs;
+
+        int _lastExtentSize;
+        double _paddingFactor;
+
+        std::vector<DiskLoc> _deletedLists;
+    };
+
+    // In-memory ExtentManager for unit tests: each extent is a malloc'd buffer and
+    // gets its own "file" number. Operations the tests never need hit invariant(false).
+    class DummyExtentManager : public ExtentManager {
+    public:
+        virtual ~DummyExtentManager();
+
+        virtual Status init(OperationContext* txn);
+
+        virtual int numFiles() const;
+        virtual long long fileSize() const;
+
+        virtual DiskLoc allocateExtent( OperationContext* txn,
+                                        bool capped,
+                                        int size,
+                                        bool enforceQuota );
+
+        virtual void freeExtents( OperationContext* txn,
+                                  DiskLoc firstExt, DiskLoc lastExt );
+
+        virtual void freeExtent( OperationContext* txn, DiskLoc extent );
+
+        virtual void freeListStats( int* numExtents, int64_t* totalFreeSize ) const;
+
+        virtual Record* recordForV1( const DiskLoc& loc ) const;
+
+        virtual Extent* extentForV1( const DiskLoc& loc ) const;
+
+        virtual DiskLoc extentLocForV1( const DiskLoc& loc ) const;
+
+        virtual Extent* getExtent( const DiskLoc& loc, bool doSanityCheck = true ) const;
+
+        virtual int maxSize() const;
+
+        virtual CacheHint* cacheHint( const DiskLoc& extentLoc, const HintType& hint );
+
+    protected:
+        // One malloc'd extent buffer and its length, indexed by "file" number.
+        struct ExtentInfo {
+            char* data;
+            size_t length;
+        };
+
+        std::vector<ExtentInfo> _extents;
+    };
+
+    // A (location, total size including headers) pair used to describe the desired
+    // layout of records and deleted records in the test helpers below.
+    struct LocAndSize {
+        DiskLoc loc;
+        int size; // with headers
+    };
+
+    /**
+     * Creates a V1 storage/mmap_v1 with the passed in records and DeletedRecords (drecs).
+     *
+     * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer is shorthand for
+     * an empty list. Each extent gets its own DiskLoc file number. DiskLoc Offsets must be > 1000.
+     *
+     * records must be sorted by extent/file. offsets within an extent can be in any order.
+ * + * In a simple RS, drecs must be grouped into size-buckets, but the ordering within the size + * buckets is up to you. + * + * In a capped collection, all drecs form a single list and must be grouped by extent, with each + * extent having at least one drec. capFirstNewRecord() and capExtent() *must* be correctly set + * on md before calling. + * + * You are responsible for ensuring the records and drecs don't overlap. + * + * ExtentManager and MetaData must both be empty. + */ + void initializeV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + DummyExtentManager* em, + DummyRecordStoreV1MetaData* md); + + /** + * Asserts that the V1RecordStore defined by md has the passed in records and drecs in the + * correct order. + * + * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer means don't check + * that list. + */ + void assertStateV1RS(OperationContext* txn, + const LocAndSize* records, + const LocAndSize* drecs, + const ExtentManager* em, + const DummyRecordStoreV1MetaData* md); + +} // namespace mongo diff --git a/src/mongo/db/storage/record_store.h b/src/mongo/db/storage/record_store.h new file mode 100644 index 00000000000..8437046e5d6 --- /dev/null +++ b/src/mongo/db/storage/record_store.h @@ -0,0 +1,291 @@ +// record_store.h + +/** +* Copyright (C) 2013 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+* +* As a special exception, the copyright holders give permission to link the +* code of portions of this program with the OpenSSL library under certain +* conditions as described in each individual source file and distribute +* linked combinations including the program with the OpenSSL library. You +* must comply with the GNU Affero General Public License in all respects for +* all of the code used other than as permitted herein. If you modify file(s) +* with this exception, you may extend this exception to your version of the +* file(s), but you are not obligated to do so. If you do not wish to do so, +* delete this exception statement from your version. If you delete this +* exception statement from all source files in the program, then also delete +* it in the license file. +*/ + +#pragma once + +#include "mongo/base/owned_pointer_vector.h" +#include "mongo/bson/mutable/damage_vector.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/exec/collection_scan_common.h" +#include "mongo/db/storage/record_data.h" + +namespace mongo { + + class CappedDocumentDeleteCallback; + class Collection; + struct CompactOptions; + struct CompactStats; + class DocWriter; + class MAdvise; + class NamespaceDetails; + class OperationContext; + class Record; + + class RecordStoreCompactAdaptor; + class RecordStore; + + struct ValidateResults; + class ValidateAdaptor; + + /** + * Allows inserting a Record "in-place" without creating a copy ahead of time. 
+ */ + class DocWriter { + public: + virtual ~DocWriter() {} + virtual void writeDocument( char* buf ) const = 0; + virtual size_t documentSize() const = 0; + virtual bool addPadding() const { return true; } + }; + + /** + * @see RecordStore::updateRecord + */ + class UpdateMoveNotifier { + public: + virtual ~UpdateMoveNotifier(){} + virtual Status recordStoreGoingToMove( OperationContext* txn, + const DiskLoc& oldLocation, + const char* oldBuffer, + size_t oldSize ) = 0; + }; + + /** + * A RecordIterator provides an interface for walking over a RecordStore. + * The details of navigating the collection's structure are below this interface. + */ + class RecordIterator { + public: + virtual ~RecordIterator() { } + + // True if getNext will produce no more data, false otherwise. + virtual bool isEOF() = 0; + + // Return the DiskLoc that the iterator points at. Returns DiskLoc() if isEOF. + virtual DiskLoc curr() = 0; + + // Return the DiskLoc that the iterator points at and move the iterator to the next item + // from the collection. Returns DiskLoc() if isEOF. + virtual DiskLoc getNext() = 0; + + // Can only be called after prepareToYield and before recoverFromYield. + virtual void invalidate(const DiskLoc& dl) = 0; + + // Save any state required to resume operation (without crashing) after DiskLoc deletion or + // a collection drop. + virtual void prepareToYield() = 0; + + // Returns true if collection still exists, false otherwise. 
+ virtual bool recoverFromYield() = 0; + + // normally this will just go back to the RecordStore and convert + // but this gives the iterator an oppurtnity to optimize + virtual RecordData dataFor( const DiskLoc& loc ) const = 0; + }; + + + class RecordStore { + MONGO_DISALLOW_COPYING(RecordStore); + public: + RecordStore( const StringData& ns ) : _ns(ns.toString()) { } + + virtual ~RecordStore() { } + + // META + + // name of the RecordStore implementation + virtual const char* name() const = 0; + + virtual long long dataSize() const = 0; + + virtual long long numRecords() const = 0; + + virtual bool isCapped() const = 0; + + virtual void setCappedDeleteCallback(CappedDocumentDeleteCallback*) {invariant( false );} + + /** + * @param extraInfo - optional more debug info + * @param level - optional, level of debug info to put in (higher is more) + */ + virtual int64_t storageSize( OperationContext* txn, + BSONObjBuilder* extraInfo = NULL, + int infoLevel = 0 ) const = 0; + + // CRUD related + + virtual RecordData dataFor( const DiskLoc& loc) const = 0; + + virtual void deleteRecord( OperationContext* txn, const DiskLoc& dl ) = 0; + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const char* data, + int len, + bool enforceQuota ) = 0; + + virtual StatusWith<DiskLoc> insertRecord( OperationContext* txn, + const DocWriter* doc, + bool enforceQuota ) = 0; + + /** + * @param notifier - this is called if the document is moved + * it is to be called after the document has been written to new + * location, before deleted from old. 
+ * @return Status or DiskLoc, DiskLoc might be different + */ + virtual StatusWith<DiskLoc> updateRecord( OperationContext* txn, + const DiskLoc& oldLocation, + const char* data, + int len, + bool enforceQuota, + UpdateMoveNotifier* notifier ) = 0; + + virtual Status updateWithDamages( OperationContext* txn, + const DiskLoc& loc, + const char* damangeSource, + const mutablebson::DamageVector& damages ) = 0; + /** + * returned iterator owned by caller + * canonical to get all would be + * getIterator( txn, DiskLoc(), false, CollectionScanParams::FORWARD ) + */ + virtual RecordIterator* getIterator( OperationContext* txn, + const DiskLoc& start = DiskLoc(), + bool tailable = false, + const CollectionScanParams::Direction& dir = + CollectionScanParams::FORWARD + ) const = 0; + + /** + * Constructs an iterator over a potentially corrupted store, which can be used to salvage + * damaged records. The iterator might return every record in the store if all of them + * are reachable and not corrupted. + */ + virtual RecordIterator* getIteratorForRepair( OperationContext* txn ) const = 0; + + /** + * Returns many iterators that partition the RecordStore into many disjoint sets. Iterating + * all returned iterators is equivalent to Iterating the full store. + */ + virtual std::vector<RecordIterator*> getManyIterators( OperationContext* txn ) const = 0; + + // higher level + + + /** + * removes all Records + */ + virtual Status truncate( OperationContext* txn ) = 0; + + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. 
+ * @param inclusive - Truncate 'end' as well iff true + * XXX: this will go away soon, just needed to move for now + */ + virtual void temp_cappedTruncateAfter(OperationContext* txn, + DiskLoc end, + bool inclusive) = 0; + + // does this RecordStore support the compact operation + virtual bool compactSupported() const = 0; + virtual Status compact( OperationContext* txn, + RecordStoreCompactAdaptor* adaptor, + const CompactOptions* options, + CompactStats* stats ) = 0; + + /** + * @param full - does more checks + * @param scanData - scans each document + * @return OK if the validate run successfully + * OK will be returned even if corruption is found + * deatils will be in result + */ + virtual Status validate( OperationContext* txn, + bool full, bool scanData, + ValidateAdaptor* adaptor, + ValidateResults* results, BSONObjBuilder* output ) const = 0; + + /** + * @param scaleSize - amount by which to scale size metrics + * appends any custom stats from the RecordStore or other unique stats + */ + virtual void appendCustomStats( OperationContext* txn, + BSONObjBuilder* result, + double scale ) const = 0; + + /** + * Load all data into cache. + * What cache depends on implementation. 
+ * @param output (optional) - where to put detailed stats + */ + virtual Status touch( OperationContext* txn, BSONObjBuilder* output ) const = 0; + + /** + * @return Status::OK() if option hanlded + * InvalidOptions is option not supported + * other errors indicate option supported, but error setting + */ + virtual Status setCustomOption( OperationContext* txn, + const BSONElement& option, + BSONObjBuilder* info = NULL ) = 0; + protected: + std::string _ns; + }; + + class RecordStoreCompactAdaptor { + public: + virtual ~RecordStoreCompactAdaptor(){} + virtual bool isDataValid( const RecordData& recData ) = 0; + virtual size_t dataSize( const RecordData& recData ) = 0; + virtual void inserted( const RecordData& recData, const DiskLoc& newLocation ) = 0; + }; + + struct ValidateResults { + ValidateResults() { + valid = true; + } + bool valid; + std::vector<std::string> errors; + }; + + /** + * This is so when a RecordStore is validating all records + * it can call back to someone to check if a record is valid. + * The actual data contained in a Record is totally opaque to the implementation. 
+ */ + class ValidateAdaptor { + public: + virtual ~ValidateAdaptor(){} + + virtual Status validate( const RecordData& recordData, size_t* dataSize ) = 0; + }; +} diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl.cpp b/src/mongo/db/storage/rocks/rocks_btree_impl.cpp index 00cbbf1c580..8bd3f2734cf 100644 --- a/src/mongo/db/storage/rocks/rocks_btree_impl.cpp +++ b/src/mongo/db/storage/rocks/rocks_btree_impl.cpp @@ -87,7 +87,7 @@ namespace mongo { rocksdb::Slice sliced[2]; }; - class RocksCursor : public BtreeInterface::Cursor { + class RocksCursor : public SortedDataInterface::Cursor { public: RocksCursor( rocksdb::Iterator* iterator, bool direction ) : _iterator( iterator ), _direction( direction ), _cached( false ) { @@ -285,8 +285,8 @@ namespace mongo { return Status::OK(); } - BtreeInterface::Cursor* RocksBtreeImpl::newCursor(OperationContext* txn, - int direction) const { + SortedDataInterface::Cursor* RocksBtreeImpl::newCursor(OperationContext* txn, + int direction) const { return new RocksCursor( _db->NewIterator( rocksdb::ReadOptions(), _columnFamily ), txn, diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl.h b/src/mongo/db/storage/rocks/rocks_btree_impl.h index 2a15e46aad5..4e75ad50e11 100644 --- a/src/mongo/db/storage/rocks/rocks_btree_impl.h +++ b/src/mongo/db/storage/rocks/rocks_btree_impl.h @@ -28,7 +28,7 @@ * it in the license file. 
*/ -#include "mongo/db/structure/btree/btree_interface.h" +#include "mongo/db/storage/sorted_data_interface.h" #pragma once @@ -47,7 +47,7 @@ namespace mongo { virtual unsigned long long commit(bool mayInterrupt) = 0; }; - class RocksBtreeImpl : public BtreeInterface { + class RocksBtreeImpl : public SortedDataInterface { public: RocksBtreeImpl( rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf ); diff --git a/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp b/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp index f7102163352..e080fb08faf 100644 --- a/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp +++ b/src/mongo/db/storage/rocks/rocks_btree_impl_test.cpp @@ -126,7 +126,7 @@ namespace mongo { DiskLoc loc( 5, 16 ); { - scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) ); + scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) ); ASSERT( !cursor->locate( key, loc ) ); } @@ -140,7 +140,7 @@ namespace mongo { } { - scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) ); + scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) ); ASSERT( cursor->locate( key, loc ) ); ASSERT_EQUALS( key, cursor->getKey() ); ASSERT_EQUALS( loc, cursor->getDiskLoc() ); @@ -166,7 +166,7 @@ namespace mongo { } { - scoped_ptr<BtreeInterface::Cursor> cursor( btree.newCursor( 1 ) ); + scoped_ptr<SortedDataInterface::Cursor> cursor( btree.newCursor( 1 ) ); ASSERT( cursor->locate( BSON( "a" << 2 ), DiskLoc(0,0) ) ); ASSERT( !cursor->isEOF() ); ASSERT_EQUALS( BSON( "" << 2 ), cursor->getKey() ); diff --git a/src/mongo/db/storage/sorted_data_interface.h b/src/mongo/db/storage/sorted_data_interface.h new file mode 100644 index 00000000000..52f20a6288d --- /dev/null +++ b/src/mongo/db/storage/sorted_data_interface.h @@ -0,0 +1,200 @@ +/** + * Copyright (C) 2014 MongoDB Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/bson/ordering.h" +#include "mongo/db/catalog/head_manager.h" +#include "mongo/db/diskloc.h" +#include "mongo/db/jsobj.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/storage/record_store.h" + +#pragma once + +namespace mongo { + + class BucketDeletionNotification; + class SortedDataBuilderInterface; + + /** + * This interface is a work in progress. Notes below: + * + * This interface began as the SortedDataInterface, a way to hide the fact that there were two + * on-disk formats for the btree. 
With the introduction of other storage engines, this + * interface was generalized to provide access to sorted data. Specifically: + * + * 1. Many other storage engines provide different Btree(-ish) implementations. This interface + * could allow those interfaces to avoid storing btree buckets in an already sorted structure. + * + * TODO: See if there is actually a performance gain. + * + * 2. The existing btree implementation is written to assume that if it modifies a record it is + * modifying the underlying record. This interface is an attempt to work around that. + * + * TODO: See if this actually works. + */ + class SortedDataInterface { + public: + virtual ~SortedDataInterface() { } + + // + // Data changes + // + + /** + * Caller owns returned pointer. + * 'this' must outlive the returned pointer. + */ + virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* txn, + bool dupsAllowed) = 0; + + virtual Status insert(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc, + bool dupsAllowed) = 0; + + virtual bool unindex(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) = 0; + + // TODO: Hide this by exposing an update method? + virtual Status dupKeyCheck(OperationContext* txn, + const BSONObj& key, + const DiskLoc& loc) = 0; + + // + // Information about the tree + // + + // TODO: expose full set of args for testing? + virtual void fullValidate(OperationContext* txn, long long* numKeysOut) = 0; + + virtual bool isEmpty() = 0; + + /** + * Attempt to bring whole index into memory. No-op is ok if not supported. + */ + virtual Status touch(OperationContext* txn) const = 0; + + // + // Navigation + // + + class Cursor { + public: + virtual ~Cursor() {} + + virtual int getDirection() const = 0; + + virtual bool isEOF() const = 0; + + /** + * Will only be called with other from same index as this. + * All EOF locs should be considered equal. 
+ */ + virtual bool pointsToSamePlaceAs(const Cursor& other) const = 0; + + /** + * If the SortedDataInterface impl calls the BucketNotificationCallback, the argument must + * be forwarded to all Cursors over that SortedData. + * TODO something better. + */ + virtual void aboutToDeleteBucket(const DiskLoc& bucket) = 0; + + virtual bool locate(const BSONObj& key, const DiskLoc& loc) = 0; + + virtual void advanceTo(const BSONObj &keyBegin, + int keyBeginLen, + bool afterKey, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) = 0; + + /** + * Locate a key with fields comprised of a combination of keyBegin fields and keyEnd + * fields. + */ + virtual void customLocate(const BSONObj& keyBegin, + int keyBeginLen, + bool afterVersion, + const vector<const BSONElement*>& keyEnd, + const vector<bool>& keyEndInclusive) = 0; + + /** + * Return OK if it's not + * Otherwise return a status that can be displayed + */ + virtual BSONObj getKey() const = 0; + + virtual DiskLoc getDiskLoc() const = 0; + + virtual void advance() = 0; + + // + // Saving and restoring state + // + virtual void savePosition() = 0; + + virtual void restorePosition() = 0; + }; + + /** + * Caller takes ownership. SortedDataInterface must outlive all Cursors it produces. + */ + virtual Cursor* newCursor(OperationContext* txn, int direction) const = 0; + + // + // Index creation + // + + virtual Status initAsEmpty(OperationContext* txn) = 0; + }; + + /** + * A version-hiding wrapper around the bulk builder for the Btree. + */ + class SortedDataBuilderInterface { + public: + virtual ~SortedDataBuilderInterface() { } + + /** + * Adds 'key' to intermediate storage. + * + * 'key' must be > or >= the last key passed to this function (depends on _dupsAllowed). If + * this is violated an error Status (ErrorCodes::InternalError) will be returned. + */ + virtual Status addKey(const BSONObj& key, const DiskLoc& loc) = 0; + + /** + * commit work. 
if not called, destructor will clean up partially completed work + * (in case exception has happened). + * + * Returns number of keys added. + */ + virtual unsigned long long commit(bool mayInterrupt) = 0; + }; + +} // namespace mongo |