// btree.h /** * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #pragma once #include "../pch.h" #include "jsobj.h" #include "diskloc.h" #include "pdfile.h" #include "key.h" namespace mongo { /** * Our btree implementation generally follows the standard btree algorithm, * which is described in many places. The nodes of our btree are referred to * as buckets below. These buckets are of size BucketSize and their body is * an ordered array of pairs, where disk loc is the disk * location of a document and bson key is a projection of this document into * the schema of the index for this btree. Ordering is determined on the * basis of bson key first and then disk loc in case of a tie. All bson keys * for a btree have identical schemas with empty string field names and may * not have an objsize() exceeding KeyMax. The btree's buckets are * themselves organized into an ordered tree. Although there are exceptions, * generally buckets with n keys have n+1 children and the body of a bucket is * at least lowWaterMark bytes. A more strictly enforced requirement is that * a non root bucket must have at least one key except in certain transient * states. * * Our btrees support the following primary read operations: finding a * specified key; iterating from a starting key to the next or previous * ordered key; and skipping from a starting key to another specified key * without checking every intermediate key. The primary write operations * are insertion and deletion of keys. Insertion may trigger a bucket split * if necessary to avoid bucket overflow. In such a case, subsequent splits * will occur recursively as necessary. Deletion may trigger a bucket * rebalance, in which a size deficient bucket is filled with keys from an * adjacent bucket. In this case, splitting may potentially occur in the * parent. Deletion may alternatively trigger a merge, in which the keys * from two buckets and a key from their shared parent are combined into the * same bucket. In such a case, rebalancing or merging may proceed * recursively from the parent. * * While the btree data format has been relatively constant over time, btrees * initially created by versions of mongo earlier than the current version * may embody different properties than freshly created btrees (while * following the same data format). These older btrees are referred to * below as legacy btrees. */ const int OldBucketSize = 8192; #pragma pack(1) template< class Version > class BucketBasics; /** * This is the fixed width data component for storage of a key within a * bucket. It contains an offset pointer to the variable width bson * data component. A _KeyNode may be 'unused', please see below. */ template< class Loc > struct __KeyNode { /** Signals that we are writing this _KeyNode and casts away const */ __KeyNode & writing() const; /** * The 'left' child bucket of this key. If this is the i-th key, it * points to the i index child bucket. */ Loc prevChildBucket; /** The location of the record associated with this key. */ Loc recordLoc; short keyDataOfs() const { return (short) _kdo; } /** Offset within current bucket of the variable width bson key for this _KeyNode. */ unsigned short _kdo; void setKeyDataOfs(short s) { _kdo = s; assert(s>=0); } /** Seems to be redundant. */ void setKeyDataOfsSavingUse(short s) { _kdo = s; assert(s>=0); } /** * Unused keys are not returned by read operations. Keys may be marked * as unused in cases where it is difficult to delete them while * maintaining the constraints required of a btree. * * Setting ofs to odd is the sentinel for unused, as real recordLoc's * are always even numbers. Note we need to keep its value basically * the same as we use the recordLoc as part of the key in the index * (to handle duplicate keys efficiently). * * Flagging keys as unused is a feature that is being phased out in favor * of deleting the keys outright. The current btree implementation is * not expected to mark a key as unused in a non legacy btree. */ void setUnused() { recordLoc.GETOFS() |= 1; } void setUsed() { recordLoc.GETOFS() &= ~1; } int isUnused() const { return recordLoc.getOfs() & 1; } int isUsed() const { return !isUnused(); } }; /** * This structure represents header data for a btree bucket. An object of * this type is typically allocated inside of a buffer of size BucketSize, * resulting in a full bucket with an appropriate header. * * The body of a btree bucket contains an array of _KeyNode objects starting * from its lowest indexed bytes and growing to higher indexed bytes. The * body also contains variable width bson keys, which are allocated from the * highest indexed bytes toward lower indexed bytes. * * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb| * h = header data * k = KeyNode data * - = empty space * b = bson key data * u = unused (old) bson key data, that may be garbage collected */ class BtreeData_V0 { protected: /** Parent bucket of this bucket, which isNull() for the root bucket. */ DiskLoc parent; /** Given that there are n keys, this is the n index child. */ DiskLoc nextChild; /** can be reused, value is 8192 in current pdfile version Apr2010 */ unsigned short _wasSize; /** zero */ unsigned short _reserved1; int flags; void _init() { _reserved1 = 0; _wasSize = BucketSize; reserved = 0; } /** basicInsert() assumes the next three members are consecutive and in this order: */ /** Size of the empty region. */ int emptySize; /** Size used for bson storage, including storage of old keys. */ int topSize; /* Number of keys in the bucket. */ int n; int reserved; /* Beginning of the bucket's body */ char data[4]; public: typedef __KeyNode _KeyNode; typedef DiskLoc Loc; typedef KeyBson Key; typedef KeyBson KeyOwned; enum { BucketSize = 8192 }; // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. static const int KeyMax = OldBucketSize / 10; }; // a a a ofs ofs ofs ofs class DiskLoc56Bit { int ofs; unsigned char _a[3]; unsigned long long Z() const { // endian return *((unsigned long long*)this) & 0x00ffffffffffffffULL; } enum { // first bit of offsets used in _KeyNode we don't use -1 here. OurNullOfs = -2 }; public: template< class V > const BtreeBucket * btree() const { return DiskLoc(*this).btree(); } template< class V > BtreeBucket * btreemod() const { return DiskLoc(*this).btreemod(); } operator const DiskLoc() const { // endian if( isNull() ) return DiskLoc(); unsigned a = *((unsigned *) (_a-1)); return DiskLoc(a >> 8, ofs); } int& GETOFS() { return ofs; } int getOfs() const { return ofs; } bool operator<(const DiskLoc56Bit& rhs) const { // the orderering of dup keys in btrees isn't too critical, but we'd like to put items that are // close together on disk close together in the tree, so we do want the file # to be the most significant // bytes return Z() < rhs.Z(); } int compare(const DiskLoc56Bit& rhs) const { unsigned long long a = Z(); unsigned long long b = rhs.Z(); if( a < b ) return -1; return a == b ? 0 : 1; } bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); } bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); } bool operator==(const DiskLoc& rhs) const { return DiskLoc(*this) == rhs; } bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); } bool isNull() const { return ofs < 0; } void Null() { ofs = OurNullOfs; _a[0] = _a[1] = _a[2] = 0; } string toString() const { return DiskLoc(*this).toString(); } void operator=(const DiskLoc& loc) { ofs = loc.getOfs(); int la = loc.a(); assert( la <= 0xffffff ); // must fit in 3 bytes if( la < 0 ) { assert( la == -1 ); la = 0; ofs = OurNullOfs; } memcpy(_a, &la, 3); // endian dassert( ofs != 0 ); } DiskLoc56Bit& writing() const { return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7)); } }; class BtreeData_V1 { public: typedef DiskLoc56Bit Loc; //typedef DiskLoc Loc; typedef __KeyNode _KeyNode; typedef KeyV1 Key; typedef KeyV1Owned KeyOwned; enum { BucketSize = 8192-16 }; // leave room for Record header // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. static const int KeyMax = 1024; protected: /** Parent bucket of this bucket, which isNull() for the root bucket. */ Loc parent; /** Given that there are n keys, this is the n index child. */ Loc nextChild; unsigned short flags; /** basicInsert() assumes the next three members are consecutive and in this order: */ /** Size of the empty region. */ unsigned short emptySize; /** Size used for bson storage, including storage of old keys. */ unsigned short topSize; /* Number of keys in the bucket. */ unsigned short n; /* Beginning of the bucket's body */ char data[4]; void _init() { } }; typedef BtreeData_V0 V0; typedef BtreeData_V1 V1; /** * This class adds functionality to BtreeData for managing a single bucket. * The following policies are used in an attempt to encourage simplicity: * * Const member functions of this class are those which may be called on * an object for which writing has not been signaled. Non const member * functions may only be called on objects for which writing has been * signaled. Note that currently some const functions write to the * underlying memory representation of this bucket using optimized methods * to signal write operations. * * DiskLoc parameters that may shadow references within the btree should * be passed by value rather than by reference to non const member * functions or to const member functions which may perform writes. This way * a callee need not worry that write operations will change or invalidate * its arguments. * * The current policy for dealing with bson arguments is the opposite of * what is described above for DiskLoc arguments. We do not want to copy * bson into memory as an intermediate step for btree changes, and if bson * is to be moved it must be copied to the new location before the old * location is invalidated. Care should be taken in cases where that invalid * memory may be implicitly referenced by function arguments. * * A number of functions below require a thisLoc argument, which must be the * disk location of the bucket mapped to 'this'. */ template< class Version > class BucketBasics : public Version { public: template friend class BtreeBuilder; typedef typename Version::Key Key; typedef typename Version::_KeyNode _KeyNode; typedef typename Version::Loc Loc; int getN() const { return this->n; } /** * This is an in memory wrapper for a _KeyNode, and not itself part of btree * storage. This object and its BSONObj 'key' will become invalid if the * _KeyNode data that generated it is moved within the btree. In general, * a KeyNode should not be expected to be valid after a write. */ class KeyNode { public: KeyNode(const BucketBasics& bb, const _KeyNode &k); const Loc& prevChildBucket; const Loc& recordLoc; /* Points to the bson key storage for a _KeyNode */ Key key; }; friend class KeyNode; /** Assert write intent declared for this bucket already. */ void assertWritable(); void assertValid(const Ordering &order, bool force = false) const; void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); } /** * @return KeyNode for key at index i. The KeyNode will become invalid * if the key is moved or reassigned, or if the node is packed. In general * a KeyNode should not be expected to be valid after a write. */ const KeyNode keyNode(int i) const { if ( i >= this->n ) { massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n ); } return KeyNode(*this, k(i)); } static int headerSize() { const BucketBasics *d = 0; return (char*)&(d->data) - (char*)&(d->parent); } static int bodySize() { return Version::BucketSize - headerSize(); } static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp // for testing int nKeys() const { return this->n; } const DiskLoc getNextChild() const { return this->nextChild; } protected: char * dataAt(short ofs) { return this->data + ofs; } /** Initialize the header for a new node. */ void init(); /** * Preconditions: * - 0 <= keypos <= n * - If key is inserted at position keypos, the bucket's keys will still be * in order. * Postconditions: * - If key can fit in the bucket, the bucket may be packed and keypos * may be decreased to reflect deletion of earlier indexed keys during * packing, the key will be inserted at the updated keypos index with * a null prevChildBucket, the subsequent keys shifted to the right, * and the function will return true. * - If key cannot fit in the bucket, the bucket will be packed and * the function will return false. * Although this function is marked const, it modifies the underlying * btree representation through an optimized write intent mechanism. */ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const; /** * Preconditions: * - key / recordLoc are > all existing keys * - The keys in prevChild and their descendents are between all existing * keys and 'key'. * Postconditions: * - If there is space for key without packing, it is inserted as the * last key with specified prevChild and true is returned. * Importantly, nextChild is not updated! * - Otherwise false is returned and there is no change. */ bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild); void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) { bool ok = _pushBack( recordLoc , key , order , prevChild ); assert(ok); } /** * This is a special purpose function used by BtreeBuilder. The * interface is quite dangerous if you're not careful. The bson key * returned here points to bucket memory that has been invalidated but * not yet reclaimed. * * TODO Maybe this could be replaced with two functions, one which * returns the last key without deleting it and another which simply * deletes the last key. Then the caller would have enough control to * ensure proper memory integrity. * * Preconditions: * - bucket is not empty * - last key of bucket is used (not unused) * - nextChild isNull() * - _unalloc will work correctly as used - see code * Postconditions: * - The last key of the bucket is removed, and its key and recLoc are * returned. As mentioned above, the key points to unallocated memory. */ void popBack(DiskLoc& recLoc, Key &key); /** * Preconditions: * - 0 <= keypos < n * - there is no child bucket at keypos * - n > 1 * - if mayEmpty == false or nextChild.isNull(), n > 0 * Postconditions: * - The key at keypos is removed, and remaining keys are shifted over. * - The bucket becomes unpacked. * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys. */ void _delKeyAtPos(int keypos, bool mayEmpty = false); /* !Packed means there is deleted fragment space within the bucket. We "repack" when we run out of space before considering the node to be full. */ enum Flags { Packed=1 }; /** n == 0 is ok */ const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; } Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; } /** Same as bodySize(). */ int totalDataSize() const; /** * @return true when a key may be dropped by pack() * @param index index of the key that may be dropped * @param refPos index of a particular key of interest, which must not * be dropped; = 0 to safely ignore */ bool mayDropKey( int index, int refPos ) const; /** * Pack the bucket to reclaim space from invalidated memory. * @refPos is an index in the bucket which may be updated if we * delete keys from the bucket * This function may cast away const and perform a write. * Preconditions: none * Postconditions: * - Bucket will be packed * - Some unused nodes may be dropped, but not ones at index 0 or refPos * - Some used nodes may be moved * - If refPos is the index of an existing key, it will be updated to that * key's new index if the key is moved. */ void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const; /** Pack when already writable */ void _packReadyForMod(const Ordering &order, int &refPos); /** @return the size the bucket's body would have if we were to call pack() */ int packedDataSize( int refPos ) const; void setNotPacked() { this->flags &= ~Packed; } void setPacked() { this->flags |= Packed; } /** * Preconditions: 'bytes' is <= emptySize * Postconditions: A buffer of size 'bytes' is allocated on the top side, * and its offset is returned. */ int _alloc(int bytes); /** * This function can be used to deallocate the lowest byte index bson * buffer in the top region, which in some but not all cases is for the * n - 1 index key. This function only works correctly in certain * special cases, please be careful. * Preconditions: 'bytes' <= topSize * Postconditions: The top region is decreased */ void _unalloc(int bytes); /** * Preconditions: 'N' <= n * Postconditions: * - All keys after the N index key are dropped. * - Then bucket is packed, without dropping refPos if < refPos N. */ void truncateTo(int N, const Ordering &order, int &refPos); /** * Preconditions: * - 'nDrop' < n * - for now, refPos should be zero. * Postconditions: * - All keys before the nDrop index key are dropped. * - The bucket is packed. */ void dropFront(int nDrop, const Ordering &order, int &refPos); /** * Preconditions: 0 <= keypos < n * Postconditions: keypos indexed key is marked unused. */ void markUnused(int keypos); /** * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain. * we use tempNext() when we do that to be less confusing. (one might have written a union in C) */ DiskLoc tempNext() const { return this->parent; } void setTempNext(DiskLoc l) { this->parent = l; } void _shape(int level, stringstream&) const; int Size() const; /** @return i-indexed _KeyNode, without bounds checking */ public: const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; } _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; } protected: _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; } /** * Preconditions: 'this' is packed * @return the key index to be promoted on split * @param keypos The requested index of a key to insert, which may affect * the choice of split position. */ int splitPos( int keypos ) const; /** * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize * Postconditions: * - Increases indexes of existing _KeyNode objects by nAdd, reserving * space for additional _KeyNode objects at front. * - Does not initialize ofs values for the bson data of these * _KeyNode objects. */ void reserveKeysFront( int nAdd ); /** * Preconditions: * - 0 <= i < n * - The bson 'key' must fit in the bucket without packing. * - If 'key' and 'prevChildBucket' are set at index i, the btree * ordering properties will be maintained. * Postconditions: * - The specified key is set at index i, replacing the existing * _KeyNode data and without shifting any other _KeyNode objects. */ void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket ); }; template< class V> struct Continuation; /** * This class adds functionality for manipulating buckets that are assembled * in a tree. The requirements for const and non const functions and * arguments are generally the same as in BtreeBucket. Because this class * deals with tree structure, some functions that are marked const may * trigger modification of another node in the btree or potentially of the * current node. In such cases, the function's implementation explicitly * casts away const when indicating an intent to write to the durability * layer. The DiskLocs provided to such functions should be passed by * value if they shadow pointers within the btree. * * To clarify enforcement of referential integrity in this implementation, * we use the following pattern when deleting data we have a persistent * pointer to. The pointer is cleared or removed explicitly, then the data * it pointed to is cleaned up with a helper function. * * TODO It might make sense to put some of these functions in a class * representing a full btree instead of a single btree bucket. That would * allow us to use the const qualifier in a manner more consistent with * standard usage. Right now the interface is for both a node and a tree, * so assignment of const is sometimes nonideal. * * TODO There are several cases in which the 'this' pointer is invalidated * as a result of deallocation. A seperate class representing a btree would * alleviate some fragile cases where the implementation must currently * behave correctly if the 'this' pointer is suddenly invalidated by a * callee. */ template< class V > class BtreeBucket : public BucketBasics { friend class BtreeCursor; friend struct Continuation; public: // make compiler happy: typedef typename V::Key Key; typedef typename V::KeyOwned KeyOwned; typedef typename BucketBasics::KeyNode KeyNode; typedef typename BucketBasics::_KeyNode _KeyNode; typedef typename BucketBasics::Loc Loc; const _KeyNode& k(int i) const { return static_cast< const BucketBasics * >(this)->k(i); } protected: _KeyNode& k(int i) { return static_cast< BucketBasics * >(this)->_k(i); } public: const KeyNode keyNode(int i) const { return static_cast< const BucketBasics * >(this)->keyNode(i); } bool isHead() const { return this->parent.isNull(); } void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const; long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */ bool isUsed( int i ) const { return this->k(i).isUsed(); } string bucketSummary() const; void dump(unsigned depth=0) const; /** * @return true if key exists in index * * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.: * BSONObj order = ((IndexDetails&)idx).keyPattern(); * likewise below in bt_insert() etc. */ private: bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const; public: /** * @param self - Don't complain about ourself already being in the index case. * @return true = There is a duplicate used key. */ bool wouldCreateDup( const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order, const DiskLoc &self) const; /** * Preconditions: none * Postconditions: @return a new bucket allocated from pdfile storage * and init()-ed. This bucket is suitable to for use as a new root * or any other new node in the tree. */ static DiskLoc addBucket(const IndexDetails&); /** * Preconditions: none * Postconditions: * - Some header values in this bucket are cleared, and the bucket is * deallocated from pdfile storage. * - The memory at thisLoc is invalidated, and 'this' is invalidated. */ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id); /** * Preconditions: * - 'key' has a valid schema for this index. * - All other paramenters are valid and consistent with this index if applicable. * Postconditions: * - If key is bigger than KeyMax, @return 2 or 3 and no change. * - If key / recordLoc exist in the btree as an unused key, set them * as used and @return 0 * - If key / recordLoc exist in the btree as a used key, @throw * exception 10287 and no change. * - If key / recordLoc do not exist in the btree, they are inserted * and @return 0. The root of the btree may be changed, so * 'this'/thisLoc may no longer be the root upon return. */ int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, bool dupsAllowed, IndexDetails& idx, bool toplevel = true) const; /** does the insert in two steps - can then use an upgradable lock for step 1, which is the part which may have page faults. also that step is most of the computational work. */ void twoStepInsert(DiskLoc thisLoc, Continuation &c, bool dupsAllowed) const; /** * Preconditions: * - 'key' has a valid schema for this index, and may have objsize() > KeyMax. * Postconditions: * - If key / recordLoc are in the btree, they are removed (possibly * by being marked as an unused key), @return true, and potentially * invalidate 'this' / thisLoc and change the head. * - If key / recordLoc are not in the btree, @return false and do nothing. */ bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const; /** * locate may return an "unused" key that is just a marker. so be careful. * looks for a key:recordloc pair. * * @found - returns true if exact match found. note you can get back a position * result even if found is false. */ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const; DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const; /** * find the first instance of the key * does not handle dups * WARNING: findSingle may not be compound index safe. this may need to change. see notes in * findSingle code. * @return the record location of the first match */ DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const; /** * Advance to next or previous key in the index. * @param direction to advance. */ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const; /** Advance in specified direction to the specified key */ void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const; /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */ static void customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) ; /** @return head of the btree by traversing from current bucket. */ const DiskLoc getHead(const DiskLoc& thisLoc) const; /** get tree shape */ void shape(stringstream&) const; static void a_test(IndexDetails&); static int getKeyMax(); protected: /** * Preconditions: * - 0 <= firstIndex <= n * - -1 <= lastIndex <= n ( -1 is equivalent to n ) * Postconditions: * - Any children at indexes firstIndex through lastIndex (inclusive) * will have their parent pointers set to thisLoc. */ void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const; /** * Preconditions: * - thisLoc is not the btree head. * - n == 0 is ok * Postconditions: * - All cursors pointing to this bucket will be updated. * - This bucket's parent's child pointer is set to null. * - This bucket is deallocated from pdfile storage. * - 'this' and thisLoc are invalidated. */ void delBucket(const DiskLoc thisLoc, const IndexDetails&); /** * Preconditions: 0 <= p < n * Postconditions: * - The key at index p is removed from the btree. * - 'this' and thisLoc may be invalidated. * - The tree head may change. */ void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order); /** * Preconditions: * - n == 0 is ok * Postconditions: * - If thisLoc is head, or if its body has at least lowWaterMark bytes, * return false and do nothing. * - Otherwise, if thisLoc has left or right neighbors, either balance * or merge with them and return true. Also, 'this' and thisLoc may * be invalidated and the tree head may change. */ bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const; /** * Preconditions: * - 0 <= leftIndex < n * - The child at leftIndex or the child at leftIndex + 1 contains * fewer than lowWaterMark bytes. * Postconditions: * - If the child bucket at leftIndex can merge with the child index * at leftIndex + 1, do nothing and return false. * - Otherwise, balance keys between the leftIndex child and the * leftIndex + 1 child, return true, and possibly change the tree head. */ bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const; /** * Preconditions: * - All preconditions of tryBalanceChildren. * - The leftIndex child and leftIndex + 1 child cannot be merged. * Postconditions: * - Keys are moved between the leftIndex child and the leftIndex + 1 * child such that neither child has fewer than lowWaterMark bytes. * The tree head may change. */ void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ); /** * Preconditions: * - All preconditions of doBalanceChildren * - The leftIndex and leftIndex + 1 children are packed. * - The leftIndex + 1 child has fewer than lowWaterMark bytes. * - split returned by rebalancedSeparatorPos() * Postconditions: * - The key in lchild at index split is set as thisLoc's key at index * leftIndex, which may trigger a split and change the tree head. * The previous key in thisLoc at index leftIndex and all keys with * indexes greater than split in lchild are moved to rchild. */ void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, BtreeBucket *l, const DiskLoc lchild, BtreeBucket *r, const DiskLoc rchild, IndexDetails &id, const Ordering &order ); /** * Preconditions: * - All preconditions of doBalanceChildren * - The leftIndex and leftIndex + 1 children are packed. * - The leftIndex child has fewer than lowWaterMark bytes. * - split returned by rebalancedSeparatorPos() * Postconditions: * - The key in rchild at index split - l->n - 1 is set as thisLoc's key * at index leftIndex, which may trigger a split and change the tree * head. The previous key in thisLoc at index leftIndex and all keys * with indexes less than split - l->n - 1 in rchild are moved to * lchild. */ void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, BtreeBucket *l, const DiskLoc lchild, BtreeBucket *r, const DiskLoc rchild, IndexDetails &id, const Ordering &order ); /** * Preconditions: * - 0 <= leftIndex < n * - this->canMergeChildren( thisLoc, leftIndex ) == true * Postconditions: * - All of the above mentioned keys will be placed in the left child. * - The tree may be updated recursively, resulting in 'this' and * thisLoc being invalidated and the tree head being changed. */ void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order); /** * Preconditions: * - n == 0 * - !nextChild.isNull() * Postconditions: * - 'this' and thisLoc are deallocated (and invalidated), any cursors * to them are updated, and the tree head may change. * - nextChild replaces thisLoc in the btree structure. */ void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ); /** * @return true iff the leftIndex and leftIndex + 1 children both exist, * and if their body sizes when packed and the thisLoc key at leftIndex * would fit in a single bucket body. */ bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const; /** * Preconditions: * - leftIndex and leftIndex + 1 children are packed * - leftIndex or leftIndex + 1 child is below lowWaterMark * @return index of the rebalanced separator; the index value is * determined as if we had a bucket with body * .push( ).concat( ) * and called splitPos( 0 ) on it. */ int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const; /** * Preconditions: thisLoc has a parent * @return parent's index of thisLoc. */ int indexInParent( const DiskLoc &thisLoc ) const; public: Key keyAt(int i) const { if( i >= this->n ) return Key(); return Key(this->data + k(i).keyDataOfs()); } protected: /** * Preconditions: * - This bucket is packed. * - Cannot add a key of size KeyMax to this bucket. * - 0 <= keypos <= n is the position of a new key that will be inserted * - lchild is equal to the existing child at index keypos. * Postconditions: * - The thisLoc bucket is split into two packed buckets, possibly * invalidating the initial position of keypos, with a split key * promoted to the parent. The new key key/recordLoc will be inserted * into one of the split buckets, and lchild/rchild set appropriately. * Splitting may occur recursively, possibly changing the tree head. */ void split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx); /** * Preconditions: * - 0 <= keypos <= n * - If key / recordLoc are inserted at position keypos, with provided * lchild and rchild, the btree ordering requirements will be * maintained. * - lchild is equal to the existing child at index keypos. * - n == 0 is ok. * Postconditions: * - The key / recordLoc are inserted at position keypos, and the * bucket is split if necessary, which may change the tree head. * - The bucket may be packed or split, invalidating the specified value * of keypos. * This function will always modify thisLoc, but it's marked const because * it commonly relies on the specialized writ]e intent mechanism of basicInsert(). */ void insertHere(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const; /** bt_insert() is basically just a wrapper around this. */ int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const Key& key, const Ordering &order, bool dupsAllowed, const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const; void insertStepOne(DiskLoc thisLoc, Continuation& c, bool dupsAllowed) const; bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const; static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ; static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey); static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ); /** If child is non null, set its parent to thisLoc */ static void fix(const DiskLoc thisLoc, const DiskLoc child); /** * Preconditions: * - 0 <= keypos < n * - If the specified key and recordLoc are placed in keypos of thisLoc, * and lchild and rchild are set, the btree ordering properties will * be maintained. * - rchild == childForPos( keypos + 1 ) * - childForPos( keypos ) is referenced elsewhere if nonnull. * Postconditions: * - The key at keypos will be replaced with the specified key and * lchild, potentially splitting this bucket and changing the tree * head. * - childForPos( keypos ) will be orphaned. */ void setInternalKey( const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key &key, const Ordering &order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx); /** * Preconditions: * - 0 <= keypos < n * - The keypos or keypos+1 indexed child is non null. * Postconditions: * - The specified key is deleted by replacing it with another key if * possible. This replacement may cause a split and change the tree * head. The replacement key will be deleted from its original * location, potentially causing merges and splits that may invalidate * 'this' and thisLoc and change the tree head. * - If the key cannot be replaced, it will be marked as unused. This * is only expected in legacy btrees. */ void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ); public: /** simply builds and returns a dup key error message string */ static string dupKeyError( const IndexDetails& idx , const Key& key ); }; #pragma pack() class FieldRangeVector; class FieldRangeVectorIterator; class BtreeCursor : public Cursor { protected: BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); public: virtual ~BtreeCursor(); /** makes an appropriate subclass depending on the index version */ static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); virtual bool ok() { return !bucket.isNull(); } virtual bool advance(); virtual void noteLocation(); // updates keyAtKeyOfs... virtual void checkLocation() = 0; virtual bool supportGetMore() { return true; } virtual bool supportYields() { return true; } /** * used for multikey index traversal to avoid sending back dups. see Matcher::matches(). * if a multikey index traversal: * if loc has already been sent, returns true. * otherwise, marks loc as sent. * @return false if the loc has not been seen */ virtual bool getsetdup(DiskLoc loc) { if( _multikey ) { pair::iterator, bool> p = _dups.insert(loc); return !p.second; } return false; } virtual bool modifiedKeys() const { return _multikey; } virtual bool isMultiKey() const { return _multikey; } /*const _KeyNode& _currKeyNode() const { assert( !bucket.isNull() ); const _KeyNode& kn = keyNode(keyOfs); assert( kn.isUsed() ); return kn; }*/ /** returns BSONObj() if ofs is out of range */ virtual BSONObj keyAt(int ofs) const = 0; virtual BSONObj currKey() const = 0; virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); } virtual void aboutToDeleteBucket(const DiskLoc& b) { if ( bucket == b ) keyOfs = -1; } virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); } virtual DiskLoc refLoc() { return currLoc(); } virtual Record* _current() { return currLoc().rec(); } virtual BSONObj current() { return BSONObj(_current()); } virtual string toString(); BSONObj prettyKey( const BSONObj &key ) const { return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable(); } virtual BSONObj prettyIndexBounds() const; virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); } virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; } virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; } virtual long long nscanned() { return _nscanned; } /** for debugging only */ const DiskLoc getBucket() const { return bucket; } int getKeyOfs() const { return keyOfs; } // just for unit tests virtual bool curKeyHasChild() = 0; protected: /** * Our btrees may (rarely) have "unused" keys when items are deleted. * Skip past them. */ virtual bool skipUnusedKeys() = 0; bool skipOutOfRangeKeysAndCheckEnd(); void skipAndCheck(); void checkEnd(); /** selective audits on construction */ void audit(); virtual void _audit() = 0; virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0; virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0; virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0; /** set initial bucket */ void initWithoutIndependentFieldRanges(); /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */ void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive ); set _dups; NamespaceDetails * const d; const int idxNo; BSONObj startKey; BSONObj endKey; bool _endKeyInclusive; bool _multikey; // this must be updated every getmore batch in case someone added a multikey const IndexDetails& indexDetails; const BSONObj _order; const Ordering _ordering; DiskLoc bucket; int keyOfs; const int _direction; // 1=fwd,-1=reverse BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call DiskLoc locAtKeyOfs; const shared_ptr< FieldRangeVector > _bounds; auto_ptr< FieldRangeVectorIterator > _boundsIterator; shared_ptr< CoveredIndexMatcher > _matcher; bool _independentFieldRanges; long long _nscanned; }; template< class V > struct Continuation { //Continuation(const typename V::Key & k); Continuation(DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key, Ordering _order, IndexDetails& _idx) : bLoc(thisLoc), recordLoc(_recordLoc), key(_key), order(_order), idx(_idx) { op = Nothing; } DiskLoc bLoc; DiskLoc recordLoc; typename V::KeyOwned key; const Ordering order; IndexDetails& idx; enum Op { Nothing, SetUsed, InsertHere } op; int pos; const BtreeBucket *b; void stepTwo() { if( op == Nothing ) return; else if( op == SetUsed ) { const typename V::_KeyNode& kn = b->k(pos); kn.writing().setUsed(); } else { b->insertHere(bLoc, pos, recordLoc, key, order, DiskLoc(), DiskLoc(), idx); } } }; /** Renames the index namespace for this btree's index. */ void renameIndexNamespace(const char *oldNs, const char *newNs); /** * give us a writable version of the btree bucket (declares write intent). * note it is likely more efficient to declare write intent on something smaller when you can. */ template< class V > BtreeBucket * DiskLoc::btreemod() const { assert( _a != -1 ); BtreeBucket *b = const_cast< BtreeBucket * >( btree() ); return static_cast< BtreeBucket* >( getDur().writingPtr( b, V::BucketSize ) ); } template< class V > BucketBasics::KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) : prevChildBucket(k.prevChildBucket), recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) { } } // namespace mongo;