// btree.h
/**
* Copyright (C) 2008 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "../pch.h"
#include "jsobj.h"
#include "diskloc.h"
#include "pdfile.h"
#include "key.h"
namespace mongo {
/**
* Our btree implementation generally follows the standard btree algorithm,
* which is described in many places. The nodes of our btree are referred to
* as buckets below. These buckets are of size BucketSize and their body is
* an ordered array of <bson key, disk loc> pairs, where disk loc is the disk
* location of a document and bson key is a projection of this document into
* the schema of the index for this btree. Ordering is determined on the
* basis of bson key first and then disk loc in case of a tie. All bson keys
* for a btree have identical schemas with empty string field names and may
* not have an objsize() exceeding KeyMax. The btree's buckets are
* themselves organized into an ordered tree. Although there are exceptions,
* generally buckets with n keys have n+1 children and the body of a bucket is
* at least lowWaterMark bytes. A more strictly enforced requirement is that
* a non root bucket must have at least one key except in certain transient
* states.
*
* Our btrees support the following primary read operations: finding a
* specified key; iterating from a starting key to the next or previous
* ordered key; and skipping from a starting key to another specified key
* without checking every intermediate key. The primary write operations
* are insertion and deletion of keys. Insertion may trigger a bucket split
* if necessary to avoid bucket overflow. In such a case, subsequent splits
* will occur recursively as necessary. Deletion may trigger a bucket
* rebalance, in which a size deficient bucket is filled with keys from an
* adjacent bucket. In this case, splitting may potentially occur in the
* parent. Deletion may alternatively trigger a merge, in which the keys
* from two buckets and a key from their shared parent are combined into the
* same bucket. In such a case, rebalancing or merging may proceed
* recursively from the parent.
*
* While the btree data format has been relatively constant over time, btrees
* initially created by versions of mongo earlier than the current version
* may embody different properties than freshly created btrees (while
* following the same data format). These older btrees are referred to
* below as legacy btrees.
*/
const int OldBucketSize = 8192;
#pragma pack(1)
template< class Version > class BucketBasics;
/**
* This is the fixed width data component for storage of a key within a
* bucket. It contains an offset pointer to the variable width bson
* data component. A _KeyNode may be 'unused', please see below.
*/
template< class Loc >
struct __KeyNode {
/** Signals that we are writing this _KeyNode and casts away const */
__KeyNode & writing() const;
/**
 * The 'left' child bucket of this key. If this is the i-th key, it
 * points to the i index child bucket.
 */
Loc prevChildBucket;
/** The location of the record associated with this key. */
Loc recordLoc;
/** Narrowing accessor: _kdo is stored as unsigned short but exposed as short. */
short keyDataOfs() const { return (short) _kdo; }
/** Offset within current bucket of the variable width bson key for this _KeyNode. */
unsigned short _kdo;
// Note: the assert runs after the store, so a negative value has already
// been written (as a large unsigned) by the time it fires in debug builds.
void setKeyDataOfs(short s) {
_kdo = s;
assert(s>=0);
}
/** Seems to be redundant. */
// NOTE(review): body is identical to setKeyDataOfs(); presumably kept for
// historical call sites -- confirm before consolidating.
void setKeyDataOfsSavingUse(short s) {
_kdo = s;
assert(s>=0);
}
/**
 * Unused keys are not returned by read operations. Keys may be marked
 * as unused in cases where it is difficult to delete them while
 * maintaining the constraints required of a btree.
 *
 * Setting ofs to odd is the sentinel for unused, as real recordLoc's
 * are always even numbers. Note we need to keep its value basically
 * the same as we use the recordLoc as part of the key in the index
 * (to handle duplicate keys efficiently).
 *
 * Flagging keys as unused is a feature that is being phased out in favor
 * of deleting the keys outright. The current btree implementation is
 * not expected to mark a key as unused in a non legacy btree.
 */
void setUnused() {
recordLoc.GETOFS() |= 1;
}
/** Clear the unused sentinel bit, marking this key as live again. */
void setUsed() { recordLoc.GETOFS() &= ~1; }
/** @return nonzero iff the unused sentinel bit is set on recordLoc's offset. */
int isUnused() const {
return recordLoc.getOfs() & 1;
}
/** @return true iff this key is live (not flagged unused). */
int isUsed() const {
return !isUnused();
}
};
/**
* This structure represents header data for a btree bucket. An object of
* this type is typically allocated inside of a buffer of size BucketSize,
* resulting in a full bucket with an appropriate header.
*
* The body of a btree bucket contains an array of _KeyNode objects starting
* from its lowest indexed bytes and growing to higher indexed bytes. The
* body also contains variable width bson keys, which are allocated from the
* highest indexed bytes toward lower indexed bytes.
*
* |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
* h = header data
* k = KeyNode data
* - = empty space
* b = bson key data
* u = unused (old) bson key data, that may be garbage collected
*/
/**
 * Header layout for a V0 (legacy) btree bucket. Member order and sizes are
 * part of the on-disk format (built under #pragma pack(1)) and must not be
 * rearranged.
 */
class BtreeData_V0 {
protected:
    /** Parent bucket of this bucket, which isNull() for the root bucket. */
    DiskLoc parent;
    /** Given that there are n keys, this is the n index child. */
    DiskLoc nextChild;
    /** can be reused, value is 8192 in current pdfile version Apr2010 */
    unsigned short _wasSize;
    /** zero */
    unsigned short _reserved1;
    int flags;

    /** Reset the reserved header fields for a freshly allocated bucket. */
    void _init() {
        _reserved1 = 0;
        _wasSize = BucketSize;
        reserved = 0;
    }

    /** basicInsert() assumes the next three members are consecutive and in this order: */

    /** Size of the empty region. */
    int emptySize;
    /** Size used for bson storage, including storage of old keys. */
    int topSize;
    /* Number of keys in the bucket. */
    int n;

    int reserved;
    /* Beginning of the bucket's body */
    char data[4];

public:
    // __KeyNode is a class template; V0 buckets store full 8-byte DiskLocs.
    typedef __KeyNode<DiskLoc> _KeyNode;
    typedef DiskLoc Loc;
    typedef KeyBson Key;
    typedef KeyBson KeyOwned;
    enum { BucketSize = 8192 };
    // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
    static const int KeyMax = OldBucketSize / 10;
};
// a a a ofs ofs ofs ofs
/**
 * A 7-byte disk location used inside V1 bucket keys to save space versus the
 * 8-byte DiskLoc: a 4-byte offset followed by a 3-byte file number.
 * Layout-sensitive (#pragma pack(1)); do not reorder members.
 */
class DiskLoc56Bit {
    int ofs;
    unsigned char _a[3];
    // Whole-value comparison key with the file # in the most significant
    // bytes. NOTE(review): reads 8 bytes through a 7-byte object -- relies on
    // this living inside a larger bucket buffer, and on little-endian layout.
    unsigned long long Z() const {
        // endian
        return *((unsigned long long*)this) & 0x00ffffffffffffffULL;
    }
    enum {
        // first bit of offsets used in _KeyNode we don't use -1 here.
        OurNullOfs = -2
    };
public:
    /** @return the bucket this location refers to (read-only mapping). */
    template< class V >
    const BtreeBucket<V> * btree() const {
        return DiskLoc(*this).btree<V>();
    }
    /** @return the bucket this location refers to, declared for writing. */
    template< class V >
    BtreeBucket<V> * btreemod() const {
        return DiskLoc(*this).btreemod<V>();
    }
    /** Widen back to a full DiskLoc; a null 56-bit loc maps to DiskLoc(). */
    operator const DiskLoc() const {
        // endian
        if( isNull() ) return DiskLoc();
        unsigned a = *((unsigned *) (_a-1));
        return DiskLoc(a >> 8, ofs);
    }
    int& GETOFS() { return ofs; }
    int getOfs() const { return ofs; }
    bool operator<(const DiskLoc56Bit& rhs) const {
        // the orderering of dup keys in btrees isn't too critical, but we'd like to put items that are
        // close together on disk close together in the tree, so we do want the file # to be the most significant
        // bytes
        return Z() < rhs.Z();
    }
    /** Three-way comparison consistent with operator<. */
    int compare(const DiskLoc56Bit& rhs) const {
        unsigned long long a = Z();
        unsigned long long b = rhs.Z();
        if( a < b ) return -1;
        return a == b ? 0 : 1;
    }
    bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); }
    bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); }
    bool operator==(const DiskLoc& rhs) const {
        return DiskLoc(*this) == rhs;
    }
    bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); }
    /** Null is encoded as a negative offset (see OurNullOfs). */
    bool isNull() const { return ofs < 0; }
    void Null() {
        ofs = OurNullOfs;
        _a[0] = _a[1] = _a[2] = 0;
    }
    string toString() const { return DiskLoc(*this).toString(); }
    /** Narrow a DiskLoc into 7 bytes; the file # must fit in 3 bytes. */
    void operator=(const DiskLoc& loc) {
        ofs = loc.getOfs();
        int la = loc.a();
        assert( la <= 0xffffff ); // must fit in 3 bytes
        if( la < 0 ) {
            assert( la == -1 );
            la = 0;
            ofs = OurNullOfs;
        }
        memcpy(_a, &la, 3); // endian
        dassert( ofs != 0 );
    }
    /** Declare write intent for all 7 bytes of this location. */
    DiskLoc56Bit& writing() const {
        return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7));
    }
};
/**
 * Header layout for a V1 btree bucket. Uses 7-byte DiskLoc56Bit locations
 * and 16-bit size/count fields to pack more keys per bucket than V0.
 * Member order and sizes are part of the on-disk format (#pragma pack(1)).
 */
class BtreeData_V1 {
public:
    typedef DiskLoc56Bit Loc;
    //typedef DiskLoc Loc;
    // __KeyNode is a class template; V1 buckets store compact 56-bit locs.
    typedef __KeyNode<Loc> _KeyNode;
    typedef KeyV1 Key;
    typedef KeyV1Owned KeyOwned;
    enum { BucketSize = 8192-16 }; // leave room for Record header
    // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
    static const int KeyMax = 1024;
protected:
    /** Parent bucket of this bucket, which isNull() for the root bucket. */
    Loc parent;
    /** Given that there are n keys, this is the n index child. */
    Loc nextChild;
    unsigned short flags;

    /** basicInsert() assumes the next three members are consecutive and in this order: */

    /** Size of the empty region. */
    unsigned short emptySize;
    /** Size used for bson storage, including storage of old keys. */
    unsigned short topSize;
    /* Number of keys in the bucket. */
    unsigned short n;

    /* Beginning of the bucket's body */
    char data[4];

    /** V1 has no reserved header fields to clear. */
    void _init() { }
};
typedef BtreeData_V0 V0;
typedef BtreeData_V1 V1;
/**
* This class adds functionality to BtreeData for managing a single bucket.
* The following policies are used in an attempt to encourage simplicity:
*
* Const member functions of this class are those which may be called on
* an object for which writing has not been signaled. Non const member
* functions may only be called on objects for which writing has been
* signaled. Note that currently some const functions write to the
* underlying memory representation of this bucket using optimized methods
* to signal write operations.
*
* DiskLoc parameters that may shadow references within the btree should
* be passed by value rather than by reference to non const member
* functions or to const member functions which may perform writes. This way
* a callee need not worry that write operations will change or invalidate
* its arguments.
*
* The current policy for dealing with bson arguments is the opposite of
* what is described above for DiskLoc arguments. We do not want to copy
* bson into memory as an intermediate step for btree changes, and if bson
* is to be moved it must be copied to the new location before the old
* location is invalidated. Care should be taken in cases where that invalid
* memory may be implicitly referenced by function arguments.
*
* A number of functions below require a thisLoc argument, which must be the
* disk location of the bucket mapped to 'this'.
*/
template< class Version >
class BucketBasics : public Version {
public:
    // Befriend every instantiation of the builder; the friend declaration
    // must carry its own template parameter list.
    template< class U > friend class BtreeBuilder;
    typedef typename Version::Key Key;
    typedef typename Version::_KeyNode _KeyNode;
    typedef typename Version::Loc Loc;

    /** @return number of keys currently in this bucket. */
    int getN() const { return this->n; }

    /**
     * This is an in memory wrapper for a _KeyNode, and not itself part of btree
     * storage. This object and its BSONObj 'key' will become invalid if the
     * _KeyNode data that generated it is moved within the btree. In general,
     * a KeyNode should not be expected to be valid after a write.
     */
    class KeyNode {
    public:
        KeyNode(const BucketBasics& bb, const _KeyNode &k);
        const Loc& prevChildBucket;
        const Loc& recordLoc;
        /* Points to the bson key storage for a _KeyNode */
        Key key;
    };
    friend class KeyNode;

    /** Assert write intent declared for this bucket already. */
    void assertWritable();

    void assertValid(const Ordering &order, bool force = false) const;
    void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }

    /**
     * @return KeyNode for key at index i. The KeyNode will become invalid
     * if the key is moved or reassigned, or if the node is packed. In general
     * a KeyNode should not be expected to be valid after a write.
     */
    const KeyNode keyNode(int i) const {
        if ( i >= this->n ) {
            massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n );
        }
        return KeyNode(*this, k(i));
    }

    /**
     * Byte offset of the body within a bucket, computed offsetof-style from a
     * null object pointer (a long-standing idiom here; technically UB but
     * relied upon by this codebase).
     */
    static int headerSize() {
        const BucketBasics *d = 0;
        return (char*)&(d->data) - (char*)&(d->parent);
    }
    /** Bytes available for _KeyNode and bson key storage. */
    static int bodySize() { return Version::BucketSize - headerSize(); }
    static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp

    // for testing
    int nKeys() const { return this->n; }
    const DiskLoc getNextChild() const { return this->nextChild; }

protected:
    /** @return pointer into the bucket body at the given offset. */
    char * dataAt(short ofs) { return this->data + ofs; }

    /** Initialize the header for a new node. */
    void init();

    /**
     * Preconditions:
     * - 0 <= keypos <= n
     * - If key is inserted at position keypos, the bucket's keys will still be
     * in order.
     * Postconditions:
     * - If key can fit in the bucket, the bucket may be packed and keypos
     * may be decreased to reflect deletion of earlier indexed keys during
     * packing, the key will be inserted at the updated keypos index with
     * a null prevChildBucket, the subsequent keys shifted to the right,
     * and the function will return true.
     * - If key cannot fit in the bucket, the bucket will be packed and
     * the function will return false.
     * Although this function is marked const, it modifies the underlying
     * btree representation through an optimized write intent mechanism.
     */
    bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const;

    /**
     * Preconditions:
     * - key / recordLoc are > all existing keys
     * - The keys in prevChild and their descendents are between all existing
     * keys and 'key'.
     * Postconditions:
     * - If there is space for key without packing, it is inserted as the
     * last key with specified prevChild and true is returned.
     * Importantly, nextChild is not updated!
     * - Otherwise false is returned and there is no change.
     */
    bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild);
    /** Same as _pushBack(), but asserts that the append succeeded. */
    void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
        bool ok = _pushBack( recordLoc , key , order , prevChild );
        assert(ok);
    }

    /**
     * This is a special purpose function used by BtreeBuilder. The
     * interface is quite dangerous if you're not careful. The bson key
     * returned here points to bucket memory that has been invalidated but
     * not yet reclaimed.
     *
     * TODO Maybe this could be replaced with two functions, one which
     * returns the last key without deleting it and another which simply
     * deletes the last key. Then the caller would have enough control to
     * ensure proper memory integrity.
     *
     * Preconditions:
     * - bucket is not empty
     * - last key of bucket is used (not unused)
     * - nextChild isNull()
     * - _unalloc will work correctly as used - see code
     * Postconditions:
     * - The last key of the bucket is removed, and its key and recLoc are
     * returned. As mentioned above, the key points to unallocated memory.
     */
    void popBack(DiskLoc& recLoc, Key &key);

    /**
     * Preconditions:
     * - 0 <= keypos < n
     * - there is no child bucket at keypos
     * - n > 1
     * - if mayEmpty == false or nextChild.isNull(), n > 0
     * Postconditions:
     * - The key at keypos is removed, and remaining keys are shifted over.
     * - The bucket becomes unpacked.
     * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys.
     */
    void _delKeyAtPos(int keypos, bool mayEmpty = false);

    /* !Packed means there is deleted fragment space within the bucket.
       We "repack" when we run out of space before considering the node
       to be full.
    */
    enum Flags { Packed=1 };

    /** n == 0 is ok */
    const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
    Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }

    /** Same as bodySize(). */
    int totalDataSize() const;

    /**
     * @return true when a key may be dropped by pack()
     * @param index index of the key that may be dropped
     * @param refPos index of a particular key of interest, which must not
     * be dropped; = 0 to safely ignore
     */
    bool mayDropKey( int index, int refPos ) const;

    /**
     * Pack the bucket to reclaim space from invalidated memory.
     * @refPos is an index in the bucket which may be updated if we
     * delete keys from the bucket
     * This function may cast away const and perform a write.
     * Preconditions: none
     * Postconditions:
     * - Bucket will be packed
     * - Some unused nodes may be dropped, but not ones at index 0 or refPos
     * - Some used nodes may be moved
     * - If refPos is the index of an existing key, it will be updated to that
     * key's new index if the key is moved.
     */
    void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
    /** Pack when already writable */
    void _packReadyForMod(const Ordering &order, int &refPos);

    /** @return the size the bucket's body would have if we were to call pack() */
    int packedDataSize( int refPos ) const;
    void setNotPacked() { this->flags &= ~Packed; }
    void setPacked() { this->flags |= Packed; }

    /**
     * Preconditions: 'bytes' is <= emptySize
     * Postconditions: A buffer of size 'bytes' is allocated on the top side,
     * and its offset is returned.
     */
    int _alloc(int bytes);

    /**
     * This function can be used to deallocate the lowest byte index bson
     * buffer in the top region, which in some but not all cases is for the
     * n - 1 index key. This function only works correctly in certain
     * special cases, please be careful.
     * Preconditions: 'bytes' <= topSize
     * Postconditions: The top region is decreased
     */
    void _unalloc(int bytes);

    /**
     * Preconditions: 'N' <= n
     * Postconditions:
     * - All keys after the N index key are dropped.
     * - Then bucket is packed, without dropping refPos if < refPos N.
     */
    void truncateTo(int N, const Ordering &order, int &refPos);

    /**
     * Preconditions:
     * - 'nDrop' < n
     * - for now, refPos should be zero.
     * Postconditions:
     * - All keys before the nDrop index key are dropped.
     * - The bucket is packed.
     */
    void dropFront(int nDrop, const Ordering &order, int &refPos);

    /**
     * Preconditions: 0 <= keypos < n
     * Postconditions: keypos indexed key is marked unused.
     */
    void markUnused(int keypos);

    /**
     * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
     * we use tempNext() when we do that to be less confusing. (one might have written a union in C)
     */
    DiskLoc tempNext() const { return this->parent; }
    void setTempNext(DiskLoc l) { this->parent = l; }

    void _shape(int level, stringstream&) const;
    int Size() const;

    /** @return i-indexed _KeyNode, without bounds checking */
public:
    const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
    _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; }
protected:
    _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; }

    /**
     * Preconditions: 'this' is packed
     * @return the key index to be promoted on split
     * @param keypos The requested index of a key to insert, which may affect
     * the choice of split position.
     */
    int splitPos( int keypos ) const;

    /**
     * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
     * Postconditions:
     * - Increases indexes of existing _KeyNode objects by nAdd, reserving
     * space for additional _KeyNode objects at front.
     * - Does not initialize ofs values for the bson data of these
     * _KeyNode objects.
     */
    void reserveKeysFront( int nAdd );

    /**
     * Preconditions:
     * - 0 <= i < n
     * - The bson 'key' must fit in the bucket without packing.
     * - If 'key' and 'prevChildBucket' are set at index i, the btree
     * ordering properties will be maintained.
     * Postconditions:
     * - The specified key is set at index i, replacing the existing
     * _KeyNode data and without shifting any other _KeyNode objects.
     */
    void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
};
template< class V>
struct Continuation;
/**
* This class adds functionality for manipulating buckets that are assembled
* in a tree. The requirements for const and non const functions and
* arguments are generally the same as in BtreeBucket. Because this class
* deals with tree structure, some functions that are marked const may
* trigger modification of another node in the btree or potentially of the
* current node. In such cases, the function's implementation explicitly
* casts away const when indicating an intent to write to the durability
* layer. The DiskLocs provided to such functions should be passed by
* value if they shadow pointers within the btree.
*
* To clarify enforcement of referential integrity in this implementation,
* we use the following pattern when deleting data we have a persistent
* pointer to. The pointer is cleared or removed explicitly, then the data
* it pointed to is cleaned up with a helper function.
*
* TODO It might make sense to put some of these functions in a class
* representing a full btree instead of a single btree bucket. That would
* allow us to use the const qualifier in a manner more consistent with
* standard usage. Right now the interface is for both a node and a tree,
* so assignment of const is sometimes nonideal.
*
* TODO There are several cases in which the 'this' pointer is invalidated
* as a result of deallocation. A seperate class representing a btree would
* alleviate some fragile cases where the implementation must currently
* behave correctly if the 'this' pointer is suddenly invalidated by a
* callee.
*/
template< class V >
class BtreeBucket : public BucketBasics {
friend class BtreeCursor;
friend struct Continuation;
public:
// make compiler happy:
typedef typename V::Key Key;
typedef typename V::KeyOwned KeyOwned;
typedef typename BucketBasics::KeyNode KeyNode;
typedef typename BucketBasics::_KeyNode _KeyNode;
typedef typename BucketBasics::Loc Loc;
const _KeyNode& k(int i) const { return static_cast< const BucketBasics * >(this)->k(i); }
protected:
_KeyNode& k(int i) { return static_cast< BucketBasics * >(this)->_k(i); }
public:
const KeyNode keyNode(int i) const { return static_cast< const BucketBasics * >(this)->keyNode(i); }
bool isHead() const { return this->parent.isNull(); }
void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */
bool isUsed( int i ) const { return this->k(i).isUsed(); }
string bucketSummary() const;
void dump(unsigned depth=0) const;
/**
* @return true if key exists in index
*
* @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
* BSONObj order = ((IndexDetails&)idx).keyPattern();
* likewise below in bt_insert() etc.
*/
private:
bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
public:
/**
* @param self - Don't complain about ourself already being in the index case.
* @return true = There is a duplicate used key.
*/
bool wouldCreateDup(
const IndexDetails& idx, const DiskLoc &thisLoc,
const Key& key, const Ordering& order,
const DiskLoc &self) const;
/**
* Preconditions: none
* Postconditions: @return a new bucket allocated from pdfile storage
* and init()-ed. This bucket is suitable to for use as a new root
* or any other new node in the tree.
*/
static DiskLoc addBucket(const IndexDetails&);
/**
* Preconditions: none
* Postconditions:
* - Some header values in this bucket are cleared, and the bucket is
* deallocated from pdfile storage.
* - The memory at thisLoc is invalidated, and 'this' is invalidated.
*/
void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
/**
* Preconditions:
* - 'key' has a valid schema for this index.
* - All other paramenters are valid and consistent with this index if applicable.
* Postconditions:
* - If key is bigger than KeyMax, @return 2 or 3 and no change.
* - If key / recordLoc exist in the btree as an unused key, set them
* as used and @return 0
* - If key / recordLoc exist in the btree as a used key, @throw
* exception 10287 and no change.
* - If key / recordLoc do not exist in the btree, they are inserted
* and @return 0. The root of the btree may be changed, so
* 'this'/thisLoc may no longer be the root upon return.
*/
int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
const BSONObj& key, const Ordering &order, bool dupsAllowed,
IndexDetails& idx, bool toplevel = true) const;
/** does the insert in two steps - can then use an upgradable lock for step 1, which
is the part which may have page faults. also that step is most of the computational work.
*/
void twoStepInsert(DiskLoc thisLoc, Continuation &c, bool dupsAllowed) const;
/**
* Preconditions:
* - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
* Postconditions:
* - If key / recordLoc are in the btree, they are removed (possibly
* by being marked as an unused key), @return true, and potentially
* invalidate 'this' / thisLoc and change the head.
* - If key / recordLoc are not in the btree, @return false and do nothing.
*/
bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
/**
* locate may return an "unused" key that is just a marker. so be careful.
* looks for a key:recordloc pair.
*
* @found - returns true if exact match found. note you can get back a position
* result even if found is false.
*/
DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
/**
* find the first instance of the key
* does not handle dups
* WARNING: findSingle may not be compound index safe. this may need to change. see notes in
* findSingle code.
* @return the record location of the first match
*/
DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
/**
* Advance to next or previous key in the index.
* @param direction to advance.
*/
DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
/** Advance in specified direction to the specified key */
void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
/** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */
static void customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) ;
/** @return head of the btree by traversing from current bucket. */
const DiskLoc getHead(const DiskLoc& thisLoc) const;
/** get tree shape */
void shape(stringstream&) const;
static void a_test(IndexDetails&);
static int getKeyMax();
protected:
/**
* Preconditions:
* - 0 <= firstIndex <= n
* - -1 <= lastIndex <= n ( -1 is equivalent to n )
* Postconditions:
* - Any children at indexes firstIndex through lastIndex (inclusive)
* will have their parent pointers set to thisLoc.
*/
void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
/**
* Preconditions:
* - thisLoc is not the btree head.
* - n == 0 is ok
* Postconditions:
* - All cursors pointing to this bucket will be updated.
* - This bucket's parent's child pointer is set to null.
* - This bucket is deallocated from pdfile storage.
* - 'this' and thisLoc are invalidated.
*/
void delBucket(const DiskLoc thisLoc, const IndexDetails&);
/**
* Preconditions: 0 <= p < n
* Postconditions:
* - The key at index p is removed from the btree.
* - 'this' and thisLoc may be invalidated.
* - The tree head may change.
*/
void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
/**
* Preconditions:
* - n == 0 is ok
* Postconditions:
* - If thisLoc is head, or if its body has at least lowWaterMark bytes,
* return false and do nothing.
* - Otherwise, if thisLoc has left or right neighbors, either balance
* or merge with them and return true. Also, 'this' and thisLoc may
* be invalidated and the tree head may change.
*/
bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
/**
* Preconditions:
* - 0 <= leftIndex < n
* - The child at leftIndex or the child at leftIndex + 1 contains
* fewer than lowWaterMark bytes.
* Postconditions:
* - If the child bucket at leftIndex can merge with the child index
* at leftIndex + 1, do nothing and return false.
* - Otherwise, balance keys between the leftIndex child and the
* leftIndex + 1 child, return true, and possibly change the tree head.
*/
bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
/**
* Preconditions:
* - All preconditions of tryBalanceChildren.
* - The leftIndex child and leftIndex + 1 child cannot be merged.
* Postconditions:
* - Keys are moved between the leftIndex child and the leftIndex + 1
* child such that neither child has fewer than lowWaterMark bytes.
* The tree head may change.
*/
void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
/**
* Preconditions:
* - All preconditions of doBalanceChildren
* - The leftIndex and leftIndex + 1 children are packed.
* - The leftIndex + 1 child has fewer than lowWaterMark bytes.
* - split returned by rebalancedSeparatorPos()
* Postconditions:
* - The key in lchild at index split is set as thisLoc's key at index
* leftIndex, which may trigger a split and change the tree head.
* The previous key in thisLoc at index leftIndex and all keys with
* indexes greater than split in lchild are moved to rchild.
*/
void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
BtreeBucket *l, const DiskLoc lchild,
BtreeBucket *r, const DiskLoc rchild,
IndexDetails &id, const Ordering &order );
/**
* Preconditions:
* - All preconditions of doBalanceChildren
* - The leftIndex and leftIndex + 1 children are packed.
* - The leftIndex child has fewer than lowWaterMark bytes.
* - split returned by rebalancedSeparatorPos()
* Postconditions:
* - The key in rchild at index split - l->n - 1 is set as thisLoc's key
* at index leftIndex, which may trigger a split and change the tree
* head. The previous key in thisLoc at index leftIndex and all keys
* with indexes less than split - l->n - 1 in rchild are moved to
* lchild.
*/
void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
BtreeBucket *l, const DiskLoc lchild,
BtreeBucket *r, const DiskLoc rchild,
IndexDetails &id, const Ordering &order );
/**
* Preconditions:
* - 0 <= leftIndex < n
* - this->canMergeChildren( thisLoc, leftIndex ) == true
* Postconditions:
* - All of the above mentioned keys will be placed in the left child.
* - The tree may be updated recursively, resulting in 'this' and
* thisLoc being invalidated and the tree head being changed.
*/
void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
/**
* Preconditions:
* - n == 0
* - !nextChild.isNull()
* Postconditions:
* - 'this' and thisLoc are deallocated (and invalidated), any cursors
* to them are updated, and the tree head may change.
* - nextChild replaces thisLoc in the btree structure.
*/
void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
/**
* @return true iff the leftIndex and leftIndex + 1 children both exist,
* and if their body sizes when packed and the thisLoc key at leftIndex
* would fit in a single bucket body.
*/
bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
/**
* Preconditions:
* - leftIndex and leftIndex + 1 children are packed
* - leftIndex or leftIndex + 1 child is below lowWaterMark
* @return index of the rebalanced separator; the index value is
* determined as if we had a bucket with body
* <left bucket keys>.push( <old separator> ).concat( <right bucket keys> )
* and called splitPos( 0 ) on it.
*/
int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
/**
* Preconditions: thisLoc has a parent
* @return parent's index of thisLoc.
*/
int indexInParent( const DiskLoc &thisLoc ) const;
public:
/**
* @return a copy of the key stored at slot i, or an empty Key when i
* is at or beyond the number of used slots (i >= n).
*/
Key keyAt(int i) const {
    return ( i < this->n ) ? Key(this->data + k(i).keyDataOfs()) : Key();
}
protected:
/**
* Split this bucket in two as part of inserting key/recordLoc.
* Preconditions:
* - This bucket is packed.
* - Cannot add a key of size KeyMax to this bucket.
* - 0 <= keypos <= n is the position of a new key that will be inserted
* - lchild is equal to the existing child at index keypos.
* Postconditions:
* - The thisLoc bucket is split into two packed buckets, possibly
* invalidating the initial position of keypos, with a split key
* promoted to the parent. The new key key/recordLoc will be inserted
* into one of the split buckets, and lchild/rchild set appropriately.
* Splitting may occur recursively, possibly changing the tree head.
*/
void split(const DiskLoc thisLoc, int keypos,
const DiskLoc recordLoc, const Key& key,
const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
/**
* Insert key/recordLoc at a known position in this bucket.
* Preconditions:
* - 0 <= keypos <= n
* - If key / recordLoc are inserted at position keypos, with provided
* lchild and rchild, the btree ordering requirements will be
* maintained.
* - lchild is equal to the existing child at index keypos.
* - n == 0 is ok.
* Postconditions:
* - The key / recordLoc are inserted at position keypos, and the
* bucket is split if necessary, which may change the tree head.
* - The bucket may be packed or split, invalidating the specified value
* of keypos.
* This function will always modify thisLoc, but it's marked const because
* it commonly relies on the specialized write intent mechanism of basicInsert().
*/
void insertHere(const DiskLoc thisLoc, int keypos,
const DiskLoc recordLoc, const Key& key, const Ordering &order,
const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
/** bt_insert() is basically just a wrapper around this. */
int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
const Key& key, const Ordering &order, bool dupsAllowed,
const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
void insertStepOne(DiskLoc thisLoc, Continuation& c, bool dupsAllowed) const;
bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ;
static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
/** If child is non null, set its parent to thisLoc */
static void fix(const DiskLoc thisLoc, const DiskLoc child);
/**
* Replace the key at keypos with the specified key/lchild pair.
* Preconditions:
* - 0 <= keypos < n
* - If the specified key and recordLoc are placed in keypos of thisLoc,
* and lchild and rchild are set, the btree ordering properties will
* be maintained.
* - rchild == childForPos( keypos + 1 )
* - childForPos( keypos ) is referenced elsewhere if nonnull.
* Postconditions:
* - The key at keypos will be replaced with the specified key and
* lchild, potentially splitting this bucket and changing the tree
* head.
* - childForPos( keypos ) will be orphaned (no longer referenced from
* this bucket; per the precondition above, some other reference to
* it is expected to exist).
*/
void setInternalKey( const DiskLoc thisLoc, int keypos,
const DiskLoc recordLoc, const Key &key, const Ordering &order,
const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
/**
* Delete an internal (non-leaf) key.
* Preconditions:
* - 0 <= keypos < n
* - The keypos or keypos+1 indexed child is non null.
* Postconditions:
* - The specified key is deleted by replacing it with another key if
* possible. This replacement may cause a split and change the tree
* head. The replacement key will be deleted from its original
* location, potentially causing merges and splits that may invalidate
* 'this' and thisLoc and change the tree head.
* - If the key cannot be replaced, it will be marked as unused. This
* is only expected in legacy btrees.
*/
void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
public:
/** Build and return a human-readable duplicate-key error message string for 'key' in index 'idx'. */
static string dupKeyError( const IndexDetails& idx , const Key& key );
};
#pragma pack()
class FieldRangeVector;
class FieldRangeVectorIterator;
/**
* Cursor over a btree index. Concrete subclasses (chosen per on-disk index
* version) are obtained through the make() factories; the constructors are
* protected for that reason.
*/
class BtreeCursor : public Cursor {
protected:
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
public:
virtual ~BtreeCursor();
/** makes an appropriate subclass depending on the index version */
static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
virtual bool ok() { return !bucket.isNull(); }
virtual bool advance();
virtual void noteLocation(); // updates keyAtKeyOfs...
virtual void checkLocation() = 0;
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return true; }
/**
* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
* if a multikey index traversal:
* if loc has already been sent, returns true.
* otherwise, marks loc as sent.
* @return false if the loc has not been seen
*/
virtual bool getsetdup(DiskLoc loc) {
if( _multikey ) {
// fix: template arguments were stripped here; insert() returns
// pair<iterator, bool> where 'second' is false for an existing element
pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
return !p.second;
}
return false;
}
virtual bool modifiedKeys() const { return _multikey; }
virtual bool isMultiKey() const { return _multikey; }
/*const _KeyNode& _currKeyNode() const {
assert( !bucket.isNull() );
const _KeyNode& kn = keyNode(keyOfs);
assert( kn.isUsed() );
return kn;
}*/
/** returns BSONObj() if ofs is out of range */
virtual BSONObj keyAt(int ofs) const = 0;
virtual BSONObj currKey() const = 0;
virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
virtual void aboutToDeleteBucket(const DiskLoc& b) {
if ( bucket == b )
keyOfs = -1;
}
virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
virtual DiskLoc refLoc() { return currLoc(); }
virtual Record* _current() { return currLoc().rec(); }
virtual BSONObj current() { return BSONObj(_current()); }
virtual string toString();
/** re-attach the index's field names to a stored key and make it printable */
BSONObj prettyKey( const BSONObj &key ) const {
return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
}
virtual BSONObj prettyIndexBounds() const;
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }
virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
virtual long long nscanned() { return _nscanned; }
/** for debugging only */
const DiskLoc getBucket() const { return bucket; }
int getKeyOfs() const { return keyOfs; }
// just for unit tests
virtual bool curKeyHasChild() = 0;
protected:
/**
* Our btrees may (rarely) have "unused" keys when items are deleted.
* Skip past them.
*/
virtual bool skipUnusedKeys() = 0;
bool skipOutOfRangeKeysAndCheckEnd();
void skipAndCheck();
void checkEnd();
/** selective audits on construction */
void audit();
virtual void _audit() = 0;
virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;
/** set initial bucket */
void initWithoutIndependentFieldRanges();
/** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
// record locs already returned, for multikey dedup (fix: element type <DiskLoc> had been stripped)
set<DiskLoc> _dups;
NamespaceDetails * const d;
const int idxNo;
BSONObj startKey;
BSONObj endKey;
bool _endKeyInclusive;
bool _multikey; // this must be updated every getmore batch in case someone added a multikey
const IndexDetails& indexDetails;
const BSONObj _order;
const Ordering _ordering;
DiskLoc bucket;
int keyOfs;
const int _direction; // 1=fwd,-1=reverse
BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
DiskLoc locAtKeyOfs;
const shared_ptr< FieldRangeVector > _bounds;
auto_ptr< FieldRangeVectorIterator > _boundsIterator;
shared_ptr< CoveredIndexMatcher > _matcher;
bool _independentFieldRanges;
long long _nscanned;
};
/**
* State carried between the two phases of a btree insert
* (BtreeBucket::insertStepOne() populates it, stepTwo() applies it).
* NOTE: 'pos' and 'b' are intentionally left uninitialized by the
* constructor; they are only meaningful once step one has set 'op'
* to something other than Nothing.
*/
template< class V >
struct Continuation {
    //Continuation(const typename V::Key & k);
    Continuation(DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key,
                 Ordering _order, IndexDetails& _idx) :
        bLoc(thisLoc), recordLoc(_recordLoc), key(_key), order(_order), idx(_idx) {
        op = Nothing;
    }
    DiskLoc bLoc;             // bucket the operation targets
    DiskLoc recordLoc;        // location of the document being indexed
    typename V::KeyOwned key; // owned copy of the key built from _key
    const Ordering order;
    IndexDetails& idx;
    enum Op { Nothing, SetUsed, InsertHere } op;
    int pos;                  // key slot within *b that 'op' applies to
    const BtreeBucket<V> *b;  // fix: BtreeBucket is a class template, so the <V> argument is required
    /** apply the operation recorded by step one */
    void stepTwo() {
        if( op == Nothing )
            return;
        else if( op == SetUsed ) {
            const typename V::_KeyNode& kn = b->k(pos);
            kn.writing().setUsed();
        }
        else {
            b->insertHere(bLoc, pos, recordLoc, key, order, DiskLoc(), DiskLoc(), idx);
        }
    }
};
/**
* Renames the index namespace for this btree's index.
* @param oldNs the index's current namespace string
* @param newNs the replacement namespace string
*/
void renameIndexNamespace(const char *oldNs, const char *newNs);
/**
* give us a writable version of the btree bucket (declares write intent).
* note it is likely more efficient to declare write intent on something smaller when you can.
* Fix: BtreeBucket is a class template, so the <V> template argument is
* required on the return type, the casts, and the btree<V>() call (the
* arguments had been stripped, leaving ill-formed code).
*/
template< class V >
BtreeBucket<V> * DiskLoc::btreemod() const {
    assert( _a != -1 );
    BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() );
    return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
}
/**
* Construct a KeyNode view of the packed _KeyNode 'k' stored in bucket 'bb':
* copies the child/record locs and resolves the key's data offset into an
* absolute pointer within the bucket's data area.
* Fix: an out-of-class member definition of a class template must name the
* template arguments (BucketBasics<V>); they had been stripped.
*/
template< class V >
BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
    prevChildBucket(k.prevChildBucket),
    recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
{ }
} // namespace mongo;