/**
* Copyright (C) 2014 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include
#include "mongo/db/catalog/head_manager.h"
#include "mongo/db/catalog/index_catalog_entry.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/storage/index_entry_comparison.h"
#include "mongo/db/storage/mmap_v1/btree/btree_ondisk.h"
#include "mongo/db/storage/mmap_v1/btree/key.h"
#include "mongo/db/storage/mmap_v1/diskloc.h"
namespace mongo {
class PseudoRandom;
class RecordStore;
class SavedCursorRegistry;
// Used for unit-testing only
template
class BtreeLogicTestBase;
template
class ArtificialTreeBuilder;
/**
* This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk
* format.
*/
template
class BtreeLogic {
public:
// AKA _keyNode
typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType;
// AKA Key
typedef typename BtreeLayout::KeyType KeyDataType;
// AKA KeyOwned
typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
// AKA Loc
typedef typename BtreeLayout::LocType LocType;
// AKA BucketBasics or BtreeBucket, either one.
typedef typename BtreeLayout::BucketType BucketType;
/**
* 'head' manages the catalog information.
* 'store' allocates and frees buckets.
* 'ordering' is meta-information we store in the catalog.
* 'indexName' is a string identifying the index that we use to print errors with.
*/
BtreeLogic(HeadManager* head,
RecordStore* store,
SavedCursorRegistry* cursors,
const Ordering& ordering,
const std::string& indexName,
bool isUnique)
: _headManager(head),
_recordStore(store),
_cursorRegistry(cursors),
_ordering(ordering),
_indexName(indexName),
_isUnique(isUnique) {}
//
// Public-facing
//
class Builder {
public:
typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType;
typedef typename BtreeLayout::KeyType KeyDataType;
Status addKey(const BSONObj& key, const DiskLoc& loc);
private:
friend class BtreeLogic;
class SetRightLeafLocChange;
Builder(BtreeLogic* logic, OperationContext* txn, bool dupsAllowed);
/**
* Creates and returns a new empty bucket to the right of leftSib, maintaining the
* internal consistency of the tree. leftSib must be the right-most child of its parent
* or it must be the root.
*/
DiskLoc newBucket(BucketType* leftSib, DiskLoc leftSibLoc);
BucketType* _getModifiableBucket(DiskLoc loc);
BucketType* _getBucket(DiskLoc loc);
// Not owned.
BtreeLogic* _logic;
DiskLoc _rightLeafLoc; // DiskLoc of right-most (highest) leaf bucket.
bool _dupsAllowed;
std::unique_ptr _keyLast;
// Not owned.
OperationContext* _txn;
};
/**
* Caller owns the returned pointer.
* 'this' must outlive the returned pointer.
*/
Builder* newBuilder(OperationContext* txn, bool dupsAllowed);
Status dupKeyCheck(OperationContext* txn, const BSONObj& key, const DiskLoc& loc) const;
Status insert(OperationContext* txn,
const BSONObj& rawKey,
const DiskLoc& value,
bool dupsAllowed);
/**
* Navigates down the tree and locates the bucket and position containing a record with
* the specified combination.
*
* @return true if the exact was found. Otherwise, false and the
* bucketLocOut would contain the bucket containing key which is before or after the
* searched one (dependent on the direction).
*/
bool locate(OperationContext* txn,
const BSONObj& key,
const DiskLoc& recordLoc,
const int direction,
int* posOut,
DiskLoc* bucketLocOut) const;
void advance(OperationContext* txn,
DiskLoc* bucketLocInOut,
int* posInOut,
int direction) const;
bool exists(OperationContext* txn, const KeyDataType& key) const;
bool unindex(OperationContext* txn, const BSONObj& key, const DiskLoc& recordLoc);
bool isEmpty(OperationContext* txn) const;
long long fullValidate(OperationContext*,
long long* unusedCount,
bool strict,
bool dumpBuckets,
unsigned depth) const;
DiskLoc getDiskLoc(OperationContext* txn, const DiskLoc& bucketLoc, const int keyOffset) const;
BSONObj getKey(OperationContext* txn, const DiskLoc& bucketLoc, const int keyOffset) const;
/**
* Returns a pseudo-random element from the tree. It is an error to call this method if the tree
* is empty.
*/
IndexKeyEntry getRandomEntry(OperationContext* txn) const;
DiskLoc getHead(OperationContext* txn) const {
return DiskLoc::fromRecordId(_headManager->getHead(txn));
}
Status touch(OperationContext* txn) const;
//
// Composite key navigation methods
//
void customLocate(OperationContext* txn,
DiskLoc* locInOut,
int* keyOfsInOut,
const IndexSeekPoint& seekPoint,
int direction) const;
void advanceTo(OperationContext*,
DiskLoc* thisLocInOut,
int* keyOfsInOut,
const IndexSeekPoint& seekPoint,
int direction) const;
void restorePosition(OperationContext* txn,
const BSONObj& savedKey,
const DiskLoc& savedLoc,
int direction,
DiskLoc* bucketInOut,
int* keyOffsetInOut) const;
//
// Creation and deletion
//
/**
* Returns OK if the index was uninitialized before, error status otherwise.
*/
Status initAsEmpty(OperationContext* txn);
//
// Size constants
//
const RecordStore* getRecordStore() const {
return _recordStore;
}
SavedCursorRegistry* savedCursors() const {
return _cursorRegistry;
}
static int lowWaterMark();
Ordering ordering() const {
return _ordering;
}
int customBSONCmp(const BSONObj& inIndex_left,
const IndexSeekPoint& seekPoint_right,
int direction) const;
bool isUnique() const {
return _isUnique;
}
private:
friend class BtreeLogic::Builder;
// Used for unit-testing only
friend class BtreeLogicTestBase;
friend class ArtificialTreeBuilder;
/**
* This is an in memory wrapper for the variable length data associated with a
* KeyHeaderType. It points to on-disk data but is not itself on-disk data.
*
* This object and its BSONObj 'key' will become invalid if the KeyHeaderType data that owns
* this it is moved within the btree. In general, a KeyWrapper should not be expected to be
* valid after a write.
*/
struct FullKey {
FullKey(const BucketType* bucket, int i)
: header(getKeyHeader(bucket, i)),
prevChildBucket(header.prevChildBucket),
recordLoc(header.recordLoc),
data(bucket->data + header.keyDataOfs()) {}
// This is actually a reference to something on-disk.
const KeyHeaderType& header;
// These are actually in 'header'.
const LocType& prevChildBucket;
const LocType& recordLoc;
// This is *not* memory-mapped but its members point to something on-disk.
KeyDataType data;
};
//
// Functions that depend on the templated type info but nothing in 'this'.
//
static LocType& childLocForPos(BucketType* bucket, int pos);
static FullKey getFullKey(const BucketType* bucket, int i);
static KeyHeaderType& getKeyHeader(BucketType* bucket, int i);
static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i);
static char* dataAt(BucketType* bucket, short ofs);
static void markUnused(BucketType* bucket, int keypos);
static int totalDataSize(BucketType* bucket);
static void init(BucketType* bucket);
static int _alloc(BucketType* bucket, int bytes);
static void _unalloc(BucketType* bucket, int bytes);
static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false);
static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType* keyDataOut);
static bool mayDropKey(BucketType* bucket, int index, int refPos);
static int _packedDataSize(BucketType* bucket, int refPos);
static void setPacked(BucketType* bucket);
static void setNotPacked(BucketType* bucket);
static BucketType* btreemod(OperationContext* txn, BucketType* bucket);
static int splitPos(BucketType* bucket, int keypos);
static void reserveKeysFront(BucketType* bucket, int nAdd);
static void setKey(BucketType* bucket,
int i,
const DiskLoc recordLoc,
const KeyDataType& key,
const DiskLoc prevChildBucket);
static bool isHead(BucketType* bucket);
static void dumpBucket(const BucketType* bucket, int indentLength = 0);
static void assertValid(const std::string& ns,
BucketType* bucket,
const Ordering& ordering,
bool force = false);
//
// 'this'-specific helpers (require record store, catalog information, or ordering, or type
// information).
//
bool basicInsert(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int& keypos,
const KeyDataType& key,
const DiskLoc recordLoc);
void dropFront(BucketType* bucket, int nDrop, int& refpos);
void _pack(OperationContext* txn, BucketType* bucket, const DiskLoc thisLoc, int& refPos);
void customLocate(OperationContext* txn,
DiskLoc* locInOut,
int* keyOfsInOut,
const IndexSeekPoint& seekPoint,
int direction,
std::pair& bestParent) const;
Status _find(OperationContext* txn,
BucketType* bucket,
const KeyDataType& key,
const DiskLoc& recordLoc,
bool errorIfDup,
int* keyPositionOut,
bool* foundOut) const;
bool customFind(OperationContext* txn,
int low,
int high,
const IndexSeekPoint& seekPoint,
int direction,
DiskLoc* thisLocInOut,
int* keyOfsInOut,
std::pair& bestParent) const;
void advanceToImpl(OperationContext* txn,
DiskLoc* thisLocInOut,
int* keyOfsInOut,
const IndexSeekPoint& seekPoint,
int direction) const;
bool wouldCreateDup(OperationContext* txn, const KeyDataType& key, const DiskLoc self) const;
bool keyIsUsed(OperationContext* txn, const DiskLoc& loc, const int& pos) const;
void skipUnusedKeys(OperationContext* txn, DiskLoc* loc, int* pos, int direction) const;
DiskLoc advance(OperationContext* txn,
const DiskLoc& bucketLoc,
int* posInOut,
int direction) const;
DiskLoc _locate(OperationContext* txn,
const DiskLoc& bucketLoc,
const KeyDataType& key,
int* posOut,
bool* foundOut,
const DiskLoc& recordLoc,
const int direction) const;
long long _fullValidate(OperationContext* txn,
const DiskLoc bucketLoc,
long long* unusedCount,
bool strict,
bool dumpBuckets,
unsigned depth) const;
DiskLoc _addBucket(OperationContext* txn);
bool canMergeChildren(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
const int leftIndex);
// has to look in children of 'bucket' and requires record store
int _rebalancedSeparatorPos(OperationContext* txn, BucketType* bucket, int leftIndex);
void _packReadyForMod(BucketType* bucket, int& refPos);
void truncateTo(BucketType* bucket, int N, int& refPos);
void split(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int keypos,
const DiskLoc recordLoc,
const KeyDataType& key,
const DiskLoc lchild,
const DiskLoc rchild);
Status _insert(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
const KeyDataType& key,
const DiskLoc recordLoc,
bool dupsAllowed,
const DiskLoc leftChild,
const DiskLoc rightChild);
// TODO take a BucketType*?
void insertHere(OperationContext* txn,
const DiskLoc bucketLoc,
int pos,
const KeyDataType& key,
const DiskLoc recordLoc,
const DiskLoc leftChild,
const DiskLoc rightChild);
std::string dupKeyError(const KeyDataType& key) const;
void setInternalKey(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int keypos,
const DiskLoc recordLoc,
const KeyDataType& key,
const DiskLoc lchild,
const DiskLoc rchild);
void fixParentPtrs(OperationContext* trans,
BucketType* bucket,
const DiskLoc bucketLoc,
int firstIndex = 0,
int lastIndex = -1);
bool mayBalanceWithNeighbors(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc);
void doBalanceChildren(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int leftIndex);
void doBalanceLeftToRight(OperationContext* txn,
BucketType* bucket,
const DiskLoc thisLoc,
int leftIndex,
int split,
BucketType* l,
const DiskLoc lchild,
BucketType* r,
const DiskLoc rchild);
void doBalanceRightToLeft(OperationContext* txn,
BucketType* bucket,
const DiskLoc thisLoc,
int leftIndex,
int split,
BucketType* l,
const DiskLoc lchild,
BucketType* r,
const DiskLoc rchild);
bool tryBalanceChildren(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int leftIndex);
int indexInParent(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc) const;
void doMergeChildren(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int leftIndex);
void replaceWithNextChild(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
void deleteInternalKey(OperationContext* txn,
BucketType* bucket,
const DiskLoc bucketLoc,
int keypos);
void delKeyAtPos(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc, int p);
void delBucket(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
void deallocBucket(OperationContext* txn, BucketType* bucket, const DiskLoc bucketLoc);
bool _keyIsAt(const BSONObj& savedKey,
const DiskLoc& savedLoc,
BucketType* bucket,
int keyPos) const;
/**
* Tries to push key into bucket. Return false if it can't because key doesn't fit.
*
* bucket must be declared as writable by the caller.
* The new key/recordLoc pair must be higher than any others in bucket.
*
* TODO needs 'this' for _ordering for sanity check
*/
bool pushBack(BucketType* bucket,
const DiskLoc recordLoc,
const KeyDataType& key,
const DiskLoc prevChild);
BucketType* childForPos(OperationContext* txn, BucketType* bucket, int pos) const;
BucketType* getBucket(OperationContext* txn, const DiskLoc dl) const {
return getBucket(txn, dl.toRecordId());
}
BucketType* getBucket(OperationContext* txn, const RecordId dl) const;
BucketType* getRoot(OperationContext* txn) const;
DiskLoc getRootLoc(OperationContext* txn) const;
void recordRandomWalk(OperationContext* txn,
PseudoRandom* prng,
BucketType* curBucket,
int64_t nBucketsInCurrentLevel,
std::vector* nKeysInLevel,
std::vector* selectedKeys) const;
//
// Data
//
// Not owned here.
HeadManager* _headManager;
// Not owned here.
RecordStore* _recordStore;
// Not owned Here.
SavedCursorRegistry* _cursorRegistry;
Ordering _ordering;
std::string _indexName;
// True if this is a unique index, i.e. if duplicate key values are disallowed.
const bool _isUnique;
};
} // namespace mongo