diff options
-rw-r--r-- | docs/errors.md | 16
-rw-r--r-- | src/mongo/SConscript | 1
-rw-r--r-- | src/mongo/db/diskloc.h | 7
-rw-r--r-- | src/mongo/db/index/btree_based_access_method.cpp | 6
-rw-r--r-- | src/mongo/db/index/btree_index_cursor.h | 2
-rw-r--r-- | src/mongo/db/mongod.vcxproj | 2
-rw-r--r-- | src/mongo/db/mongod.vcxproj.filters | 6
-rw-r--r-- | src/mongo/db/structure/btree/btree.cpp | 1993
-rw-r--r-- | src/mongo/db/structure/btree/btree.h | 1094
-rw-r--r-- | src/mongo/dbtests/indexupdatetests.cpp | 1
-rw-r--r-- | src/mongo/dbtests/test.vcxproj | 2
-rwxr-xr-x | src/mongo/dbtests/test.vcxproj.filters | 6
12 files changed, 4 insertions, 3132 deletions
diff --git a/docs/errors.md b/docs/errors.md
index 3f107a218f2..bd75626f5d4 100644
--- a/docs/errors.md
+++ b/docs/errors.md
@@ -189,22 +189,6 @@ src/mongo/client/syncclusterconnection.cpp
* 8020 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/client/syncclusterconnection.cpp#L283) SyncClusterConnection::remove prepare failed:


-src/mongo/db/btree.cpp
-----
-* 10281 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L143) assert is misdefined
-* 10282 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L324) n==0 in btree popBack()
-* 10283 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L331) rchild not null in btree popBack()
-* 10285 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L1771) _insert: reuse key but lchild is not null
-* 10286 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L1772) _insert: reuse key but rchild is not null
-* 10287 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L84) btree: key+recloc already in index
-* 15898 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.cpp#L42) error in index possibly corruption consider repairing
-
-
-src/mongo/db/btree.h
-----
-* 13000 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btree.h#L361) invalid keyNode: " + BSON( "i" << i << "n
-
-
src/mongo/db/btreebuilder.cpp
----
* 10288 [code](http://github.com/mongodb/mongo/blob/master/src/mongo/db/btreebuilder.cpp#L76) bad key order in BtreeBuilder - server internal error
diff --git a/src/mongo/SConscript b/src/mongo/SConscript
index 375de4bebde..b61c29a7e61 100644
--- a/src/mongo/SConscript
+++ b/src/mongo/SConscript
@@ -553,7 +553,6 @@ serverOnlyFiles = [ "db/curop.cpp",
    "db/storage/mmap_v1/dur_journal.cpp",
    "db/storage/mmap_v1/dur_transaction.cpp",
    "db/introspect.cpp",
-   "db/structure/btree/btree.cpp",
    "db/structure/btree/btree_interface.cpp",
    "db/structure/btree/btree_logic.cpp",
    "db/structure/btree/btree_stats.cpp",
diff --git a/src/mongo/db/diskloc.h b/src/mongo/db/diskloc.h
index cebb2cc78d9..7ff3786a2e0 100644
--- a/src/mongo/db/diskloc.h
+++ b/src/mongo/db/diskloc.h
@@ -169,13 +169,6 @@
    */
    BSONObj obj() const; // TODO(ERH): remove

-   template< class V >
-   const BtreeBucket<V> * btree() const; // TODO(ERH): remove
-
-   // Explicitly signals we are writing and casts away const
-   template< class V >
-   BtreeBucket<V> * btreemod() const; // TODO(ERH): remove
-
    /// members for Sorter
    struct SorterDeserializeSettings {}; // unused
    void serializeForSorter(BufBuilder& buf) const { buf.appendStruct(*this); }
diff --git a/src/mongo/db/index/btree_based_access_method.cpp b/src/mongo/db/index/btree_based_access_method.cpp
index ebd933cb0ac..092e5f8d17f 100644
--- a/src/mongo/db/index/btree_based_access_method.cpp
+++ b/src/mongo/db/index/btree_based_access_method.cpp
@@ -43,11 +43,14 @@
#include "mongo/db/pdfile_private.h"
#include "mongo/db/repl/is_master.h"
#include "mongo/db/repl/rs.h"
+#include "mongo/db/server_parameters.h"
#include "mongo/db/structure/btree/btree_interface.h"
#include "mongo/util/progress_meter.h"

namespace mongo {

+    MONGO_EXPORT_SERVER_PARAMETER(failIndexKeyTooLong, bool, true);
+
    BtreeBasedAccessMethod::BtreeBasedAccessMethod(IndexCatalogEntry* btreeState)
        : _btreeState(btreeState),
          _descriptor(btreeState->descriptor()) {
@@ -58,9 +61,6 @@ namespace mongo {
                                              _descriptor->version()));
    }

-    // This is currently in btree.cpp.  Once that file is retired it'll move here.
-    extern bool failIndexKeyTooLong;
-
    // Find the keys for obj, put them in the tree pointing to loc
    Status BtreeBasedAccessMethod::insert(const BSONObj& obj,
                                          const DiskLoc& loc,
diff --git a/src/mongo/db/index/btree_index_cursor.h b/src/mongo/db/index/btree_index_cursor.h
index 2f9ab2ae494..e77116a05e3 100644
--- a/src/mongo/db/index/btree_index_cursor.h
+++ b/src/mongo/db/index/btree_index_cursor.h
@@ -46,7 +46,7 @@ namespace mongo {
    bool isEOF() const;

    /**
-     * Called from btree.cpp when we're about to delete a Btree bucket.
+     * Called from btree_logic.cpp when we're about to delete a Btree bucket.
     */
    static void aboutToDeleteBucket(const DiskLoc& bucket);

diff --git a/src/mongo/db/mongod.vcxproj b/src/mongo/db/mongod.vcxproj
index 4a642721e99..516ea3485f3 100644
--- a/src/mongo/db/mongod.vcxproj
+++ b/src/mongo/db/mongod.vcxproj
@@ -4587,7 +4586,6 @@ cscript //Nologo "$(ProjectDir)..\shell\createCPPfromJavaScriptFiles.js" "$(Proj
<ClCompile Include="stats\counters.cpp" />
<ClCompile Include="stats\snapshots.cpp" />
<ClCompile Include="stats\top.cpp" />
- <ClCompile Include="btree.cpp" />
<ClCompile Include="btreecursor.cpp" />
<ClCompile Include="repl\health.cpp" />
<ClCompile Include="repl\rs.cpp" />
@@ -4890,7 +4889,6 @@ cscript //Nologo "$(ProjectDir)..\shell\createCPPfromJavaScriptFiles.js" "$(Proj
<ClInclude Include="..\scripting\engine_v8.h" />
<ClInclude Include="..\scripting\v8_db.h" />
<ClInclude Include="..\scripting\v8_utils.h" />
- <ClInclude Include="btree.h" />
<ClInclude Include="repl\health.h" />
<ClInclude Include="repl\rs.h" />
<ClInclude Include="repl\rs_config.h" />
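Editor's note: the btree_based_access_method.cpp hunks above promote failIndexKeyTooLong from an extern bool (formerly defined in the deleted btree.cpp) to a MONGO_EXPORT_SERVER_PARAMETER. The check it used to feed, keyTooLongAssert() in the deleted btree.cpp shown further down this page, raised the "key too large" error only on a primary and only when the parameter was enabled. Below is a minimal, self-contained C++ sketch of that control flow for readers following the diff; it is not MongoDB source. isPrimary, the exception, and the numeric code are stand-ins for isMaster(NULL), uasserted(), and the real assertion id.

```cpp
// Standalone sketch (assumptions noted above): mirrors the gating that
// keyTooLongAssert() performed in the deleted btree.cpp.
#include <iostream>
#include <stdexcept>
#include <string>

namespace sketch {

    // Stand-in for MONGO_EXPORT_SERVER_PARAMETER(failIndexKeyTooLong, bool, true);
    bool failIndexKeyTooLong = true;

    // Stand-in for the replication-state check isMaster(NULL).
    bool isPrimary = true;

    // Secondaries keep the old lenient behavior; primaries reject the key
    // only when the server parameter is enabled.
    void keyTooLongAssert(int code, const std::string& msg) {
        if (!isPrimary) {
            return;                       // secondary: old behavior, no error
        }
        if (!failIndexKeyTooLong) {
            return;                       // operator disabled the new error
        }
        // Stand-in for uasserted(code, msg).
        throw std::runtime_error(msg + " (code " + std::to_string(code) + ")");
    }

}  // namespace sketch

int main() {
    try {
        sketch::keyTooLongAssert(12345, "key too large to index");  // illustrative code
    } catch (const std::exception& e) {
        std::cout << "rejected: " << e.what() << '\n';
    }

    sketch::failIndexKeyTooLong = false;   // in a real server this would be setParameter
    sketch::keyTooLongAssert(12345, "key too large to index");      // now a no-op
    std::cout << "accepted with failIndexKeyTooLong=false\n";
    return 0;
}
```

In a deployment the flag would be flipped via setParameter rather than by direct assignment, but the branch structure is the same as in the deleted code.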
diff --git a/src/mongo/db/mongod.vcxproj.filters b/src/mongo/db/mongod.vcxproj.filters
index e7d7c998e19..c1445e3635e 100644
--- a/src/mongo/db/mongod.vcxproj.filters
+++ b/src/mongo/db/mongod.vcxproj.filters
@@ -485,9 +485,6 @@
<ClCompile Include="..\util\util.cpp">
<Filter>util\Source Files</Filter>
</ClCompile>
- <ClCompile Include="btree.cpp">
- <Filter>db\Source Files\a to d</Filter>
- </ClCompile>
<ClCompile Include="durop.cpp">
<Filter>db\Source Files\a to d</Filter>
</ClCompile>
@@ -2786,9 +2783,6 @@
<ClInclude Include="durop.h">
<Filter>db\Header Files\a to d</Filter>
</ClInclude>
- <ClInclude Include="btree.h">
- <Filter>db\Header Files\a to d</Filter>
- </ClInclude>
<ClInclude Include="client.h">
<Filter>db\Header Files\a to d</Filter>
</ClInclude>
diff --git a/src/mongo/db/structure/btree/btree.cpp b/src/mongo/db/structure/btree/btree.cpp deleted file mode 100644 index 4537f73d8d9..00000000000 --- a/src/mongo/db/structure/btree/btree.cpp +++ /dev/null @@ -1,1993 +0,0 @@ -// btree.cpp - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/pch.h" - -#include "mongo/db/structure/btree/btree.h" - -#include "mongo/db/structure/btree/btree_stats.h" -#include "mongo/db/client.h" -#include "mongo/db/clientcursor.h" -#include "mongo/db/curop-inl.h" -#include "mongo/db/db.h" -#include "mongo/db/dbhelpers.h" -#include "mongo/db/storage/mmap_v1/dur_commitjob.h" -#include "mongo/db/index/btree_index_cursor.h" // for aboutToDeleteBucket -#include "mongo/db/json.h" -#include "mongo/db/kill_current_op.h" -#include "mongo/db/pdfile.h" -#include "mongo/db/repl/is_master.h" -#include "mongo/db/server_parameters.h" -#include "mongo/db/stats/counters.h" -#include "mongo/server.h" -#include "mongo/util/startup_test.h" - -namespace mongo { - - BOOST_STATIC_ASSERT( Record::HeaderSize == 16 ); - BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 ); - - MONGO_EXPORT_SERVER_PARAMETER(failIndexKeyTooLong, bool, true); - - - void keyTooLongAssert( int code, const string& msg ) { - - if ( !isMaster( NULL ) ) { - // on secondaries, we do old behavior - return; - } - - if ( !failIndexKeyTooLong ) { - // user doesn't want this - return; - } - - uasserted( code, msg ); - } - - NOINLINE_DECL void checkFailed(unsigned line) { - static time_t last; - if( time(0) - last >= 10 ) { - msgasserted(15898, str::stream() << "error in index; possibly corruption. " - "See http://dochub.mongodb.org/core/data-recovery" << line); - } - } - - /** data check. like assert, but gives a reasonable error message to the user. 
*/ -#define check(expr) if(!(expr) ) { checkFailed(__LINE__); } - -#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this ); - - template< class Loc > - __KeyNode<Loc> & __KeyNode<Loc>::writing() const { - return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) ); - } - - // BucketBasics::lowWaterMark() - // - // We define this value as the maximum number of bytes such that, if we have - // fewer than this many bytes, we must be able to either merge with or receive - // keys from any neighboring node. If our utilization goes below this value we - // know we can bring up the utilization with a simple operation. Ignoring the - // 90/10 split policy which is sometimes employed and our 'unused' nodes, this - // is a lower bound on bucket utilization for non root buckets. - // - // Note that the exact value here depends on the implementation of - // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as - // follows: We know we cannot merge with the neighbor, so the total data size - // for us, the neighbor, and the separator must be at least - // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any - // allowed size, so our size plus storage for that additional key must be - // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a - // new bucket data size < half the total data size and by the implementation - // of rebalancedSeparatorPos() the key must be added. - - static const int split_debug = 0; - static const int insert_debug = 0; - - /** - * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly - * for the 10287 error code. - */ - static void alreadyInIndex() { - // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord() - throw MsgAssertionException(10287, "btree: key+recloc already in index"); - } - - /* BucketBasics --------------------------------------------------- */ - - template< class V > - void BucketBasics<V>::assertWritable() { - if (storageGlobalParams.dur) - dur::assertAlreadyDeclared(this, V::BucketSize); - } - - template< class V > - string BtreeBucket<V>::bucketSummary() const { - stringstream ss; - ss << " Bucket info:" << endl; - ss << " n: " << this->n << endl; - ss << " parent: " << this->parent.toString() << endl; - ss << " nextChild: " << this->nextChild.toString() << endl; - ss << " flags:" << this->flags << endl; - ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl; - return ss.str(); - } - - template< class V > - int BucketBasics<V>::Size() const { - return V::BucketSize; - } - - template< class V > - void BucketBasics<V>::_shape(int level, stringstream& ss) const { - for ( int i = 0; i < level; i++ ) ss << ' '; - ss << "*[" << this->n << "]\n"; - for ( int i = 0; i < this->n; i++ ) { - if ( !k(i).prevChildBucket.isNull() ) { - DiskLoc ll = k(i).prevChildBucket; - ll.btree<V>()->_shape(level+1,ss); - } - } - if ( !this->nextChild.isNull() ) { - DiskLoc ll = this->nextChild; - ll.btree<V>()->_shape(level+1,ss); - } - } - - int bt_fv=0; - int bt_dmp=0; - - template< class V > - void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { - bt_dmp=1; - fullValidate(thisLoc, order); - bt_dmp=0; - } - - template< class V > - long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const { - killCurrentOp.checkForInterrupt(); - this->assertValid(order, true); - - if ( bt_dmp ) { 
- log() << thisLoc.toString() << ' '; - ((BtreeBucket *) this)->dump(depth); - } - - // keycount - long long kc = 0; - - for ( int i = 0; i < this->n; i++ ) { - const _KeyNode& kn = this->k(i); - - if ( kn.isUsed() ) { - kc++; - } - else { - if ( unusedCount ) { - ++( *unusedCount ); - } - } - if ( !kn.prevChildBucket.isNull() ) { - DiskLoc left = kn.prevChildBucket; - const BtreeBucket *b = left.btree<V>(); - if ( strict ) { - verify( b->parent == thisLoc ); - } - else { - wassert( b->parent == thisLoc ); - } - kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1); - } - } - if ( !this->nextChild.isNull() ) { - DiskLoc ll = this->nextChild; - const BtreeBucket *b = ll.btree<V>(); - if ( strict ) { - verify( b->parent == thisLoc ); - } - else { - wassert( b->parent == thisLoc ); - } - kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1); - } - - return kc; - } - - int nDumped = 0; - - template< class V > - void BucketBasics<V>::assertValid(const Ordering &order, bool force) const { - if ( !debug && !force ) - return; - { - int foo = this->n; - wassert( foo >= 0 && this->n < Size() ); - foo = this->emptySize; - wassert( foo >= 0 && this->emptySize < V::BucketSize ); - wassert( this->topSize >= this->n && this->topSize <= V::BucketSize ); - } - - // this is very slow so don't do often - { - static int _k; - if( ++_k % 128 ) - return; - } - - DEV { - // slow: - for ( int i = 0; i < this->n-1; i++ ) { - Key k1 = keyNode(i).key; - Key k2 = keyNode(i+1).key; - int z = k1.woCompare(k2, order); //OK - if ( z > 0 ) { - out() << "ERROR: btree key order corrupt. Keys:" << endl; - if ( ++nDumped < 5 ) { - for ( int j = 0; j < this->n; j++ ) { - out() << " " << keyNode(j).key.toString() << endl; - } - ((BtreeBucket<V> *) this)->dump(); - } - wassert(false); - break; - } - else if ( z == 0 ) { - if ( !(k(i).recordLoc < k(i+1).recordLoc) ) { - out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl; - out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl; - out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl; - wassert( k(i).recordLoc < k(i+1).recordLoc ); - } - } - } - } - else { - //faster: - if ( this->n > 1 ) { - Key k1 = keyNode(0).key; - Key k2 = keyNode(this->n-1).key; - int z = k1.woCompare(k2, order); - //wassert( z <= 0 ); - if ( z > 0 ) { - problem() << "btree keys out of order" << '\n'; - ONCE { - ((BtreeBucket<V> *) this)->dump(); - } - verify(false); - } - } - } - } - - template< class V > - inline void BucketBasics<V>::markUnused(int keypos) { - verify( keypos >= 0 && keypos < this->n ); - k(keypos).setUnused(); - } - - template< class V > - inline int BucketBasics<V>::totalDataSize() const { - return (int) (Size() - (this->data-(char*)this)); - } - - template< class V > - void BucketBasics<V>::init() { - this->_init(); - this->parent.Null(); - this->nextChild.Null(); - this->flags = Packed; - this->n = 0; - this->emptySize = totalDataSize(); - this->topSize = 0; - } - - /** see _alloc */ - template< class V > - inline void BucketBasics<V>::_unalloc(int bytes) { - this->topSize -= bytes; - this->emptySize += bytes; - } - - /** - * we allocate space from the end of the buffer for data. - * the keynodes grow from the front. 
- */ - template< class V > - inline int BucketBasics<V>::_alloc(int bytes) { - verify( this->emptySize >= bytes ); - this->topSize += bytes; - this->emptySize -= bytes; - int ofs = totalDataSize() - this->topSize; - verify( ofs > 0 ); - return ofs; - } - - template< class V > - void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) { - // TODO This should be keypos < n - verify( keypos >= 0 && keypos <= this->n ); - verify( childForPos(keypos).isNull() ); - // TODO audit cases where nextChild is null - verify( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() ); - this->emptySize += sizeof(_KeyNode); - this->n--; - for ( int j = keypos; j < this->n; j++ ) - k(j) = k(j+1); - setNotPacked(); - } - - /** - * pull rightmost key from the bucket. this version requires its right child to be null so it - * does not bother returning that value. - */ - template< class V > - void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) { - massert( 10282 , "n==0 in btree popBack()", this->n > 0 ); - verify( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that - KeyNode kn = keyNode(this->n-1); - recLoc = kn.recordLoc; - key.assign(kn.key); - int keysize = kn.key.dataSize(); - - massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull()); - - // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. - this->nextChild = kn.prevChildBucket; - - this->n--; - // This is risky because the key we are returning points to this unalloc'ed memory, - // and we are assuming that the last key points to the last allocated - // bson region. - this->emptySize += sizeof(_KeyNode); - _unalloc(keysize); - } - - /** add a key. must be > all existing. be careful to set next ptr right. */ - template< class V > - bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) { - int bytesNeeded = key.dataSize() + sizeof(_KeyNode); - if ( bytesNeeded > this->emptySize ) - return false; - verify( bytesNeeded <= this->emptySize ); - if( this->n ) { - const KeyNode klast = keyNode(this->n-1); - if( klast.key.woCompare(key, order) > 0 ) { - log() << "btree bucket corrupt? consider reindexing or running validate command" << endl; - log() << " klast: " << keyNode(this->n-1).key.toString() << endl; - log() << " key: " << key.toString() << endl; - DEV klast.key.woCompare(key, order); - verify(false); - } - } - this->emptySize -= sizeof(_KeyNode); - _KeyNode& kn = k(this->n++); - kn.prevChildBucket = prevChild; - kn.recordLoc = recordLoc; - kn.setKeyDataOfs( (short) _alloc(key.dataSize()) ); - short ofs = kn.keyDataOfs(); - char *p = dataAt(ofs); - memcpy(p, key.data(), key.dataSize()); - - return true; - } - - /* durability note - we do separate intent declarations herein. arguably one could just declare - the whole bucket given we do group commits. this is something we could investigate - later as to what is faster under what situations. - */ - /** insert a key in a bucket with no complexity -- no splits required - @return false if a split is required. 
- */ - template< class V > - bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const { - check( this->n < 1024 ); - check( keypos >= 0 && keypos <= this->n ); - int bytesNeeded = key.dataSize() + sizeof(_KeyNode); - if ( bytesNeeded > this->emptySize ) { - _pack(thisLoc, order, keypos); - if ( bytesNeeded > this->emptySize ) - return false; - } - - BucketBasics *b; - { - const char *p = (const char *) &k(keypos); - const char *q = (const char *) &k(this->n+1); - // declare that we will write to [k(keypos),k(n)] - // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so - // we can log a very small amount. - b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p); - - // e.g. n==3, keypos==2 - // 1 4 9 - // -> - // 1 4 _ 9 - for ( int j = this->n; j > keypos; j-- ) // make room - b->k(j) = b->k(j-1); - } - - getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n)); - b->emptySize -= sizeof(_KeyNode); - b->n++; - - // This _KeyNode was marked for writing above. - _KeyNode& kn = b->k(keypos); - kn.prevChildBucket.Null(); - kn.recordLoc = recordLoc; - kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) ); - char *p = b->dataAt(kn.keyDataOfs()); - getDur().declareWriteIntent(p, key.dataSize()); - memcpy(p, key.data(), key.dataSize()); - return true; - } - - /** - * With this implementation, refPos == 0 disregards effect of refPos. - * index > 0 prevents creation of an empty bucket. - */ - template< class V > - bool BucketBasics<V>::mayDropKey( int index, int refPos ) const { - return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull(); - } - - template< class V > - int BucketBasics<V>::packedDataSize( int refPos ) const { - if ( this->flags & Packed ) { - return V::BucketSize - this->emptySize - headerSize(); - } - int size = 0; - for( int j = 0; j < this->n; ++j ) { - if ( mayDropKey( j, refPos ) ) { - continue; - } - size += keyNode( j ).key.dataSize() + sizeof( _KeyNode ); - } - return size; - } - - /** - * when we delete things we just leave empty space until the node is - * full and then we repack it. - */ - template< class V > - void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { - if ( this->flags & Packed ) - return; - - VERIFYTHISLOC - - /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl. - an empirical approach is probably best than just adding new code : perhaps the bucket would need - declaration anyway within the group commit interval, in which case we would just be adding - code and complexity without benefit. 
- */ - thisLoc.btreemod<V>()->_packReadyForMod(order, refPos); - } - - /** version when write intent already declared */ - template< class V > - void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) { - assertWritable(); - - if ( this->flags & Packed ) - return; - - int tdz = totalDataSize(); - char temp[V::BucketSize]; - int ofs = tdz; - this->topSize = 0; - int i = 0; - for ( int j = 0; j < this->n; j++ ) { - if( mayDropKey( j, refPos ) ) { - continue; // key is unused and has no children - drop it - } - if( i != j ) { - if ( refPos == j ) { - refPos = i; // i < j so j will never be refPos again - } - k( i ) = k( j ); - } - short ofsold = k(i).keyDataOfs(); - int sz = keyNode(i).key.dataSize(); - ofs -= sz; - this->topSize += sz; - memcpy(temp+ofs, dataAt(ofsold), sz); - k(i).setKeyDataOfsSavingUse( ofs ); - ++i; - } - if ( refPos == this->n ) { - refPos = i; - } - this->n = i; - int dataUsed = tdz - ofs; - memcpy(this->data + ofs, temp + ofs, dataUsed); - - // assertWritable(); - // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this)); - - this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode); - { - int foo = this->emptySize; - verify( foo >= 0 ); - } - - setPacked(); - - assertValid( order ); - } - - template< class V > - inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) { - verify( Lock::somethingWriteLocked() ); - assertWritable(); - this->n = N; - setNotPacked(); - _packReadyForMod( order, refPos ); - } - - /** - * In the standard btree algorithm, we would split based on the - * existing keys _and_ the new key. But that's more work to - * implement, so we split the existing keys and then add the new key. - * - * There are several published heuristic algorithms for doing splits, - * but basically what you want are (1) even balancing between the two - * sides and (2) a small split key so the parent can have a larger - * branching factor. - * - * We just have a simple algorithm right now: if a key includes the - * halfway point (or 10% way point) in terms of bytes, split on that key; - * otherwise split on the key immediately to the left of the halfway - * point (or 10% point). - * - * This function is expected to be called on a packed bucket. - */ - template< class V > - int BucketBasics<V>::splitPos( int keypos ) const { - verify( this->n > 2 ); - int split = 0; - int rightSize = 0; - // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split. - // see SERVER-983 - // TODO I think we only want to do the 90% split on the rhs node of the tree. - int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 
10 : 2 ); - for( int i = this->n - 1; i > -1; --i ) { - rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode ); - if ( rightSize > rightSizeLimit ) { - split = i; - break; - } - } - // safeguards - we must not create an empty bucket - if ( split < 1 ) { - split = 1; - } - else if ( split > this->n - 2 ) { - split = this->n - 2; - } - - return split; - } - - template< class V > - void BucketBasics<V>::reserveKeysFront( int nAdd ) { - verify( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); - this->emptySize -= sizeof( _KeyNode ) * nAdd; - for( int i = this->n - 1; i > -1; --i ) { - k( i + nAdd ) = k( i ); - } - this->n += nAdd; - } - - template< class V > - void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) { - _KeyNode &kn = k( i ); - kn.recordLoc = recordLoc; - kn.prevChildBucket = prevChildBucket; - short ofs = (short) _alloc( key.dataSize() ); - kn.setKeyDataOfs( ofs ); - char *p = dataAt( ofs ); - memcpy( p, key.data(), key.dataSize() ); - } - - template< class V > - void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) { - for( int i = nDrop; i < this->n; ++i ) { - k( i - nDrop ) = k( i ); - } - this->n -= nDrop; - setNotPacked(); - _packReadyForMod( order, refpos ); - } - - /* - BtreeBucket --------------------------------------------------- */ - - /** @return largest key in the subtree. */ - template< class V > - void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { - DiskLoc loc = thisLoc; - while ( 1 ) { - const BtreeBucket *b = loc.btree<V>(); - if ( !b->nextChild.isNull() ) { - loc = b->nextChild; - continue; - } - - verify(b->n>0); - largestLoc = loc; - largestKey = b->n-1; - - break; - } - } - - /** - * NOTE Currently the Ordering implementation assumes a compound index will - * not have more keys than an unsigned variable has bits. The same - * assumption is used in the implementation below with respect to the 'mask' - * variable. 
- * - * @param l a regular bsonobj - * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes - * - * see - * jstests/index_check6.js - * https://jira.mongodb.org/browse/SERVER-371 - */ - /* static */ - template< class V > - int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { - BSONObjIterator ll( l ); - BSONObjIterator rr( rBegin ); - vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); - vector< bool >::const_iterator inc = rEndInclusive.begin(); - unsigned mask = 1; - for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { - BSONElement lll = ll.next(); - BSONElement rrr = rr.next(); - ++rr2; - ++inc; - - int x = lll.woCompare( rrr, false ); - if ( o.descending( mask ) ) - x = -x; - if ( x != 0 ) - return x; - } - if ( rSup ) { - return -direction; - } - for( ; ll.more(); mask <<= 1 ) { - BSONElement lll = ll.next(); - BSONElement rrr = **rr2; - ++rr2; - int x = lll.woCompare( rrr, false ); - if ( o.descending( mask ) ) - x = -x; - if ( x != 0 ) - return x; - if ( !*inc ) { - return -direction; - } - ++inc; - } - return 0; - } - - template< class V > - bool BtreeBucket<V>::exists(const IndexCatalogEntry* btreeState, - const DiskLoc &thisLoc, - const Key& key ) const { - int pos; - bool found; - DiskLoc b = locate(btreeState, thisLoc, key, pos, found, minDiskLoc); - - // skip unused keys - while ( 1 ) { - if( b.isNull() ) - break; - const BtreeBucket *bucket = b.btree<V>(); - const _KeyNode& kn = bucket->k(pos); - if ( kn.isUsed() ) - return bucket->keyAt(pos).woEqual(key); - b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists"); - } - return false; - } - - template< class V > - bool BtreeBucket<V>::wouldCreateDup(const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const Key& key, - const DiskLoc& self) const { - int pos; - bool found; - DiskLoc b = locate(btreeState, thisLoc, key, pos, found, minDiskLoc); - - while ( !b.isNull() ) { - // we skip unused keys - Record* record = btreeState->recordStore()->recordFor( b ); - const BtreeBucket *bucket = asVersion( record ); - const _KeyNode& kn = bucket->k(pos); - if ( kn.isUsed() ) { - if( bucket->keyAt(pos).woEqual(key) ) - return kn.recordLoc != self; - break; - } - b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck"); - } - - return false; - } - - template< class V > - string BtreeBucket<V>::dupKeyError( const IndexDescriptor* idx , const Key& key ) { - stringstream ss; - ss << "E11000 duplicate key error "; - ss << "index: " << idx->indexNamespace() << " "; - ss << "dup key: " << key.toString(); - return ss.str(); - } - - /** - * Find a key withing this btree bucket. - * - * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the - * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, - * our performance is still good. - * - * assertIfDup: if the key exists (ignoring the recordLoc), uassert - * - * pos: for existing keys k0...kn-1. - * returns # it goes BEFORE. so key[pos-1] < key < key[pos] - * returns n if it goes after the last existing key. - * note result might be an Unused location! 
- */ - - bool guessIncreasing = false; - template< class V > - bool BtreeBucket<V>::find(const IndexCatalogEntry* btreeState, - const Key& key, - const DiskLoc &rl, - int& pos, - bool assertIfDup) const { - Loc recordLoc; - recordLoc = rl; - globalIndexCounters->btree( reinterpret_cast<const char*>(this) ); - - // binary search for this key - bool dupsChecked = false; - int l=0; - int h=this->n-1; - int m = (l+h)/2; - if( guessIncreasing ) { - m = h; - } - while ( l <= h ) { - KeyNode M = this->keyNode(m); - int x = key.woCompare(M.key, btreeState->ordering()); - if ( x == 0 ) { - if( assertIfDup ) { - if( k(m).isUnused() ) { - // ok that key is there if unused. but we need to check that there aren't other - // entries for the key then. as it is very rare that we get here, we don't put any - // coding effort in here to make this particularly fast - if( !dupsChecked ) { - dupsChecked = true; - DiskLoc head = btreeState->head(); - if( head.btree<V>()->exists(btreeState, head, key) ) { - if( head.btree<V>()->wouldCreateDup(btreeState, head, key, recordLoc) ) - uasserted( ASSERT_ID_DUPKEY , dupKeyError( btreeState->descriptor(), key ) ); - else - alreadyInIndex(); - } - } - } - else { - if( M.recordLoc == recordLoc ) - alreadyInIndex(); - uasserted( ASSERT_ID_DUPKEY, dupKeyError( btreeState->descriptor(), key ) ); - } - } - - // dup keys allowed. use recordLoc as if it is part of the key - Loc unusedRL = M.recordLoc; - unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up - x = recordLoc.compare(unusedRL); - } - if ( x < 0 ) // key < M.key - h = m-1; - else if ( x > 0 ) - l = m+1; - else { - // found it. - pos = m; - return true; - } - m = (l+h)/2; - } - // not found - pos = l; - if ( pos != this->n ) { - Key keyatpos = keyNode(pos).key; - wassert( key.woCompare(keyatpos, btreeState->ordering()) <= 0 ); - if ( pos > 0 ) { - if( !( keyNode(pos-1).key.woCompare(key, btreeState->ordering()) <= 0 ) ) { - DEV { - log() << key.toString() << endl; - log() << keyNode(pos-1).key.toString() << endl; - } - wassert(false); - } - } - } - - return false; - } - - template< class V > - void BtreeBucket<V>::delBucket(IndexCatalogEntry* btreeState, const DiskLoc thisLoc ) { - BtreeIndexCursor::aboutToDeleteBucket(thisLoc); - verify( !isHead() ); - - DiskLoc ll = this->parent; - const BtreeBucket *p = ll.btree<V>(); - int parentIdx = indexInParent( thisLoc ); - p->childForPos( parentIdx ).writing().Null(); - deallocBucket( btreeState, thisLoc ); - } - - template< class V > - void BtreeBucket<V>::deallocBucket(IndexCatalogEntry* btreeState, const DiskLoc thisLoc ) { - // Mark the bucket as deallocated, see SERVER-4575. - this->n = this->INVALID_N_SENTINEL; - // defensive: - this->parent.Null(); - btreeState->recordStore()->deleteRecord( thisLoc ); - } - - /** note: may delete the entire bucket! this invalid upon return sometimes. */ - template< class V > - void BtreeBucket<V>::delKeyAtPos(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int p ) { - verify(this->n>0); - DiskLoc left = this->childForPos(p); - - if ( this->n == 1 ) { - if ( left.isNull() && this->nextChild.isNull() ) { - this->_delKeyAtPos(p); - if ( isHead() ) { - // we don't delete the top bucket ever - } - else { - if ( !mayBalanceWithNeighbors( btreeState, thisLoc ) ) { - // An empty bucket is only allowed as a transient state. If - // there are no neighbors to balance with, we delete ourself. - // This condition is only expected in legacy btrees. 
- delBucket(btreeState, thisLoc); - } - } - return; - } - deleteInternalKey( btreeState, thisLoc, p ); - return; - } - - if ( left.isNull() ) { - this->_delKeyAtPos(p); - mayBalanceWithNeighbors( btreeState, thisLoc ); - } - else { - deleteInternalKey( btreeState, thisLoc, p ); - } - } - - /** - * This function replaces the specified key (k) by either the prev or next - * key in the btree (k'). We require that k have either a left or right - * child. If k has a left child, we set k' to the prev key of k, which must - * be a leaf present in the left child. If k does not have a left child, we - * set k' to the next key of k, which must be a leaf present in the right - * child. When we replace k with k', we copy k' over k (which may cause a - * split) and then remove k' from its original location. Because k' is - * stored in a descendent of k, replacing k by k' will not modify the - * storage location of the original k', and we can easily remove k' from - * its original location. - * - * This function is only needed in cases where k has a left or right child; - * in other cases a simpler key removal implementation is possible. - * - * NOTE on noncompliant BtreeBuilder btrees: - * It is possible (though likely rare) for btrees created by BtreeBuilder to - * have k' that is not a leaf, see SERVER-2732. These cases are handled in - * the same manner as described in the "legacy btree structures" note below. - * - * NOTE on legacy btree structures: - * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by - * marking it as an unused node rather than replacing it with k'. Also, k' - * may be a leaf but marked as an unused node. In such a case we replace - * k by k', preserving the key's unused marking. This function is only - * expected to mark a key as unused when handling a legacy btree. - */ - template< class V > - void BtreeBucket<V>::deleteInternalKey(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - int keypos ) { - DiskLoc lchild = this->childForPos( keypos ); - DiskLoc rchild = this->childForPos( keypos + 1 ); - verify( !lchild.isNull() || !rchild.isNull() ); - int advanceDirection = lchild.isNull() ? 1 : -1; - int advanceKeyOfs = keypos; - DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ ); - // advanceLoc must be a descentant of thisLoc, because thisLoc has a - // child in the proper direction and all descendants of thisLoc must be - // nonempty because they are not the root. 
- - if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() || - !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) { - // only expected with legacy btrees, see note above - this->markUnused( keypos ); - return; - } - - KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs ); - // Because advanceLoc is a descendant of thisLoc, updating thisLoc will - // not affect packing or keys of advanceLoc and kn will be stable - // during the following setInternalKey() - setInternalKey( btreeState, thisLoc, keypos, kn.recordLoc, kn.key, - this->childForPos( keypos ), this->childForPos( keypos + 1 ) ); - advanceLoc.btreemod<V>()->delKeyAtPos( btreeState, advanceLoc, advanceKeyOfs ); - } - -//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>()) -#define BTREE(loc) (loc.template btree<V>()) -//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>()) -#define BTREEMOD(loc) (loc.template btreemod<V>()) - - template< class V > - const BtreeBucket<V>* BtreeBucket<V>::asVersion( Record* rec ) { - return reinterpret_cast< BtreeBucket<V>* >( rec->data() ); - } - - template< class V > - BtreeBucket<V>* BtreeBucket<V>::asVersionMod( Record* rec ) { - BtreeBucket<V>* bucket = const_cast<BtreeBucket<V>* >( asVersion( rec ) ); - return static_cast< BtreeBucket<V>* >( getDur().writingPtr( bucket, V::BucketSize ) ); - } - - template< class V > - void BtreeBucket<V>::replaceWithNextChild( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc ) { - verify( this->n == 0 && !this->nextChild.isNull() ); - if ( this->parent.isNull() ) { - verify( btreeState->head() == thisLoc ); - btreeState->setHead( this->nextChild ); - } - else { - DiskLoc ll = this->parent; - ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild; - } - BTREE(this->nextChild)->parent.writing() = this->parent; - BtreeIndexCursor::aboutToDeleteBucket(thisLoc); - deallocBucket( btreeState, thisLoc ); - } - - template< class V > - bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { - verify( leftIndex >= 0 && leftIndex < this->n ); - DiskLoc leftNodeLoc = this->childForPos( leftIndex ); - DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 ); - if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) { - // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway - return false; - } - int pos = 0; - { - const BtreeBucket *l = leftNodeLoc.btree<V>(); - const BtreeBucket *r = rightNodeLoc.btree<V>(); - if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) { - return false; - } - } - return true; - } - - /** - * This implementation must respect the meaning and value of lowWaterMark. - * Also see comments in splitPos(). - */ - template< class V > - int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { - int split = -1; - int rightSize = 0; - const BtreeBucket *l = BTREE(this->childForPos( leftIndex )); - const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 )); - - int KNS = sizeof( _KeyNode ); - int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2; - // This constraint should be ensured by only calling this function - // if we go below the low water mark. 
- verify( rightSizeLimit < BtreeBucket<V>::bodySize() ); - for( int i = r->n - 1; i > -1; --i ) { - rightSize += r->keyNode( i ).key.dataSize() + KNS; - if ( rightSize > rightSizeLimit ) { - split = l->n + 1 + i; - break; - } - } - if ( split == -1 ) { - rightSize += keyNode( leftIndex ).key.dataSize() + KNS; - if ( rightSize > rightSizeLimit ) { - split = l->n; - } - } - if ( split == -1 ) { - for( int i = l->n - 1; i > -1; --i ) { - rightSize += l->keyNode( i ).key.dataSize() + KNS; - if ( rightSize > rightSizeLimit ) { - split = i; - break; - } - } - } - // safeguards - we must not create an empty bucket - if ( split < 1 ) { - split = 1; - } - else if ( split > l->n + 1 + r->n - 2 ) { - split = l->n + 1 + r->n - 2; - } - - return split; - } - - template< class V > - void BtreeBucket<V>::doMergeChildren( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - int leftIndex ) { - DiskLoc leftNodeLoc = this->childForPos( leftIndex ); - DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 ); - BtreeBucket *l = leftNodeLoc.btreemod<V>(); - BtreeBucket *r = rightNodeLoc.btreemod<V>(); - int pos = 0; - l->_packReadyForMod( btreeState->ordering(), pos ); - r->_packReadyForMod( btreeState->ordering(), pos ); // pack r in case there are droppable keys - - // We know the additional keys below will fit in l because canMergeChildren() - // must be true. - int oldLNum = l->n; - { - // left child's right child becomes old parent key's left child - KeyNode kn = keyNode( leftIndex ); - l->pushBack( kn.recordLoc, kn.key, btreeState->ordering(), l->nextChild ); - } - for( int i = 0; i < r->n; ++i ) { - KeyNode kn = r->keyNode( i ); - l->pushBack( kn.recordLoc, kn.key, btreeState->ordering(), kn.prevChildBucket ); - } - l->nextChild = r->nextChild; - l->fixParentPtrs( leftNodeLoc, oldLNum ); - r->delBucket( btreeState, rightNodeLoc ); - this->childForPos( leftIndex + 1 ) = leftNodeLoc; - this->childForPos( leftIndex ) = DiskLoc(); - this->_delKeyAtPos( leftIndex, true ); - if ( this->n == 0 ) { - // will trash this and thisLoc - // TODO To ensure all leaves are of equal height, we should ensure - // this is only called on the root. - replaceWithNextChild( btreeState, thisLoc ); - } - else { - // balance recursively - maybe we should do this even when n == 0? - mayBalanceWithNeighbors( btreeState, thisLoc ); - } - } - - template< class V > - int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const { - verify( !this->parent.isNull() ); - const BtreeBucket *p = BTREE(this->parent); - if ( p->nextChild == thisLoc ) { - return p->n; - } - else { - for( int i = 0; i < p->n; ++i ) { - if ( p->k( i ).prevChildBucket == thisLoc ) { - return i; - } - } - } - out() << "ERROR: can't find ref to child bucket.\n"; - out() << "child: " << thisLoc << "\n"; - dump(); - out() << "Parent: " << this->parent << "\n"; - p->dump(); - verify(false); - return -1; // just to compile - } - - template< class V > - bool BtreeBucket<V>::tryBalanceChildren( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - int leftIndex ) const { - // If we can merge, then we must merge rather than balance to preserve - // bucket utilization constraints. 
- if ( canMergeChildren( thisLoc, leftIndex ) ) { - return false; - } - thisLoc.btreemod<V>()->doBalanceChildren( btreeState, thisLoc, leftIndex ); - return true; - } - - template< class V > - void BtreeBucket<V>::doBalanceLeftToRight( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int leftIndex, int split, - BtreeBucket *l, const DiskLoc lchild, - BtreeBucket *r, const DiskLoc rchild ) { - // TODO maybe do some audits the same way pushBack() does? - // As a precondition, rchild + the old separator are <= half a body size, - // and lchild is at most completely full. Based on the value of split, - // rchild will get <= half of the total bytes which is at most 75% - // of a full body. So rchild will have room for the following keys: - int rAdd = l->n - split; - r->reserveKeysFront( rAdd ); - for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) { - KeyNode kn = l->keyNode( i ); - r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket ); - } - { - KeyNode kn = keyNode( leftIndex ); - r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child - } - r->fixParentPtrs( rchild, 0, rAdd - 1 ); - { - KeyNode kn = l->keyNode( split ); - l->nextChild = kn.prevChildBucket; - // Because lchild is a descendant of thisLoc, updating thisLoc will - // not affect packing or keys of lchild and kn will be stable - // during the following setInternalKey() - setInternalKey( btreeState, thisLoc, leftIndex, kn.recordLoc, kn.key, lchild, rchild ); - } - int zeropos = 0; - // lchild and rchild cannot be merged, so there must be >0 (actually more) - // keys to the left of split. - l->truncateTo( split, btreeState->ordering(), zeropos ); - } - - template< class V > - void BtreeBucket<V>::doBalanceRightToLeft( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int leftIndex, int split, - BtreeBucket *l, const DiskLoc lchild, - BtreeBucket *r, const DiskLoc rchild ) { - // As a precondition, lchild + the old separator are <= half a body size, - // and rchild is at most completely full. Based on the value of split, - // lchild will get less than half of the total bytes which is at most 75% - // of a full body. So lchild will have room for the following keys: - int lN = l->n; - { - // left child's right child becomes old parent key's left child - KeyNode kn = keyNode( leftIndex ); - l->pushBack( kn.recordLoc, kn.key, btreeState->ordering(), l->nextChild ); - } - for( int i = 0; i < split - lN - 1; ++i ) { - KeyNode kn = r->keyNode( i ); - l->pushBack( kn.recordLoc, kn.key, btreeState->ordering(), kn.prevChildBucket ); - } - { - KeyNode kn = r->keyNode( split - lN - 1 ); - l->nextChild = kn.prevChildBucket; - // Child lN was lchild's old nextChild, and don't need to fix that one. - l->fixParentPtrs( lchild, lN + 1, l->n ); - // Because rchild is a descendant of thisLoc, updating thisLoc will - // not affect packing or keys of rchild and kn will be stable - // during the following setInternalKey() - setInternalKey( btreeState, thisLoc, leftIndex, kn.recordLoc, kn.key, lchild, rchild ); - } - int zeropos = 0; - // lchild and rchild cannot be merged, so there must be >0 (actually more) - // keys to the right of split. 
- r->dropFront( split - lN, btreeState->ordering(), zeropos ); - } - - template< class V > - void BtreeBucket<V>::doBalanceChildren( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - int leftIndex ) { - DiskLoc lchild = this->childForPos( leftIndex ); - DiskLoc rchild = this->childForPos( leftIndex + 1 ); - int zeropos = 0; - BtreeBucket *l = lchild.btreemod<V>(); - l->_packReadyForMod( btreeState->ordering(), zeropos ); - BtreeBucket *r = rchild.btreemod<V>(); - r->_packReadyForMod( btreeState->ordering(), zeropos ); - int split = rebalancedSeparatorPos( thisLoc, leftIndex ); - - // By definition, if we are below the low water mark and cannot merge - // then we must actively balance. - verify( split != l->n ); - if ( split < l->n ) { - doBalanceLeftToRight( btreeState, thisLoc, leftIndex, split, l, lchild, r, rchild ); - } - else { - doBalanceRightToLeft( btreeState, thisLoc, leftIndex, split, l, lchild, r, rchild ); - } - } - - template< class V > - bool BtreeBucket<V>::mayBalanceWithNeighbors(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc ) { - if ( this->parent.isNull() ) { // we are root, there are no neighbors - return false; - } - - if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) { - return false; - } - - const BtreeBucket *p = BTREE(this->parent); - int parentIdx = indexInParent( thisLoc ); - - // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so? - bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() ); - bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() ); - - // Balance if possible on one side - we merge only if absolutely necessary - // to preserve btree bucket utilization constraints since that's a more - // heavy duty operation (especially if we must re-split later). - if ( mayBalanceRight && - p->tryBalanceChildren( btreeState, this->parent, parentIdx) ) { - return true; - } - if ( mayBalanceLeft && - p->tryBalanceChildren( btreeState, this->parent, parentIdx - 1 ) ) { - return true; - } - - BtreeBucket *pm = BTREEMOD(this->parent); - if ( mayBalanceRight ) { - pm->doMergeChildren( btreeState, this->parent, parentIdx ); - return true; - } - else if ( mayBalanceLeft ) { - pm->doMergeChildren( btreeState, this->parent, parentIdx - 1 ); - return true; - } - - return false; - } - - /** remove a key from the index */ - template< class V > - bool BtreeBucket<V>::unindex(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - const BSONObj& key, - const DiskLoc recordLoc) const { - int pos; - bool found; - DiskLoc loc = locate(btreeState, thisLoc, key, pos, found, recordLoc, 1); - if ( found ) { - loc.btreemod<V>()->delKeyAtPos(btreeState, loc, pos); - return true; - } - return false; - } - - template< class V > - inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) { - if ( !child.isNull() ) { - if ( insert_debug ) - out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl; - child.btree<V>()->parent.writing() = thisLoc; - } - } - - /** - * This can cause a lot of additional page writes when we assign buckets to - * different parents. Maybe get rid of parent ptrs? 
- */ - template< class V > - void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { - VERIFYTHISLOC - if ( lastIndex == -1 ) { - lastIndex = this->n; - } - for ( int i = firstIndex; i <= lastIndex; i++ ) { - fix(thisLoc, this->childForPos(i)); - } - } - - template< class V > - void BtreeBucket<V>::setInternalKey( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - int keypos, - const DiskLoc recordLoc, - const Key &key, - const DiskLoc lchild, - const DiskLoc rchild ) { - this->childForPos( keypos ).Null(); - - // This may leave the bucket empty (n == 0) which is ok only as a - // transient state. In the instant case, the implementation of - // insertHere behaves correctly when n == 0 and as a side effect - // increments n. - this->_delKeyAtPos( keypos, true ); - - // Ensure we do not orphan neighbor's old child. - verify( this->childForPos( keypos ) == rchild ); - - // Just set temporarily - required to pass validation in insertHere() - this->childForPos( keypos ) = lchild; - - insertHere(btreeState, thisLoc, keypos, recordLoc, key, lchild, rchild ); - } - - /** - * insert a key in this bucket, splitting if necessary. - * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. - * NOTE this function may free some data, and as a result the value passed for keypos may - * be invalid after calling insertHere() - * - * Some of the write intent signaling below relies on the implementation of - * the optimized write intent code in basicInsert(). - */ - template< class V > - void BtreeBucket<V>::insertHere( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int keypos, - const DiskLoc recordLoc, const Key& key, - const DiskLoc lchild, const DiskLoc rchild ) const { - if ( insert_debug ) - out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' ' - << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; - - if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, btreeState->ordering()) ) { - // If basicInsert() fails, the bucket will be packed as required by split(). 
- thisLoc.btreemod<V>()->split(btreeState, thisLoc, keypos, recordLoc, key, lchild, rchild ); - return; - } - - { - const _KeyNode *_kn = &k(keypos); - _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert() - if ( keypos+1 == this->n ) { // last key - if ( this->nextChild != lchild ) { - out() << "ERROR nextChild != lchild" << endl; - out() << " thisLoc: " << thisLoc.toString() << ' ' << btreeState->descriptor()->indexNamespace() << endl; - out() << " keyPos: " << keypos << " n:" << this->n << endl; - out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl; - out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; - out() << " key: " << key.toString() << endl; - dump(); - verify(false); - } - kn->prevChildBucket = this->nextChild; - verify( kn->prevChildBucket == lchild ); - this->nextChild.writing() = rchild; - if ( !rchild.isNull() ) - BTREE(rchild)->parent.writing() = thisLoc; - } - else { - kn->prevChildBucket = lchild; - if ( k(keypos+1).prevChildBucket != lchild ) { - out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl; - out() << " thisLoc: " << thisLoc.toString() << ' ' << btreeState->descriptor()->indexNamespace() << endl; - out() << " keyPos: " << keypos << " n:" << this->n << endl; - out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl; - out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; - out() << " key: " << key.toString() << endl; - dump(); - verify(false); - } - const Loc *pc = &k(keypos+1).prevChildBucket; - *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert() - if ( !rchild.isNull() ) - rchild.btree<V>()->parent.writing() = thisLoc; - } - return; - } - } - - template< class V > - void BtreeBucket<V>::split(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int keypos, - const DiskLoc recordLoc, const Key& key, - const DiskLoc lchild, const DiskLoc rchild) { - this->assertWritable(); - - if ( split_debug ) - out() << " " << thisLoc.toString() << ".split" << endl; - - int split = this->splitPos( keypos ); - DiskLoc rLoc = addBucket(btreeState); - BtreeBucket *r = rLoc.btreemod<V>(); - if ( split_debug ) - out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << this->n << endl; - for ( int i = split+1; i < this->n; i++ ) { - KeyNode kn = keyNode(i); - r->pushBack(kn.recordLoc, kn.key, btreeState->ordering(), kn.prevChildBucket); - } - r->nextChild = this->nextChild; - r->assertValid( btreeState->ordering() ); - - if ( split_debug ) - out() << " new rLoc:" << rLoc.toString() << endl; - r = 0; - rLoc.btree<V>()->fixParentPtrs(rLoc); - - { - KeyNode splitkey = keyNode(split); - this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r) - if ( split_debug ) { - out() << " splitkey key:" << splitkey.key.toString() << endl; - } - - // Because thisLoc is a descendant of parent, updating parent will - // not affect packing or keys of thisLoc and splitkey will be stable - // during the following: - - // promote splitkey to a parent this->node - if ( this->parent.isNull() ) { - // make a new parent if we were the root - DiskLoc L = addBucket(btreeState); - BtreeBucket *p = L.btreemod<V>(); - p->pushBack(splitkey.recordLoc, splitkey.key, btreeState->ordering(), thisLoc); - p->nextChild = rLoc; - p->assertValid( 
btreeState->ordering() ); - this->parent = L; - btreeState->setHead( L ); - if ( split_debug ) - out() << " we were root, making new root:" << hex << this->parent.getOfs() << dec << endl; - rLoc.btree<V>()->parent.writing() = this->parent; - } - else { - // set this before calling _insert - if it splits it will do fixParent() logic and change the value. - rLoc.btree<V>()->parent.writing() = this->parent; - if ( split_debug ) - out() << " promoting splitkey key " << splitkey.key.toString() << endl; - BTREE(this->parent)->_insert(btreeState, this->parent, splitkey.recordLoc, splitkey.key, /*dupsallowed*/true, thisLoc, rLoc ); - } - } - - int newpos = keypos; - // note this may trash splitkey.key. thus we had to promote it before finishing up here. - this->truncateTo(split, btreeState->ordering(), newpos); - - // add our this->new key, there is room this->now - { - if ( keypos <= split ) { - if ( split_debug ) - out() << " keypos<split, insertHere() the new key" << endl; - insertHere(btreeState, thisLoc, newpos, recordLoc, key, lchild, rchild ); - } - else { - int kp = keypos-split-1; - verify(kp>=0); - BTREE(rLoc)->insertHere(btreeState, rLoc, kp, recordLoc, key, lchild, rchild ); - } - } - - if ( split_debug ) - out() << " split end " << hex << thisLoc.getOfs() << dec << endl; - } - - class DummyDocWriter : public DocWriter { - public: - DummyDocWriter( size_t sz ) : _sz( sz ){} - virtual void writeDocument( char* buf ) const { /* no-op */ } - virtual size_t documentSize() const { return _sz; } - private: - size_t _sz; - }; - - /** start a new index off, empty */ - template< class V > - DiskLoc BtreeBucket<V>::addBucket(IndexCatalogEntry* btreeState) { - DummyDocWriter docWriter( V::BucketSize ); - StatusWith<DiskLoc> loc = btreeState->recordStore()->insertRecord( &docWriter, 0 ); - uassertStatusOK( loc.getStatus() ); - BtreeBucket *b = BTREEMOD(loc.getValue()); - b->init(); - return loc.getValue(); - } - - template< class V > - const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const { - DiskLoc p = thisLoc; - while ( !BTREE(p)->isHead() ) - p = BTREE(p)->parent; - return p; - } - - template< class V > - DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { - if ( keyOfs < 0 || keyOfs >= this->n ) { - out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl; - out() << " thisLoc: " << thisLoc.toString() << endl; - out() << " keyOfs: " << keyOfs << " n:" << this->n << " direction: " << direction << endl; - out() << bucketSummary() << endl; - verify(false); - } - int adj = direction < 0 ? 1 : 0; - int ko = keyOfs + direction; - DiskLoc nextDown = this->childForPos(ko+adj); - if ( !nextDown.isNull() ) { - while ( 1 ) { - keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1; - DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj); - if ( loc.isNull() ) - break; - nextDown = loc; - } - return nextDown; - } - - if ( ko < this->n && ko >= 0 ) { - keyOfs = ko; - return thisLoc; - } - - // end of bucket. traverse back up. 
- DiskLoc childLoc = thisLoc; - DiskLoc ancestor = this->parent; - while ( 1 ) { - if ( ancestor.isNull() ) - break; - const BtreeBucket *an = BTREE(ancestor); - for ( int i = 0; i < an->n; i++ ) { - if ( an->childForPos(i+adj) == childLoc ) { - keyOfs = i; - return ancestor; - } - } - verify( direction<0 || an->nextChild == childLoc ); - // parent exhausted also, keep going up - childLoc = ancestor; - ancestor = an->parent; - } - - return DiskLoc(); - } - - template< class V > - DiskLoc BtreeBucket<V>::locate(const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const BSONObj& key, - int& pos, - bool& found, - const DiskLoc& recordLoc, - int direction) const { - KeyOwned k(key); - return locate(btreeState, thisLoc, k, pos, found, recordLoc, direction); - } - - template< class V > - DiskLoc BtreeBucket<V>::locate(const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const Key& key, - int& pos, - bool& found, - const DiskLoc& recordLoc, - int direction) const { - int p; - found = find(btreeState, key, recordLoc, p, /*assertIfDup*/ false); - if ( found ) { - pos = p; - return thisLoc; - } - - DiskLoc child = this->childForPos(p); - - if ( !child.isNull() ) { - DiskLoc l = BTREE(child)->locate(btreeState, child, key, pos, found, recordLoc, direction); - if ( !l.isNull() ) - return l; - } - - pos = p; - if ( direction < 0 ) - return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc; - else - return pos == this->n ? DiskLoc() /*theend*/ : thisLoc; - } - - template< class V > - bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) { - const BtreeBucket<V> * bucket = BTREE(thisLoc); - while( 1 ) { - if ( l + 1 == h ) { - keyOfs = ( direction > 0 ) ? 
h : l; - DiskLoc next = bucket->k( h ).prevChildBucket; - if ( !next.isNull() ) { - bestParent = make_pair( thisLoc, keyOfs ); - thisLoc = next; - return true; - } - else { - return false; - } - } - int m = l + ( h - l ) / 2; - int cmp = customBSONCmp( bucket->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); - if ( cmp < 0 ) { - l = m; - } - else if ( cmp > 0 ) { - h = m; - } - else { - if ( direction < 0 ) { - l = m; - } - else { - h = m; - } - } - } - } - - /** - * find smallest/biggest value greater-equal/less-equal than specified - * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd - * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient - */ - template< class V > - void BtreeBucket<V>::advanceTo(const IndexCatalogEntry* btreeState, - DiskLoc &thisLoc, - int &keyOfs, - const BSONObj &keyBegin, - int keyBeginLen, - bool afterKey, - const vector<const BSONElement*>& keyEnd, - const vector<bool>& keyEndInclusive, - int direction) const { - - const Ordering& order = btreeState->ordering(); - - int l,h; - bool dontGoUp; - if ( direction > 0 ) { - l = keyOfs; - h = this->n - 1; - dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); - } - else { - l = 0; - h = keyOfs; - dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); - } - pair< DiskLoc, int > bestParent; - if ( dontGoUp ) { - // this comparison result assures h > l - if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { - return; - } - } - else { - // go up parents until rightmost/leftmost node is >=/<= target or at top - while( !BTREE(thisLoc)->parent.isNull() ) { - thisLoc = BTREE(thisLoc)->parent; - if ( direction > 0 ) { - if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { - break; - } - } - else { - if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { - break; - } - } - } - } - customLocate(btreeState, thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, direction, bestParent ); - } - - /** @param thisLoc in/out param. perhaps thisLoc isn't the best name given that. - Ut is used by advanceTo, which skips - from one key to another key without necessarily checking all the keys - between them in the btree (it can skip to different btree buckets). - The advanceTo function can get called a lot, and for different targets - we want to advance to, don't want to create a bson obj in a new - buffer each time we call that function. The - customLocate function necessary for advanceTo, and does the same thing - as normal locate function but takes basically the same arguments - as advanceTo. 
- */ - template< class V > - void BtreeBucket<V>::customLocate(const IndexCatalogEntry* btreeState, - DiskLoc& locInOut, - int& keyOfs, - const BSONObj& keyBegin, - int keyBeginLen, bool afterKey, - const vector<const BSONElement*>& keyEnd, - const vector<bool>& keyEndInclusive, - int direction, - pair<DiskLoc, int>& bestParent) { - const Ordering& order = btreeState->ordering(); - - dassert( direction == 1 || direction == -1 ); - const BtreeBucket<V> *bucket = BTREE(locInOut); - if ( bucket->n == 0 ) { - locInOut = DiskLoc(); - return; - } - // go down until find smallest/biggest >=/<= target - while( 1 ) { - int l = 0; - int h = bucket->n - 1; - - // +direction: 0, -direction: h - int z = (1-direction)/2*h; - - // leftmost/rightmost key may possibly be >=/<= search key - int res = customBSONCmp( bucket->keyNode( z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); - bool firstCheck = direction*res >= 0; - - if ( firstCheck ) { - DiskLoc next; - keyOfs = z; - if ( direction > 0 ) { - dassert( z == 0 ); - next = bucket->k( 0 ).prevChildBucket; - } - else { - next = bucket->nextChild; - } - if ( !next.isNull() ) { - bestParent = pair< DiskLoc, int >( locInOut, keyOfs ); - locInOut = next; - bucket = BTREE(locInOut); - continue; - } - else { - return; - } - } - - res = customBSONCmp( bucket->keyNode( h-z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); - bool secondCheck = direction*res < 0; - - if ( secondCheck ) { - DiskLoc next; - if ( direction > 0 ) { - next = bucket->nextChild; - } - else { - next = bucket->k( 0 ).prevChildBucket; - } - if ( next.isNull() ) { - // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc() - locInOut = bestParent.first; - keyOfs = bestParent.second; - return; - } - else { - locInOut = next; - bucket = BTREE(locInOut); - continue; - } - } - - if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, locInOut, keyOfs, bestParent ) ) { - return; - } - bucket = BTREE(locInOut); - } - } - - /** @thisLoc disk location of *this */ - template< class V > - int BtreeBucket<V>::_insert(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, const DiskLoc recordLoc, - const Key& key, bool dupsAllowed, - const DiskLoc lChild, const DiskLoc rChild ) const { - if ( key.dataSize() > getKeyMax() ) { - string msg = str::stream() << "ERROR: key too large len:" << key.dataSize() - << " max:" << getKeyMax() << ' ' << key.dataSize() - << ' ' << btreeState->descriptor()->indexNamespace(); - - problem() << msg << endl; - keyTooLongAssert( 17281, msg ); - return 2; - } - verify( key.dataSize() > 0 ); - - int pos; - bool found = find(btreeState, key, recordLoc, pos, !dupsAllowed); - if ( insert_debug ) { - out() << " " << thisLoc.toString() << '.' 
<< "_insert " << - key.toString() << '/' << recordLoc.toString() << - " l:" << lChild.toString() << " r:" << rChild.toString() << endl; - out() << " found:" << found << " pos:" << pos << " n:" << this->n << endl; - } - - if ( found ) { - const _KeyNode& kn = k(pos); - if ( kn.isUnused() ) { - LOG(4) << "btree _insert: reusing unused key" << endl; - massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull()); - massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull()); - kn.writing().setUsed(); - return 0; - } - - DEV { - log() << "_insert(): key already exists in index (ok for background:true)\n"; - log() << " " << btreeState->descriptor()->indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; - log() << " " << key.toString() << '\n'; - log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; - log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl; - log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; - } - alreadyInIndex(); - } - - DEBUGGING out() << "TEMP: key: " << key.toString() << endl; - Loc ch = this->childForPos(pos); - DiskLoc child = ch; - if ( insert_debug ) - out() << " getChild(" << pos << "): " << child.toString() << endl; - // In current usage, rChild isNull() for a new key and false when we are - // promoting a split key. These are the only two cases where _insert() - // is called currently. - if ( child.isNull() || !rChild.isNull() ) { - // A new key will be inserted at the same tree height as an adjacent existing key. - insertHere(btreeState, thisLoc, pos, recordLoc, key, lChild, rChild); - return 0; - } - - return child.btree<V>()->_insert(btreeState, child, recordLoc, key, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc() ); - } - - template< class V > - void BtreeBucket<V>::dump(unsigned depth) const { - string indent = string(depth, ' '); - LogstreamBuilder l = log(); - l << "BUCKET n:" << this->n; - l << " parent:" << hex << this->parent.getOfs() << dec; - for ( int i = 0; i < this->n; i++ ) { - l << '\n' << indent; - KeyNode k = keyNode(i); - string ks = k.key.toString(); - l << " " << hex << k.prevChildBucket.getOfs() << '\n'; - l << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec; - if ( this->k(i).isUnused() ) - l << " UNUSED"; - } - l << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl; - } - - /** todo: meaning of return code unclear clean up */ - template< class V > - int BtreeBucket<V>::bt_insert(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - const DiskLoc recordLoc, - const BSONObj& keyBson, - bool dupsAllowed, - bool toplevel) const { - guessIncreasing = keyBson.firstElementType() == jstOID && btreeState->descriptor()->isIdIndex(); - KeyOwned key(keyBson); - - dassert(toplevel); - if ( toplevel ) { - if ( key.dataSize() > getKeyMax() ) { - string msg = str::stream() << "Btree::insert: key too large to index, failing " - << btreeState->descriptor()->indexNamespace() << ' ' - << key.dataSize() << ' ' << key.toString(); - problem() << msg << endl; - keyTooLongAssert( 17280, msg ); - return 3; - } - } - - int x; - try { - x = _insert(btreeState, thisLoc, recordLoc, key, dupsAllowed, DiskLoc(), DiskLoc() ); - this->assertValid( btreeState->ordering() ); - } - catch( ... 
) { - guessIncreasing = false; - throw; - } - guessIncreasing = false; - return x; - } - - template< class V > - void BtreeBucket<V>::shape(stringstream& ss) const { - this->_shape(0, ss); - } - - template< class V > - int BtreeBucket<V>::getKeyMax() { - return V::KeyMax; - } - - template< class V > - DiskLoc BtreeBucket<V>::findSingle( const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const BSONObj& key ) const { - int pos; - bool found; - // TODO: is it really ok here that the order is a default? - // for findById() use, yes. for checkNoIndexConflicts, no? - DiskLoc bucket = locate( btreeState, btreeState->head(), key, pos, found, minDiskLoc ); - if ( bucket.isNull() ) - return bucket; - - const BtreeBucket<V> *b = bucket.btree<V>(); - while ( 1 ) { - const _KeyNode& knraw = b->k(pos); - if ( knraw.isUsed() ) - break; - bucket = b->advance( bucket , pos , 1 , "findSingle" ); - if ( bucket.isNull() ) - return bucket; - b = bucket.btree<V>(); - } - KeyNode kn = b->keyNode( pos ); - if ( KeyOwned(key).woCompare( kn.key, btreeState->ordering() ) != 0 ) - return DiskLoc(); - return kn.recordLoc; - } - - template class BucketBasics<V0>; - template class BucketBasics<V1>; - template class BtreeBucket<V0>; - template class BtreeBucket<V1>; - template struct __KeyNode<DiskLoc>; - template struct __KeyNode<DiskLoc56Bit>; - - struct BTUnitTest : public StartupTest { - void run() { - DiskLoc big(0xf12312, 0x70001234); - DiskLoc56Bit bigl; - { - bigl = big; - verify( big == bigl ); - DiskLoc e = bigl; - verify( big == e ); - } - { - DiskLoc d; - verify( d.isNull() ); - DiskLoc56Bit l; - l = d; - verify( l.isNull() ); - d = l; - verify( d.isNull() ); - verify( l < bigl ); - } - } - } btunittest; -} - diff --git a/src/mongo/db/structure/btree/btree.h b/src/mongo/db/structure/btree/btree.h deleted file mode 100644 index ad89b763bd5..00000000000 --- a/src/mongo/db/structure/btree/btree.h +++ /dev/null @@ -1,1094 +0,0 @@ -// btree.h - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#pragma once - -#include "mongo/pch.h" - -#include "mongo/db/diskloc.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/jsobj.h" -#include "mongo/db/storage/record.h" -#include "mongo/db/structure/btree/key.h" - -namespace mongo { - - class IndexCatalogEntry; - class IndexDescriptor; - - template< class Version > class BtreeBucket; - - /** - * Our btree implementation generally follows the standard btree algorithm, - * which is described in many places. The nodes of our btree are referred to - * as buckets below. These buckets are of size BucketSize and their body is - * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk - * location of a document and bson key is a projection of this document into - * the schema of the index for this btree. Ordering is determined on the - * basis of bson key first and then disk loc in case of a tie. All bson keys - * for a btree have identical schemas with empty string field names and may - * not have an objsize() exceeding KeyMax. The btree's buckets are - * themselves organized into an ordered tree. Although there are exceptions, - * generally buckets with n keys have n+1 children and the body of a bucket is - * at least lowWaterMark bytes. A more strictly enforced requirement is that - * a non root bucket must have at least one key except in certain transient - * states. - * - * Our btrees support the following primary read operations: finding a - * specified key; iterating from a starting key to the next or previous - * ordered key; and skipping from a starting key to another specified key - * without checking every intermediate key. The primary write operations - * are insertion and deletion of keys. Insertion may trigger a bucket split - * if necessary to avoid bucket overflow. In such a case, subsequent splits - * will occur recursively as necessary. Deletion may trigger a bucket - * rebalance, in which a size deficient bucket is filled with keys from an - * adjacent bucket. In this case, splitting may potentially occur in the - * parent. Deletion may alternatively trigger a merge, in which the keys - * from two buckets and a key from their shared parent are combined into the - * same bucket. In such a case, rebalancing or merging may proceed - * recursively from the parent. - * - * While the btree data format has been relatively constant over time, btrees - * initially created by versions of mongo earlier than the current version - * may embody different properties than freshly created btrees (while - * following the same data format). These older btrees are referred to - * below as legacy btrees. - */ - - const int OldBucketSize = 8192; - -#pragma pack(1) - template< class Version > class BucketBasics; - - /** - * This is the fixed width data component for storage of a key within a - * bucket. It contains an offset pointer to the variable width bson - * data component. A _KeyNode may be 'unused', please see below. - */ - template< class Loc > - struct __KeyNode { - /** Signals that we are writing this _KeyNode and casts away const */ - __KeyNode<Loc> & writing() const; - /** - * The 'left' child bucket of this key. If this is the i-th key, it - * points to the i index child bucket. - */ - Loc prevChildBucket; - /** The location of the record associated with this key. */ - Loc recordLoc; - short keyDataOfs() const { return (short) _kdo; } - - /** Offset within current bucket of the variable width bson key for this _KeyNode. 
*/ - unsigned short _kdo; - void setKeyDataOfs(short s) { - _kdo = s; - verify(s>=0); - } - /** Seems to be redundant. */ - void setKeyDataOfsSavingUse(short s) { - _kdo = s; - verify(s>=0); - } - /** - * Unused keys are not returned by read operations. Keys may be marked - * as unused in cases where it is difficult to delete them while - * maintaining the constraints required of a btree. - * - * Setting ofs to odd is the sentinel for unused, as real recordLoc's - * are always even numbers. Note we need to keep its value basically - * the same as we use the recordLoc as part of the key in the index - * (to handle duplicate keys efficiently). - * - * Flagging keys as unused is a feature that is being phased out in favor - * of deleting the keys outright. The current btree implementation is - * not expected to mark a key as unused in a non legacy btree. - */ - void setUnused() { - recordLoc.GETOFS() |= 1; - } - void setUsed() { recordLoc.GETOFS() &= ~1; } - int isUnused() const { - return recordLoc.getOfs() & 1; - } - int isUsed() const { - return !isUnused(); - } - }; - - /** - * This structure represents header data for a btree bucket. An object of - * this type is typically allocated inside of a buffer of size BucketSize, - * resulting in a full bucket with an appropriate header. - * - * The body of a btree bucket contains an array of _KeyNode objects starting - * from its lowest indexed bytes and growing to higher indexed bytes. The - * body also contains variable width bson keys, which are allocated from the - * highest indexed bytes toward lower indexed bytes. - * - * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb| - * h = header data - * k = KeyNode data - * - = empty space - * b = bson key data - * u = unused (old) bson key data, that may be garbage collected - */ - class BtreeData_V0 { - protected: - /** Parent bucket of this bucket, which isNull() for the root bucket. */ - DiskLoc parent; - /** Given that there are n keys, this is the n index child. */ - DiskLoc nextChild; - /** can be reused, value is 8192 in current pdfile version Apr2010 */ - unsigned short _wasSize; - /** zero */ - unsigned short _reserved1; - int flags; - - void _init() { - _reserved1 = 0; - _wasSize = BucketSize; - reserved = 0; - } - - /** basicInsert() assumes the next three members are consecutive and in this order: */ - - /** Size of the empty region. */ - int emptySize; - /** Size used for bson storage, including storage of old keys. */ - int topSize; - /* Number of keys in the bucket. */ - int n; - - int reserved; - /* Beginning of the bucket's body */ - char data[4]; - - public: - typedef __KeyNode<DiskLoc> _KeyNode; - typedef DiskLoc Loc; - typedef KeyBson Key; - typedef KeyBson KeyOwned; - enum { BucketSize = 8192 }; - - // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. - static const int KeyMax = OldBucketSize / 10; - // A sentinel value sometimes used to identify a deallocated bucket. 
- static const int INVALID_N_SENTINEL = -1; - }; - - // a a a ofs ofs ofs ofs - class DiskLoc56Bit { - int ofs; - unsigned char _a[3]; - unsigned long long Z() const { - // endian - unsigned long long result = ofs; - char* cursor = reinterpret_cast<char *>(&result); - *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]); - *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]); - *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0); - return result; - } - enum { - // first bit of offsets used in _KeyNode we don't use -1 here. - OurNullOfs = -2 - }; - public: - template< class V > - const BtreeBucket<V> * btree() const { - return DiskLoc(*this).btree<V>(); - } - template< class V > - BtreeBucket<V> * btreemod() const { - return DiskLoc(*this).btreemod<V>(); - } - operator const DiskLoc() const { - // endian - if( isNull() ) return DiskLoc(); - unsigned a = *((unsigned *) (_a-1)); - return DiskLoc(a >> 8, ofs); - } - int& GETOFS() { return ofs; } - int getOfs() const { return ofs; } - bool operator<(const DiskLoc56Bit& rhs) const { - // the orderering of dup keys in btrees isn't too critical, but we'd like to put items that are - // close together on disk close together in the tree, so we do want the file # to be the most significant - // bytes - return Z() < rhs.Z(); - } - int compare(const DiskLoc56Bit& rhs) const { - unsigned long long a = Z(); - unsigned long long b = rhs.Z(); - if( a < b ) return -1; - return a == b ? 0 : 1; - } - bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); } - bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); } - bool operator==(const DiskLoc& rhs) const { - return DiskLoc(*this) == rhs; - } - bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); } - bool isNull() const { return ofs < 0; } - void Null() { - ofs = OurNullOfs; - _a[0] = _a[1] = _a[2] = 0; - } - string toString() const { return DiskLoc(*this).toString(); } - void operator=(const DiskLoc& loc) { - ofs = loc.getOfs(); - int la = loc.a(); - verify( la <= 0xffffff ); // must fit in 3 bytes - if( la < 0 ) { - if ( la != -1 ) { - log() << "btree diskloc isn't negative 1: " << la << endl; - verify ( la == -1 ); - } - la = 0; - ofs = OurNullOfs; - } - memcpy(_a, &la, 3); // endian - } - DiskLoc56Bit& writing() const { - return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7)); - } - }; - - class BtreeData_V1 { - public: - typedef DiskLoc56Bit Loc; - //typedef DiskLoc Loc; - typedef __KeyNode<Loc> _KeyNode; - typedef KeyV1 Key; - typedef KeyV1Owned KeyOwned; - enum { BucketSize = 8192-16 }; // leave room for Record header - // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. - static const int KeyMax = 1024; - // A sentinel value sometimes used to identify a deallocated bucket. - static const unsigned short INVALID_N_SENTINEL = 0xffff; - protected: - /** Parent bucket of this bucket, which isNull() for the root bucket. */ - Loc parent; - /** Given that there are n keys, this is the n index child. */ - Loc nextChild; - - unsigned short flags; - - /** basicInsert() assumes the next three members are consecutive and in this order: */ - - /** Size of the empty region. */ - unsigned short emptySize; - /** Size used for bson storage, including storage of old keys. */ - unsigned short topSize; - /* Number of keys in the bucket. 
*/ - unsigned short n; - - /* Beginning of the bucket's body */ - char data[4]; - - void _init() { } - }; - - typedef BtreeData_V0 V0; - typedef BtreeData_V1 V1; - - /** - * This class adds functionality to BtreeData for managing a single bucket. - * The following policies are used in an attempt to encourage simplicity: - * - * Const member functions of this class are those which may be called on - * an object for which writing has not been signaled. Non const member - * functions may only be called on objects for which writing has been - * signaled. Note that currently some const functions write to the - * underlying memory representation of this bucket using optimized methods - * to signal write operations. - * - * DiskLoc parameters that may shadow references within the btree should - * be passed by value rather than by reference to non const member - * functions or to const member functions which may perform writes. This way - * a callee need not worry that write operations will change or invalidate - * its arguments. - * - * The current policy for dealing with bson arguments is the opposite of - * what is described above for DiskLoc arguments. We do not want to copy - * bson into memory as an intermediate step for btree changes, and if bson - * is to be moved it must be copied to the new location before the old - * location is invalidated. Care should be taken in cases where that invalid - * memory may be implicitly referenced by function arguments. - * - * A number of functions below require a thisLoc argument, which must be the - * disk location of the bucket mapped to 'this'. - */ - template< class Version > - class BucketBasics : public Version { - public: - template <class U> friend class BtreeBuilder; - typedef typename Version::Key Key; - typedef typename Version::_KeyNode _KeyNode; - typedef typename Version::Loc Loc; - - int getN() const { return this->n; } - - /** - * This is an in memory wrapper for a _KeyNode, and not itself part of btree - * storage. This object and its BSONObj 'key' will become invalid if the - * _KeyNode data that generated it is moved within the btree. In general, - * a KeyNode should not be expected to be valid after a write. - */ - class KeyNode { - public: - KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k); - const Loc& prevChildBucket; - const Loc& recordLoc; - /* Points to the bson key storage for a _KeyNode */ - Key key; - }; - friend class KeyNode; - - /** Assert write intent declared for this bucket already. */ - void assertWritable(); - - void assertValid(const Ordering &order, bool force = false) const; - void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); } - - /** - * @return KeyNode for key at index i. The KeyNode will become invalid - * if the key is moved or reassigned, or if the node is packed. In general - * a KeyNode should not be expected to be valid after a write. 
- */ - const KeyNode keyNode(int i) const { - if ( i >= this->n ) { - massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n ); - } - return KeyNode(*this, k(i)); - } - - static int headerSize() { - const BucketBasics *d = 0; - return (char*)&(d->data) - (char*)&(d->parent); - } - static int bodySize() { return Version::BucketSize - headerSize(); } - static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp - - // for testing - int nKeys() const { return this->n; } - const DiskLoc getNextChild() const { return this->nextChild; } - - // for tree inspection and statistical analysis - // NOTE: topSize and emptySize have different types in BtreeData_V0 and BtreeData_V1 - - /** Size used for bson storage, including storage of old keys. */ - unsigned int getTopSize() const { return static_cast<unsigned int>(this->topSize); } - /** Size of the empty region. */ - unsigned int getEmptySize() const { return static_cast<unsigned int>(this->emptySize); } - - protected: - char * dataAt(short ofs) { return this->data + ofs; } - - /** Initialize the header for a new node. */ - void init(); - - /** - * Preconditions: - * - 0 <= keypos <= n - * - If key is inserted at position keypos, the bucket's keys will still be - * in order. - * Postconditions: - * - If key can fit in the bucket, the bucket may be packed and keypos - * may be decreased to reflect deletion of earlier indexed keys during - * packing, the key will be inserted at the updated keypos index with - * a null prevChildBucket, the subsequent keys shifted to the right, - * and the function will return true. - * - If key cannot fit in the bucket, the bucket will be packed and - * the function will return false. - * Although this function is marked const, it modifies the underlying - * btree representation through an optimized write intent mechanism. - */ - bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const; - - /** - * Preconditions: - * - key / recordLoc are > all existing keys - * - The keys in prevChild and their descendants are between all existing - * keys and 'key'. - * Postconditions: - * - If there is space for key without packing, it is inserted as the - * last key with specified prevChild and true is returned. - * Importantly, nextChild is not updated! - * - Otherwise false is returned and there is no change. - */ - bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild); - void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) { - bool ok = _pushBack( recordLoc , key , order , prevChild ); - verify(ok); - } - - /** - * This is a special purpose function used by BtreeBuilder. The - * interface is quite dangerous if you're not careful. The bson key - * returned here points to bucket memory that has been invalidated but - * not yet reclaimed. - * - * TODO Maybe this could be replaced with two functions, one which - * returns the last key without deleting it and another which simply - * deletes the last key. Then the caller would have enough control to - * ensure proper memory integrity. 
- * - * Preconditions: - * - bucket is not empty - * - last key of bucket is used (not unused) - * - nextChild isNull() - * - _unalloc will work correctly as used - see code - * Postconditions: - * - The last key of the bucket is removed, and its key and recLoc are - * returned. As mentioned above, the key points to unallocated memory. - */ - void popBack(DiskLoc& recLoc, Key &key); - - /** - * Preconditions: - * - 0 <= keypos < n - * - there is no child bucket at keypos - * - n > 1 - * - if mayEmpty == false or nextChild.isNull(), n > 0 - * Postconditions: - * - The key at keypos is removed, and remaining keys are shifted over. - * - The bucket becomes unpacked. - * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys. - */ - void _delKeyAtPos(int keypos, bool mayEmpty = false); - - /* !Packed means there is deleted fragment space within the bucket. - We "repack" when we run out of space before considering the node - to be full. - */ - enum Flags { Packed=1 }; - - /** n == 0 is ok */ - const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; } - Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; } - - /** Same as bodySize(). */ - int totalDataSize() const; - /** - * @return true when a key may be dropped by pack() - * @param index index of the key that may be dropped - * @param refPos index of a particular key of interest, which must not - * be dropped; = 0 to safely ignore - */ - bool mayDropKey( int index, int refPos ) const; - - /** - * Pack the bucket to reclaim space from invalidated memory. - * @refPos is an index in the bucket which may be updated if we - * delete keys from the bucket - * This function may cast away const and perform a write. - * Preconditions: none - * Postconditions: - * - Bucket will be packed - * - Some unused nodes may be dropped, but not ones at index 0 or refPos - * - Some used nodes may be moved - * - If refPos is the index of an existing key, it will be updated to that - * key's new index if the key is moved. - */ - void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const; - /** Pack when already writable */ - void _packReadyForMod(const Ordering &order, int &refPos); - - /** @return the size the bucket's body would have if we were to call pack() */ - int packedDataSize( int refPos ) const; - void setNotPacked() { this->flags &= ~Packed; } - void setPacked() { this->flags |= Packed; } - /** - * Preconditions: 'bytes' is <= emptySize - * Postconditions: A buffer of size 'bytes' is allocated on the top side, - * and its offset is returned. - */ - int _alloc(int bytes); - /** - * This function can be used to deallocate the lowest byte index bson - * buffer in the top region, which in some but not all cases is for the - * n - 1 index key. This function only works correctly in certain - * special cases, please be careful. - * Preconditions: 'bytes' <= topSize - * Postconditions: The top region is decreased - */ - void _unalloc(int bytes); - /** - * Preconditions: 'N' <= n - * Postconditions: - * - All keys after the N index key are dropped. - * - Then bucket is packed, without dropping refPos if < refPos N. - */ - void truncateTo(int N, const Ordering &order, int &refPos); - /** - * Preconditions: - * - 'nDrop' < n - * - for now, refPos should be zero. - * Postconditions: - * - All keys before the nDrop index key are dropped. - * - The bucket is packed. 
- */ - void dropFront(int nDrop, const Ordering &order, int &refPos); - /** - * Preconditions: 0 <= keypos < n - * Postconditions: keypos indexed key is marked unused. - */ - void markUnused(int keypos); - - /** - * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain. - * we use tempNext() when we do that to be less confusing. (one might have written a union in C) - */ - DiskLoc tempNext() const { return this->parent; } - void setTempNext(DiskLoc l) { this->parent = l; } - - void _shape(int level, stringstream&) const; - int Size() const; - - /** @return i-indexed _KeyNode, without bounds checking */ - public: - const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; } - _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; } - protected: - _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; } - - /** - * Preconditions: 'this' is packed - * @return the key index to be promoted on split - * @param keypos The requested index of a key to insert, which may affect - * the choice of split position. - */ - int splitPos( int keypos ) const; - - /** - * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize - * Postconditions: - * - Increases indexes of existing _KeyNode objects by nAdd, reserving - * space for additional _KeyNode objects at front. - * - Does not initialize ofs values for the bson data of these - * _KeyNode objects. - */ - void reserveKeysFront( int nAdd ); - - /** - * Preconditions: - * - 0 <= i < n - * - The bson 'key' must fit in the bucket without packing. - * - If 'key' and 'prevChildBucket' are set at index i, the btree - * ordering properties will be maintained. - * Postconditions: - * - The specified key is set at index i, replacing the existing - * _KeyNode data and without shifting any other _KeyNode objects. - */ - void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket ); - }; - - /** - * This class adds functionality for manipulating buckets that are assembled - * in a tree. The requirements for const and non const functions and - * arguments are generally the same as in BtreeBucket. Because this class - * deals with tree structure, some functions that are marked const may - * trigger modification of another node in the btree or potentially of the - * current node. In such cases, the function's implementation explicitly - * casts away const when indicating an intent to write to the durability - * layer. The DiskLocs provided to such functions should be passed by - * value if they shadow pointers within the btree. - * - * To clarify enforcement of referential integrity in this implementation, - * we use the following pattern when deleting data we have a persistent - * pointer to. The pointer is cleared or removed explicitly, then the data - * it pointed to is cleaned up with a helper function. - * - * TODO It might make sense to put some of these functions in a class - * representing a full btree instead of a single btree bucket. That would - * allow us to use the const qualifier in a manner more consistent with - * standard usage. Right now the interface is for both a node and a tree, - * so assignment of const is sometimes nonideal. - * - * TODO There are several cases in which the 'this' pointer is invalidated - * as a result of deallocation. A separate class representing a btree would - * alleviate some fragile cases where the implementation must currently - * behave correctly if the 'this' pointer is suddenly invalidated by a - * callee. 
- */ - template< class V > - class BtreeBucket : public BucketBasics<V> { - friend class BtreeCursor; - public: - // make compiler happy: - typedef typename V::Key Key; - typedef typename V::KeyOwned KeyOwned; - typedef typename BucketBasics<V>::KeyNode KeyNode; - typedef typename BucketBasics<V>::_KeyNode _KeyNode; - typedef typename BucketBasics<V>::Loc Loc; - const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); } - protected: - _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); } - public: - - static const BtreeBucket<V>* asVersion( Record* rec ); - static BtreeBucket<V>* asVersionMod( Record* rec ); - - const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); } - - bool isHead() const { return this->parent.isNull(); } - void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const; - long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */ - - bool isUsed( int i ) const { return this->k(i).isUsed(); } - string bucketSummary() const; - void dump(unsigned depth=0) const; - - /** - * @return true if key exists in index - * - * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.: - * BSONObj order = ((IndexDetails&)idx).keyPattern(); - * likewise below in bt_insert() etc. - */ - private: - bool exists(const IndexCatalogEntry* btreeState, const DiskLoc &thisLoc, const Key& key ) const; - public: - - /** - * @param self - Don't complain about ourself already being in the index case. - * @return true = There is a duplicate used key. - */ - bool wouldCreateDup(const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const Key& key, - const DiskLoc& self) const; - - /** - * Preconditions: none - * Postconditions: @return a new bucket allocated from pdfile storage - * and init()-ed. This bucket is suitable to for use as a new root - * or any other new node in the tree. - */ - static DiskLoc addBucket(IndexCatalogEntry* btreeState); - - /** - * Preconditions: none - * Postconditions: - * - Some header values in this bucket are cleared, and the bucket is - * deallocated from pdfile storage. - * - The memory at thisLoc is invalidated, and 'this' is invalidated. - */ - void deallocBucket(IndexCatalogEntry* btreeState, const DiskLoc thisLoc ); - - /** - * Preconditions: - * - 'key' has a valid schema for this index. - * - All other paramenters are valid and consistent with this index if applicable. - * Postconditions: - * - If key is bigger than KeyMax, @return 2 or 3 and no change. - * - If key / recordLoc exist in the btree as an unused key, set them - * as used and @return 0 - * - If key / recordLoc exist in the btree as a used key, @throw - * exception 10287 and no change. - * - If key / recordLoc do not exist in the btree, they are inserted - * and @return 0. The root of the btree may be changed, so - * 'this'/thisLoc may no longer be the root upon return. - */ - int bt_insert(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - const DiskLoc recordLoc, - const BSONObj& key, - bool dupsallowed, - bool toplevel) const; - /** - * Preconditions: - * - 'key' has a valid schema for this index, and may have objsize() > KeyMax. 
- * Postconditions: - * - If key / recordLoc are in the btree, they are removed (possibly - * by being marked as an unused key), @return true, and potentially - * invalidate 'this' / thisLoc and change the head. - * - If key / recordLoc are not in the btree, @return false and do nothing. - */ - bool unindex(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - const BSONObj& key, - const DiskLoc recordLoc) const; - - /** - * locate may return an "unused" key that is just a marker. so be careful. - * looks for a key:recordloc pair. - * - * @found - returns true if exact match found. note you can get back a position - * result even if found is false. - */ - DiskLoc locate(const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const BSONObj& key, - int& pos, - bool& found, - const DiskLoc& recordLoc, - int direction=1) const; - - DiskLoc locate(const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const Key& key, - int& pos, - bool& found, - const DiskLoc& recordLoc, - int direction=1) const; - - /** - * find the first instance of the key - * does not handle dups - * WARNING: findSingle may not be compound index safe. this may need to change. see notes in - * findSingle code. - * @return the record location of the first match - */ - DiskLoc findSingle( const IndexCatalogEntry* btreeState, - const DiskLoc& thisLoc, - const BSONObj& key ) const; - - /** - * Advance to next or previous key in the index. - * @param direction to advance. - */ - DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const; - - /** Advance in specified direction to the specified key */ - void advanceTo(const IndexCatalogEntry* btreeState, - DiskLoc &thisLoc, - int &keyOfs, - const BSONObj &keyBegin, - int keyBeginLen, - bool afterKey, - const vector<const BSONElement*>& keyEnd, - const vector<bool>& keyEndInclusive, - int direction) const; - - /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */ - static void customLocate(const IndexCatalogEntry* btreeState, - DiskLoc& locInOut, - int& keyOfs, - const BSONObj& keyBegin, - int keyBeginLen, bool afterVersion, - const vector<const BSONElement*>& keyEnd, - const vector<bool>& keyEndInclusive, - int direction, - pair<DiskLoc, int>& bestParent); - - /** @return head of the btree by traversing from current bucket. */ - const DiskLoc getHead(const DiskLoc& thisLoc) const; - - /** get tree shape */ - void shape(stringstream&) const; - - static int getKeyMax(); - - protected: - /** - * Preconditions: - * - 0 <= firstIndex <= n - * - -1 <= lastIndex <= n ( -1 is equivalent to n ) - * Postconditions: - * - Any children at indexes firstIndex through lastIndex (inclusive) - * will have their parent pointers set to thisLoc. - */ - void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const; - - /** - * Preconditions: - * - thisLoc is not the btree head. - * - n == 0 is ok - * Postconditions: - * - All cursors pointing to this bucket will be updated. - * - This bucket's parent's child pointer is set to null. - * - This bucket is deallocated from pdfile storage. - * - 'this' and thisLoc are invalidated. - */ - void delBucket(IndexCatalogEntry* btreeState, const DiskLoc thisLoc ); - - /** - * Preconditions: 0 <= p < n - * Postconditions: - * - The key at index p is removed from the btree. - * - 'this' and thisLoc may be invalidated. - * - The tree head may change. 
- */ - void delKeyAtPos(IndexCatalogEntry* btreeSate, - const DiskLoc thisLoc, - int p ); - - /** - * Preconditions: - * - n == 0 is ok - * Postconditions: - * - If thisLoc is head, or if its body has at least lowWaterMark bytes, - * return false and do nothing. - * - Otherwise, if thisLoc has left or right neighbors, either balance - * or merge with them and return true. Also, 'this' and thisLoc may - * be invalidated and the tree head may change. - */ - bool mayBalanceWithNeighbors(IndexCatalogEntry* btreeState, const DiskLoc thisLoc); - - /** - * Preconditions: - * - 0 <= leftIndex < n - * - The child at leftIndex or the child at leftIndex + 1 contains - * fewer than lowWaterMark bytes. - * Postconditions: - * - If the child bucket at leftIndex can merge with the child index - * at leftIndex + 1, do nothing and return false. - * - Otherwise, balance keys between the leftIndex child and the - * leftIndex + 1 child, return true, and possibly change the tree head. - */ - bool tryBalanceChildren(IndexCatalogEntry* btreeState, const DiskLoc thisLoc, int leftIndex) const; - - /** - * Preconditions: - * - All preconditions of tryBalanceChildren. - * - The leftIndex child and leftIndex + 1 child cannot be merged. - * Postconditions: - * - Keys are moved between the leftIndex child and the leftIndex + 1 - * child such that neither child has fewer than lowWaterMark bytes. - * The tree head may change. - */ - void doBalanceChildren( IndexCatalogEntry* btreeState, const DiskLoc thisLoc, int leftIndex ); - - /** - * Preconditions: - * - All preconditions of doBalanceChildren - * - The leftIndex and leftIndex + 1 children are packed. - * - The leftIndex + 1 child has fewer than lowWaterMark bytes. - * - split returned by rebalancedSeparatorPos() - * Postconditions: - * - The key in lchild at index split is set as thisLoc's key at index - * leftIndex, which may trigger a split and change the tree head. - * The previous key in thisLoc at index leftIndex and all keys with - * indexes greater than split in lchild are moved to rchild. - */ - void doBalanceLeftToRight( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int leftIndex, int split, - BtreeBucket<V> *l, const DiskLoc lchild, - BtreeBucket<V> *r, const DiskLoc rchild ); - /** - * Preconditions: - * - All preconditions of doBalanceChildren - * - The leftIndex and leftIndex + 1 children are packed. - * - The leftIndex child has fewer than lowWaterMark bytes. - * - split returned by rebalancedSeparatorPos() - * Postconditions: - * - The key in rchild at index split - l->n - 1 is set as thisLoc's key - * at index leftIndex, which may trigger a split and change the tree - * head. The previous key in thisLoc at index leftIndex and all keys - * with indexes less than split - l->n - 1 in rchild are moved to - * lchild. - */ - void doBalanceRightToLeft( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int leftIndex, int split, - BtreeBucket<V> *l, const DiskLoc lchild, - BtreeBucket<V> *r, const DiskLoc rchild ); - - /** - * Preconditions: - * - 0 <= leftIndex < n - * - this->canMergeChildren( thisLoc, leftIndex ) == true - * Postconditions: - * - All of the above mentioned keys will be placed in the left child. - * - The tree may be updated recursively, resulting in 'this' and - * thisLoc being invalidated and the tree head being changed. 
- */ - void doMergeChildren(IndexCatalogEntry* btreeState,const DiskLoc thisLoc, int leftIndex ); - - /** - * Preconditions: - * - n == 0 - * - !nextChild.isNull() - * Postconditions: - * - 'this' and thisLoc are deallocated (and invalidated), any cursors - * to them are updated, and the tree head may change. - * - nextChild replaces thisLoc in the btree structure. - */ - void replaceWithNextChild( IndexCatalogEntry* btreeState, const DiskLoc thisLoc ); - - /** - * @return true iff the leftIndex and leftIndex + 1 children both exist, - * and if their body sizes when packed and the thisLoc key at leftIndex - * would fit in a single bucket body. - */ - bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const; - - /** - * Preconditions: - * - leftIndex and leftIndex + 1 children are packed - * - leftIndex or leftIndex + 1 child is below lowWaterMark - * @return index of the rebalanced separator; the index value is - * determined as if we had a bucket with body - * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> ) - * and called splitPos( 0 ) on it. - */ - int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const; - - /** - * Preconditions: thisLoc has a parent - * @return parent's index of thisLoc. - */ - int indexInParent( const DiskLoc &thisLoc ) const; - - public: - Key keyAt(int i) const { - if( i >= this->n ) - return Key(); - return Key(this->data + k(i).keyDataOfs()); - } - protected: - - /** - * Preconditions: - * - This bucket is packed. - * - Cannot add a key of size KeyMax to this bucket. - * - 0 <= keypos <= n is the position of a new key that will be inserted - * - lchild is equal to the existing child at index keypos. - * Postconditions: - * - The thisLoc bucket is split into two packed buckets, possibly - * invalidating the initial position of keypos, with a split key - * promoted to the parent. The new key key/recordLoc will be inserted - * into one of the split buckets, and lchild/rchild set appropriately. - * Splitting may occur recursively, possibly changing the tree head. - */ - void split(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int keypos, - const DiskLoc recordLoc, const Key& key, - const DiskLoc lchild, const DiskLoc rchild); - - /** - * Preconditions: - * - 0 <= keypos <= n - * - If key / recordLoc are inserted at position keypos, with provided - * lchild and rchild, the btree ordering requirements will be - * maintained. - * - lchild is equal to the existing child at index keypos. - * - n == 0 is ok. - * Postconditions: - * - The key / recordLoc are inserted at position keypos, and the - * bucket is split if necessary, which may change the tree head. - * - The bucket may be packed or split, invalidating the specified value - * of keypos. - * This function will always modify thisLoc, but it's marked const because - * it commonly relies on the specialized writ]e intent mechanism of basicInsert(). - */ - void insertHere(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, int keypos, - const DiskLoc recordLoc, const Key& key, - const DiskLoc lchild, const DiskLoc rchild ) const; - - /** bt_insert() is basically just a wrapper around this. 
*/ - int _insert(IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, const DiskLoc recordLoc, - const Key& key, bool dupsallowed, - const DiskLoc lChild, const DiskLoc rChild ) const; - - bool find(const IndexCatalogEntry* btreeState, - const Key& key, - const DiskLoc &recordLoc, - int& pos, - bool assertIfDup) const; - - static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ; - static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey); - static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ); - - /** If child is non null, set its parent to thisLoc */ - static void fix(const DiskLoc thisLoc, const DiskLoc child); - - /** - * Preconditions: - * - 0 <= keypos < n - * - If the specified key and recordLoc are placed in keypos of thisLoc, - * and lchild and rchild are set, the btree ordering properties will - * be maintained. - * - rchild == childForPos( keypos + 1 ) - * - childForPos( keypos ) is referenced elsewhere if nonnull. - * Postconditions: - * - The key at keypos will be replaced with the specified key and - * lchild, potentially splitting this bucket and changing the tree - * head. - * - childForPos( keypos ) will be orphaned. - */ - void setInternalKey( IndexCatalogEntry* btreeState, - const DiskLoc thisLoc, - int keypos, - const DiskLoc recordLoc, - const Key &key, - const DiskLoc lchild, - const DiskLoc rchild ); - - /** - * Preconditions: - * - 0 <= keypos < n - * - The keypos or keypos+1 indexed child is non null. - * Postconditions: - * - The specified key is deleted by replacing it with another key if - * possible. This replacement may cause a split and change the tree - * head. The replacement key will be deleted from its original - * location, potentially causing merges and splits that may invalidate - * 'this' and thisLoc and change the tree head. - * - If the key cannot be replaced, it will be marked as unused. This - * is only expected in legacy btrees. - */ - void deleteInternalKey( IndexCatalogEntry* btreeState, const DiskLoc thisLoc, int keypos ); - public: - /** simply builds and returns a dup key error message string */ - static string dupKeyError( const IndexDescriptor* idx , const Key& key ); - }; -#pragma pack() - - template< class V > - BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) : - prevChildBucket(k.prevChildBucket), - recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) - { } - - template< class V > - const BtreeBucket<V> * DiskLoc::btree() const { - verify( _a != -1 ); - Record *r = rec(); - return (const BtreeBucket<V> *) r->data(); - } - - /** - * give us a writable version of the btree bucket (declares write intent). - * note it is likely more efficient to declare write intent on something smaller when you can. 
- */ - template< class V > - BtreeBucket<V> * DiskLoc::btreemod() const { - verify( _a != -1 ); - BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() ); - return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) ); - } - - - -} // namespace mongo; diff --git a/src/mongo/dbtests/indexupdatetests.cpp b/src/mongo/dbtests/indexupdatetests.cpp index d28edbf2ab4..fa5b119922c 100644 --- a/src/mongo/dbtests/indexupdatetests.cpp +++ b/src/mongo/dbtests/indexupdatetests.cpp @@ -28,7 +28,6 @@ * then also delete it in the license file. */ -#include "mongo/db/structure/btree/btree.h" #include "mongo/db/catalog/index_catalog.h" #include "mongo/db/dbhelpers.h" #include "mongo/db/index/btree_based_bulk_access_method.h" diff --git a/src/mongo/dbtests/test.vcxproj b/src/mongo/dbtests/test.vcxproj index ced56ad2b7b..4ae594420e2 100644 --- a/src/mongo/dbtests/test.vcxproj +++ b/src/mongo/dbtests/test.vcxproj @@ -987,7 +987,6 @@ cscript //Nologo "$(ProjectDir)..\shell\createCPPfromJavaScriptFiles.js" "$(Proj <ClInclude Include="..\..\third_party\pcre-8.30\pcre.h" />
<ClInclude Include="..\client\connpool.h" />
<ClInclude Include="..\client\dbclient.h" />
- <ClInclude Include="..\db\btree.h" />
<ClInclude Include="..\db\clientcursor.h" />
<ClInclude Include="..\db\cmdline.h" />
<ClInclude Include="..\db\commands.h" />
@@ -3621,7 +3620,6 @@ cscript //Nologo "$(ProjectDir)..\shell\createCPPfromJavaScriptFiles.js" "$(Proj <ClCompile Include="..\client\connpool.cpp" />
<ClCompile Include="..\client\dbclient.cpp" />
<ClCompile Include="..\client\syncclusterconnection.cpp" />
- <ClCompile Include="..\db\btree.cpp" />
<ClCompile Include="..\db\btreecursor.cpp" />
<ClCompile Include="..\pch.cpp" />
<ClCompile Include="..\db\client.cpp" />
diff --git a/src/mongo/dbtests/test.vcxproj.filters b/src/mongo/dbtests/test.vcxproj.filters index 4082afd669a..30e8d34eb63 100755 --- a/src/mongo/dbtests/test.vcxproj.filters +++ b/src/mongo/dbtests/test.vcxproj.filters @@ -526,9 +526,6 @@ <ClInclude Include="..\db\mongomutex.h">
<Filter>db\Header Files\e to n</Filter>
</ClInclude>
- <ClInclude Include="..\db\btree.h">
- <Filter>db\Header Files\a to d</Filter>
- </ClInclude>
<ClInclude Include="..\db\durop.h">
<Filter>db\Header Files\a to d</Filter>
</ClInclude>
@@ -3327,9 +3324,6 @@ <ClCompile Include="..\db\repl\rs_rollback.cpp">
<Filter>db\repl</Filter>
</ClCompile>
- <ClCompile Include="..\db\btree.cpp">
- <Filter>db\Source Files\a to d</Filter>
- </ClCompile>
<ClCompile Include="..\db\dur_writetodatafiles.cpp">
<Filter>db\Source Files\a to d</Filter>
</ClCompile>
|