author | Eliot Horowitz <eliot@10gen.com> | 2011-12-24 15:33:26 -0500
---|---|---
committer | Eliot Horowitz <eliot@10gen.com> | 2011-12-24 15:33:45 -0500
commit | ae1ecd9c786911f9f1f0242f0f7d702b3e5dfeba (patch) |
tree | 92f8e1649e6f080b251ff5f1763679a72eb59b34 /src/mongo/db |
parent | dfa4cd7e2cf109b072440155fabc08a93c8045a0 (diff) |
download | mongo-ae1ecd9c786911f9f1f0242f0f7d702b3e5dfeba.tar.gz |
bulk move of code to src/ SERVER-4551
Diffstat (limited to 'src/mongo/db')
237 files changed, 76762 insertions, 0 deletions
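Because this commit is a wholesale file move, rename detection makes it far easier to browse; for example (illustrative commands, assuming a local clone of the repository):

    git log --follow -- src/mongo/db/btree.cpp    # follow file history across the move
    git show -M --stat ae1ecd9c                   # summarize the commit with renames detected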
diff --git a/src/mongo/db/background.h b/src/mongo/db/background.h new file mode 100644 index 00000000000..ea424c97107 --- /dev/null +++ b/src/mongo/db/background.h @@ -0,0 +1,56 @@ +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* background.h + + Concurrency coordination for administrative operations. +*/ + +#pragma once + +namespace mongo { + + /* these are administrative operations / jobs + for a namespace running in the background, and that only one + at a time per namespace is permitted, and that if in progress, + you aren't allowed to do other NamespaceDetails major manipulations + (such as dropping ns or db) even in the foreground and must + instead uassert. + + It's assumed this is not for super-high RPS things, so we don't do + anything special in the implementation here to be fast. + */ + class BackgroundOperation : public boost::noncopyable { + public: + static bool inProgForDb(const char *db); + static bool inProgForNs(const char *ns); + static void assertNoBgOpInProgForDb(const char *db); + static void assertNoBgOpInProgForNs(const char *ns); + static void dump(stringstream&); + + /* check for in progress before instantiating */ + BackgroundOperation(const char *ns); + + virtual ~BackgroundOperation(); + + private: + NamespaceString _ns; + static map<string, unsigned> dbsInProg; + static set<string> nsInProg; + }; + +} // namespace mongo + diff --git a/src/mongo/db/btree.cpp b/src/mongo/db/btree.cpp new file mode 100644 index 00000000000..5c55fad33c3 --- /dev/null +++ b/src/mongo/db/btree.cpp @@ -0,0 +1,1980 @@ +// btree.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "db.h" +#include "btree.h" +#include "pdfile.h" +#include "json.h" +#include "clientcursor.h" +#include "client.h" +#include "dbhelpers.h" +#include "curop-inl.h" +#include "stats/counters.h" +#include "dur_commitjob.h" +#include "btreebuilder.h" +#include "../util/unittest.h" +#include "../server.h" + +namespace mongo { + + BOOST_STATIC_ASSERT( Record::HeaderSize == 16 ); + BOOST_STATIC_ASSERT( Record::HeaderSize + BtreeData_V1::BucketSize == 8192 ); + + NOINLINE_DECL void checkFailed(unsigned line) { + static time_t last; + if( time(0) - last >= 10 ) { + msgasserted(15898, str::stream() << "error in index possibly corruption consider repairing " << line); + } + } + + /** data check. 
like assert, but gives a reasonable error message to the user. */ +#define check(expr) if(!(expr) ) { checkFailed(__LINE__); } + +#define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this ); + + template< class Loc > + __KeyNode<Loc> & __KeyNode<Loc>::writing() const { + return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) ); + } + + // BucketBasics::lowWaterMark() + // + // We define this value as the maximum number of bytes such that, if we have + // fewer than this many bytes, we must be able to either merge with or receive + // keys from any neighboring node. If our utilization goes below this value we + // know we can bring up the utilization with a simple operation. Ignoring the + // 90/10 split policy which is sometimes employed and our 'unused' nodes, this + // is a lower bound on bucket utilization for non root buckets. + // + // Note that the exact value here depends on the implementation of + // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as + // follows: We know we cannot merge with the neighbor, so the total data size + // for us, the neighbor, and the separator must be at least + // BtreeBucket<V>::bodySize() + 1. We must be able to accept one key of any + // allowed size, so our size plus storage for that additional key must be + // <= BtreeBucket<V>::bodySize() / 2. This way, with the extra key we'll have a + // new bucket data size < half the total data size and by the implementation + // of rebalancedSeparatorPos() the key must be added. + + static const int split_debug = 0; + static const int insert_debug = 0; + + /** + * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly + * for the 10287 error code. + */ + static void alreadyInIndex() { + // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord() + throw MsgAssertionException(10287, "btree: key+recloc already in index"); + } + + /* BucketBasics --------------------------------------------------- */ + + template< class V > + void BucketBasics<V>::assertWritable() { + if( cmdLine.dur ) + dur::assertAlreadyDeclared(this, V::BucketSize); + } + + template< class V > + string BtreeBucket<V>::bucketSummary() const { + stringstream ss; + ss << " Bucket info:" << endl; + ss << " n: " << this->n << endl; + ss << " parent: " << this->parent.toString() << endl; + ss << " nextChild: " << this->nextChild.toString() << endl; + ss << " flags:" << this->flags << endl; + ss << " emptySize: " << this->emptySize << " topSize: " << this->topSize << endl; + return ss.str(); + } + + template< class V > + int BucketBasics<V>::Size() const { + return V::BucketSize; + } + + template< class V > + void BucketBasics<V>::_shape(int level, stringstream& ss) const { + for ( int i = 0; i < level; i++ ) ss << ' '; + ss << "*[" << this->n << "]\n"; + for ( int i = 0; i < this->n; i++ ) { + if ( !k(i).prevChildBucket.isNull() ) { + DiskLoc ll = k(i).prevChildBucket; + ll.btree<V>()->_shape(level+1,ss); + } + } + if ( !this->nextChild.isNull() ) { + DiskLoc ll = this->nextChild; + ll.btree<V>()->_shape(level+1,ss); + } + } + + int bt_fv=0; + int bt_dmp=0; + + template< class V > + void BtreeBucket<V>::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { + bt_dmp=1; + fullValidate(thisLoc, order); + bt_dmp=0; + } + + template< class V > + long long BtreeBucket<V>::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount, bool strict, unsigned depth) const { + { + bool f = false; + assert( f = 
true ); + massert( 10281 , "assert is misdefined", f); + } + + killCurrentOp.checkForInterrupt(); + this->assertValid(order, true); + + if ( bt_dmp ) { + _log() << thisLoc.toString() << ' '; + ((BtreeBucket *) this)->dump(depth); + } + + // keycount + long long kc = 0; + + for ( int i = 0; i < this->n; i++ ) { + const _KeyNode& kn = this->k(i); + + if ( kn.isUsed() ) { + kc++; + } + else { + if ( unusedCount ) { + ++( *unusedCount ); + } + } + if ( !kn.prevChildBucket.isNull() ) { + DiskLoc left = kn.prevChildBucket; + const BtreeBucket *b = left.btree<V>(); + if ( strict ) { + assert( b->parent == thisLoc ); + } + else { + wassert( b->parent == thisLoc ); + } + kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict, depth+1); + } + } + if ( !this->nextChild.isNull() ) { + DiskLoc ll = this->nextChild; + const BtreeBucket *b = ll.btree<V>(); + if ( strict ) { + assert( b->parent == thisLoc ); + } + else { + wassert( b->parent == thisLoc ); + } + kc += b->fullValidate(this->nextChild, order, unusedCount, strict, depth+1); + } + + return kc; + } + + int nDumped = 0; + + template< class V > + void BucketBasics<V>::assertValid(const Ordering &order, bool force) const { + if ( !debug && !force ) + return; + { + int foo = this->n; + wassert( foo >= 0 && this->n < Size() ); + foo = this->emptySize; + wassert( foo >= 0 && this->emptySize < V::BucketSize ); + wassert( this->topSize >= this->n && this->topSize <= V::BucketSize ); + } + + // this is very slow so don't do often + { + static int _k; + if( ++_k % 128 ) + return; + } + + DEV { + // slow: + for ( int i = 0; i < this->n-1; i++ ) { + Key k1 = keyNode(i).key; + Key k2 = keyNode(i+1).key; + int z = k1.woCompare(k2, order); //OK + if ( z > 0 ) { + out() << "ERROR: btree key order corrupt. Keys:" << endl; + if ( ++nDumped < 5 ) { + for ( int j = 0; j < this->n; j++ ) { + out() << " " << keyNode(j).key.toString() << endl; + } + ((BtreeBucket<V> *) this)->dump(); + } + wassert(false); + break; + } + else if ( z == 0 ) { + if ( !(k(i).recordLoc < k(i+1).recordLoc) ) { + out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl; + out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl; + out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl; + wassert( k(i).recordLoc < k(i+1).recordLoc ); + } + } + } + } + else { + //faster: + if ( this->n > 1 ) { + Key k1 = keyNode(0).key; + Key k2 = keyNode(this->n-1).key; + int z = k1.woCompare(k2, order); + //wassert( z <= 0 ); + if ( z > 0 ) { + problem() << "btree keys out of order" << '\n'; + ONCE { + ((BtreeBucket<V> *) this)->dump(); + } + assert(false); + } + } + } + } + + template< class V > + inline void BucketBasics<V>::markUnused(int keypos) { + assert( keypos >= 0 && keypos < this->n ); + k(keypos).setUnused(); + } + + template< class V > + inline int BucketBasics<V>::totalDataSize() const { + return (int) (Size() - (this->data-(char*)this)); + } + + template< class V > + void BucketBasics<V>::init() { + this->_init(); + this->parent.Null(); + this->nextChild.Null(); + this->flags = Packed; + this->n = 0; + this->emptySize = totalDataSize(); + this->topSize = 0; + } + + /** see _alloc */ + template< class V > + inline void BucketBasics<V>::_unalloc(int bytes) { + this->topSize -= bytes; + this->emptySize += bytes; + } + + /** + * we allocate space from the end of the buffer for data. + * the keynodes grow from the front. 
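+ * (illustrative layout:) [ header | _KeyNode slots --> ... emptySize ... <-- key data (topSize) ]
+ * _alloc(bytes) grows the key-data region downward from the end of the bucket
+ * and returns the new offset; _unalloc(bytes) gives the space back.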
+ */ + template< class V > + inline int BucketBasics<V>::_alloc(int bytes) { + assert( this->emptySize >= bytes ); + this->topSize += bytes; + this->emptySize -= bytes; + int ofs = totalDataSize() - this->topSize; + assert( ofs > 0 ); + return ofs; + } + + template< class V > + void BucketBasics<V>::_delKeyAtPos(int keypos, bool mayEmpty) { + // TODO This should be keypos < n + assert( keypos >= 0 && keypos <= this->n ); + assert( childForPos(keypos).isNull() ); + // TODO audit cases where nextChild is null + assert( ( mayEmpty && this->n > 0 ) || this->n > 1 || this->nextChild.isNull() ); + this->emptySize += sizeof(_KeyNode); + this->n--; + for ( int j = keypos; j < this->n; j++ ) + k(j) = k(j+1); + setNotPacked(); + } + + /** + * pull rightmost key from the bucket. this version requires its right child to be null so it + * does not bother returning that value. + */ + template< class V > + void BucketBasics<V>::popBack(DiskLoc& recLoc, Key &key) { + massert( 10282 , "n==0 in btree popBack()", this->n > 0 ); + assert( k(this->n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that + KeyNode kn = keyNode(this->n-1); + recLoc = kn.recordLoc; + key.assign(kn.key); + int keysize = kn.key.dataSize(); + + massert( 10283 , "rchild not null in btree popBack()", this->nextChild.isNull()); + + // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. + this->nextChild = kn.prevChildBucket; + + this->n--; + // This is risky because the key we are returning points to this unalloc'ed memory, + // and we are assuming that the last key points to the last allocated + // bson region. + this->emptySize += sizeof(_KeyNode); + _unalloc(keysize); + } + + /** add a key. must be > all existing. be careful to set next ptr right. */ + template< class V > + bool BucketBasics<V>::_pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) { + int bytesNeeded = key.dataSize() + sizeof(_KeyNode); + if ( bytesNeeded > this->emptySize ) + return false; + assert( bytesNeeded <= this->emptySize ); + if( this->n ) { + const KeyNode klast = keyNode(this->n-1); + if( klast.key.woCompare(key, order) > 0 ) { + log() << "btree bucket corrupt? consider reindexing or running validate command" << endl; + log() << " klast: " << keyNode(this->n-1).key.toString() << endl; + log() << " key: " << key.toString() << endl; + DEV klast.key.woCompare(key, order); + assert(false); + } + } + this->emptySize -= sizeof(_KeyNode); + _KeyNode& kn = k(this->n++); + kn.prevChildBucket = prevChild; + kn.recordLoc = recordLoc; + kn.setKeyDataOfs( (short) _alloc(key.dataSize()) ); + short ofs = kn.keyDataOfs(); + char *p = dataAt(ofs); + memcpy(p, key.data(), key.dataSize()); + + return true; + } + + /* durability note + we do separate intent declarations herein. arguably one could just declare + the whole bucket given we do group commits. this is something we could investigate + later as to what is faster under what situations. + */ + /** insert a key in a bucket with no complexity -- no splits required + @return false if a split is required. 
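+ (illustrative) a false return means the key does not fit even after _pack()ing,
+ and the caller falls back to split() -- see insertHere() below.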
+ */ + template< class V > + bool BucketBasics<V>::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const { + check( this->n < 1024 ); + check( keypos >= 0 && keypos <= this->n ); + int bytesNeeded = key.dataSize() + sizeof(_KeyNode); + if ( bytesNeeded > this->emptySize ) { + _pack(thisLoc, order, keypos); + if ( bytesNeeded > this->emptySize ) + return false; + } + + BucketBasics *b; + { + const char *p = (const char *) &k(keypos); + const char *q = (const char *) &k(this->n+1); + // declare that we will write to [k(keypos),k(n)] + // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so + // we can log a very small amount. + b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p); + + // e.g. n==3, keypos==2 + // 1 4 9 + // -> + // 1 4 _ 9 + for ( int j = this->n; j > keypos; j-- ) // make room + b->k(j) = b->k(j-1); + } + + getDur().declareWriteIntent(&b->emptySize, sizeof(this->emptySize)+sizeof(this->topSize)+sizeof(this->n)); + b->emptySize -= sizeof(_KeyNode); + b->n++; + + // This _KeyNode was marked for writing above. + _KeyNode& kn = b->k(keypos); + kn.prevChildBucket.Null(); + kn.recordLoc = recordLoc; + kn.setKeyDataOfs((short) b->_alloc(key.dataSize()) ); + char *p = b->dataAt(kn.keyDataOfs()); + getDur().declareWriteIntent(p, key.dataSize()); + memcpy(p, key.data(), key.dataSize()); + return true; + } + + /** + * With this implementation, refPos == 0 disregards effect of refPos. + * index > 0 prevents creation of an empty bucket. + */ + template< class V > + bool BucketBasics<V>::mayDropKey( int index, int refPos ) const { + return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull(); + } + + template< class V > + int BucketBasics<V>::packedDataSize( int refPos ) const { + if ( this->flags & Packed ) { + return V::BucketSize - this->emptySize - headerSize(); + } + int size = 0; + for( int j = 0; j < this->n; ++j ) { + if ( mayDropKey( j, refPos ) ) { + continue; + } + size += keyNode( j ).key.dataSize() + sizeof( _KeyNode ); + } + return size; + } + + /** + * when we delete things we just leave empty space until the node is + * full and then we repack it. + */ + template< class V > + void BucketBasics<V>::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { + if ( this->flags & Packed ) + return; + + VERIFYTHISLOC + + /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl. + an empirical approach is probably best than just adding new code : perhaps the bucket would need + declaration anyway within the group commit interval, in which case we would just be adding + code and complexity without benefit. 
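+ (illustrative) as written, the btreemod<V>() call below appears to declare
+ write intent for the whole bucket regardless of how much _packReadyForMod()
+ actually moves.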
+ */ + thisLoc.btreemod<V>()->_packReadyForMod(order, refPos); + } + + /** version when write intent already declared */ + template< class V > + void BucketBasics<V>::_packReadyForMod( const Ordering &order, int &refPos ) { + assertWritable(); + + if ( this->flags & Packed ) + return; + + int tdz = totalDataSize(); + char temp[V::BucketSize]; + int ofs = tdz; + this->topSize = 0; + int i = 0; + for ( int j = 0; j < this->n; j++ ) { + if( mayDropKey( j, refPos ) ) { + continue; // key is unused and has no children - drop it + } + if( i != j ) { + if ( refPos == j ) { + refPos = i; // i < j so j will never be refPos again + } + k( i ) = k( j ); + } + short ofsold = k(i).keyDataOfs(); + int sz = keyNode(i).key.dataSize(); + ofs -= sz; + this->topSize += sz; + memcpy(temp+ofs, dataAt(ofsold), sz); + k(i).setKeyDataOfsSavingUse( ofs ); + ++i; + } + if ( refPos == this->n ) { + refPos = i; + } + this->n = i; + int dataUsed = tdz - ofs; + memcpy(this->data + ofs, temp + ofs, dataUsed); + + // assertWritable(); + // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this)); + + this->emptySize = tdz - dataUsed - this->n * sizeof(_KeyNode); + { + int foo = this->emptySize; + assert( foo >= 0 ); + } + + setPacked(); + + assertValid( order ); + } + + template< class V > + inline void BucketBasics<V>::truncateTo(int N, const Ordering &order, int &refPos) { + d.dbMutex.assertWriteLocked(); + assertWritable(); + + this->n = N; + setNotPacked(); + _packReadyForMod( order, refPos ); + } + + /** + * In the standard btree algorithm, we would split based on the + * existing keys _and_ the new key. But that's more work to + * implement, so we split the existing keys and then add the new key. + * + * There are several published heuristic algorithms for doing splits, + * but basically what you want are (1) even balancing between the two + * sides and (2) a small split key so the parent can have a larger + * branching factor. + * + * We just have a simple algorithm right now: if a key includes the + * halfway point (or 10% way point) in terms of bytes, split on that key; + * otherwise split on the key immediately to the left of the halfway + * point (or 10% point). + * + * This function is expected to be called on a packed bucket. + */ + template< class V > + int BucketBasics<V>::splitPos( int keypos ) const { + assert( this->n > 2 ); + int split = 0; + int rightSize = 0; + // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split. + // see SERVER-983 + // TODO I think we only want to do the 90% split on the rhs node of the tree. + int rightSizeLimit = ( this->topSize + sizeof( _KeyNode ) * this->n ) / ( keypos == this->n ? 
10 : 2 ); + for( int i = this->n - 1; i > -1; --i ) { + rightSize += keyNode( i ).key.dataSize() + sizeof( _KeyNode ); + if ( rightSize > rightSizeLimit ) { + split = i; + break; + } + } + // safeguards - we must not create an empty bucket + if ( split < 1 ) { + split = 1; + } + else if ( split > this->n - 2 ) { + split = this->n - 2; + } + + return split; + } + + template< class V > + void BucketBasics<V>::reserveKeysFront( int nAdd ) { + assert( this->emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); + this->emptySize -= sizeof( _KeyNode ) * nAdd; + for( int i = this->n - 1; i > -1; --i ) { + k( i + nAdd ) = k( i ); + } + this->n += nAdd; + } + + template< class V > + void BucketBasics<V>::setKey( int i, const DiskLoc recordLoc, const Key &key, const DiskLoc prevChildBucket ) { + _KeyNode &kn = k( i ); + kn.recordLoc = recordLoc; + kn.prevChildBucket = prevChildBucket; + short ofs = (short) _alloc( key.dataSize() ); + kn.setKeyDataOfs( ofs ); + char *p = dataAt( ofs ); + memcpy( p, key.data(), key.dataSize() ); + } + + template< class V > + void BucketBasics<V>::dropFront( int nDrop, const Ordering &order, int &refpos ) { + for( int i = nDrop; i < this->n; ++i ) { + k( i - nDrop ) = k( i ); + } + this->n -= nDrop; + setNotPacked(); + _packReadyForMod( order, refpos ); + } + + /* - BtreeBucket --------------------------------------------------- */ + + /** @return largest key in the subtree. */ + template< class V > + void BtreeBucket<V>::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { + DiskLoc loc = thisLoc; + while ( 1 ) { + const BtreeBucket *b = loc.btree<V>(); + if ( !b->nextChild.isNull() ) { + loc = b->nextChild; + continue; + } + + assert(b->n>0); + largestLoc = loc; + largestKey = b->n-1; + + break; + } + } + + /** + * NOTE Currently the Ordering implementation assumes a compound index will + * not have more keys than an unsigned variable has bits. The same + * assumption is used in the implementation below with respect to the 'mask' + * variable. 
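+ *
+ * e.g. (illustrative) for a compound ordering { a:1, b:-1 } the bit for 'b'
+ * is set, so o.descending( mask ) is true on the second iteration and that
+ * field's comparison result is negated.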
+ * + * @param l a regular bsonobj + * @param rBegin composed partly of an existing bsonobj, and the remaining keys are taken from a vector of elements that frequently changes + * + * see + * jstests/index_check6.js + * https://jira.mongodb.org/browse/SERVER-371 + */ + /* static */ + template< class V > + int BtreeBucket<V>::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { + BSONObjIterator ll( l ); + BSONObjIterator rr( rBegin ); + vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); + vector< bool >::const_iterator inc = rEndInclusive.begin(); + unsigned mask = 1; + for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { + BSONElement lll = ll.next(); + BSONElement rrr = rr.next(); + ++rr2; + ++inc; + + int x = lll.woCompare( rrr, false ); + if ( o.descending( mask ) ) + x = -x; + if ( x != 0 ) + return x; + } + if ( rSup ) { + return -direction; + } + for( ; ll.more(); mask <<= 1 ) { + BSONElement lll = ll.next(); + BSONElement rrr = **rr2; + ++rr2; + int x = lll.woCompare( rrr, false ); + if ( o.descending( mask ) ) + x = -x; + if ( x != 0 ) + return x; + if ( !*inc ) { + return -direction; + } + ++inc; + } + return 0; + } + + template< class V > + bool BtreeBucket<V>::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const { + int pos; + bool found; + DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); + + // skip unused keys + while ( 1 ) { + if( b.isNull() ) + break; + const BtreeBucket *bucket = b.btree<V>(); + const _KeyNode& kn = bucket->k(pos); + if ( kn.isUsed() ) + return bucket->keyAt(pos).woEqual(key); + b = bucket->advance(b, pos, 1, "BtreeBucket<V>::exists"); + } + return false; + } + + template< class V > + bool BtreeBucket<V>::wouldCreateDup( + const IndexDetails& idx, const DiskLoc &thisLoc, + const Key& key, const Ordering& order, + const DiskLoc &self) const { + int pos; + bool found; + DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); + + while ( !b.isNull() ) { + // we skip unused keys + const BtreeBucket *bucket = b.btree<V>(); + const _KeyNode& kn = bucket->k(pos); + if ( kn.isUsed() ) { + if( bucket->keyAt(pos).woEqual(key) ) + return kn.recordLoc != self; + break; + } + b = bucket->advance(b, pos, 1, "BtreeBucket<V>::dupCheck"); + } + + return false; + } + + template< class V > + string BtreeBucket<V>::dupKeyError( const IndexDetails& idx , const Key& key ) { + stringstream ss; + ss << "E11000 duplicate key error "; + ss << "index: " << idx.indexNamespace() << " "; + ss << "dup key: " << key.toString(); + return ss.str(); + } + + /** + * Find a key withing this btree bucket. + * + * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the + * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, + * our performance is still good. + * + * assertIfDup: if the key exists (ignoring the recordLoc), uassert + * + * pos: for existing keys k0...kn-1. + * returns # it goes BEFORE. so key[pos-1] < key < key[pos] + * returns n if it goes after the last existing key. + * note result might be an Unused location! 
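+ *
+ * e.g. (illustrative) with existing keys {3, 5, 7}, find() of 6 returns false
+ * with pos == 2, i.e. 6 sorts immediately before key[2] == 7.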
+ */ + + bool guessIncreasing = false; + template< class V > + bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl, + const Ordering &order, int& pos, bool assertIfDup) const { + Loc recordLoc; + recordLoc = rl; + globalIndexCounters.btree( (char*)this ); + + // binary search for this key + bool dupsChecked = false; + int l=0; + int h=this->n-1; + int m = (l+h)/2; + if( guessIncreasing ) { + m = h; + } + while ( l <= h ) { + KeyNode M = this->keyNode(m); + int x = key.woCompare(M.key, order); + if ( x == 0 ) { + if( assertIfDup ) { + if( k(m).isUnused() ) { + // ok that key is there if unused. but we need to check that there aren't other + // entries for the key then. as it is very rare that we get here, we don't put any + // coding effort in here to make this particularly fast + if( !dupsChecked ) { + dupsChecked = true; + if( idx.head.btree<V>()->exists(idx, idx.head, key, order) ) { + if( idx.head.btree<V>()->wouldCreateDup(idx, idx.head, key, order, recordLoc) ) + uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); + else + alreadyInIndex(); + } + } + } + else { + if( M.recordLoc == recordLoc ) + alreadyInIndex(); + uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); + } + } + + // dup keys allowed. use recordLoc as if it is part of the key + Loc unusedRL = M.recordLoc; + unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up + x = recordLoc.compare(unusedRL); + } + if ( x < 0 ) // key < M.key + h = m-1; + else if ( x > 0 ) + l = m+1; + else { + // found it. + pos = m; + return true; + } + m = (l+h)/2; + } + // not found + pos = l; + if ( pos != this->n ) { + Key keyatpos = keyNode(pos).key; + wassert( key.woCompare(keyatpos, order) <= 0 ); + if ( pos > 0 ) { + if( !( keyNode(pos-1).key.woCompare(key, order) <= 0 ) ) { + DEV { + log() << key.toString() << endl; + log() << keyNode(pos-1).key.toString() << endl; + } + wassert(false); + } + } + } + + return false; + } + + template< class V > + void BtreeBucket<V>::delBucket(const DiskLoc thisLoc, const IndexDetails& id) { + ClientCursor::informAboutToDeleteBucket(thisLoc); // slow... + assert( !isHead() ); + + DiskLoc ll = this->parent; + const BtreeBucket *p = ll.btree<V>(); + int parentIdx = indexInParent( thisLoc ); + p->childForPos( parentIdx ).writing().Null(); + deallocBucket( thisLoc, id ); + } + + template< class V > + void BtreeBucket<V>::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) { +#if 0 + // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete + // it (meaning it is ineligible for reuse). + memset(this, 0, Size()); +#else + // defensive: + this->n = -1; + this->parent.Null(); + string ns = id.indexNamespace(); + theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc); +#endif + } + + /** note: may delete the entire bucket! this invalid upon return sometimes. */ + template< class V > + void BtreeBucket<V>::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) { + assert(this->n>0); + DiskLoc left = this->childForPos(p); + + if ( this->n == 1 ) { + if ( left.isNull() && this->nextChild.isNull() ) { + this->_delKeyAtPos(p); + if ( isHead() ) { + // we don't delete the top bucket ever + } + else { + if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) { + // An empty bucket is only allowed as a transient state. If + // there are no neighbors to balance with, we delete ourself. + // This condition is only expected in legacy btrees. 
+ delBucket(thisLoc, id); + } + } + return; + } + deleteInternalKey( thisLoc, p, id, order ); + return; + } + + if ( left.isNull() ) { + this->_delKeyAtPos(p); + mayBalanceWithNeighbors( thisLoc, id, order ); + } + else { + deleteInternalKey( thisLoc, p, id, order ); + } + } + + /** + * This function replaces the specified key (k) by either the prev or next + * key in the btree (k'). We require that k have either a left or right + * child. If k has a left child, we set k' to the prev key of k, which must + * be a leaf present in the left child. If k does not have a left child, we + * set k' to the next key of k, which must be a leaf present in the right + * child. When we replace k with k', we copy k' over k (which may cause a + * split) and then remove k' from its original location. Because k' is + * stored in a descendant of k, replacing k by k' will not modify the + * storage location of the original k', and we can easily remove k' from + * its original location. + * + * This function is only needed in cases where k has a left or right child; + * in other cases a simpler key removal implementation is possible. + * + * NOTE on legacy btree structures: + * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by + * marking it as an unused node rather than replacing it with k'. Also, k' + * may be a leaf but marked as an unused node. In such a case we replace + * k by k', preserving the key's unused marking. This function is only + * expected to mark a key as unused when handling a legacy btree. + */ + template< class V > + void BtreeBucket<V>::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = this->childForPos( keypos ); + DiskLoc rchild = this->childForPos( keypos + 1 ); + assert( !lchild.isNull() || !rchild.isNull() ); + int advanceDirection = lchild.isNull() ? 1 : -1; + int advanceKeyOfs = keypos; + DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ ); + // advanceLoc must be a descendant of thisLoc, because thisLoc has a + // child in the proper direction and all descendants of thisLoc must be + // nonempty because they are not the root.
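+ // e.g. (illustrative): deleting an internal key k that has a left child
+ // advances to k's predecessor k' (the max key of the left subtree), copies
+ // k' over k via setInternalKey() below, then deletes k' from its leaf.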
+ + if ( !advanceLoc.btree<V>()->childForPos( advanceKeyOfs ).isNull() || + !advanceLoc.btree<V>()->childForPos( advanceKeyOfs + 1 ).isNull() ) { + // only expected with legacy btrees, see note above + this->markUnused( keypos ); + return; + } + + KeyNode kn = advanceLoc.btree<V>()->keyNode( advanceKeyOfs ); + // Because advanceLoc is a descendant of thisLoc, updating thisLoc will + // not affect packing or keys of advanceLoc and kn will be stable + // during the following setInternalKey() + setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, this->childForPos( keypos ), this->childForPos( keypos + 1 ), id ); + advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order ); + } + +//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>()) +#define BTREE(loc) (loc.template btree<V>()) +//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>()) +#define BTREEMOD(loc) (loc.template btreemod<V>()) + + template< class V > + void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) { + assert( this->n == 0 && !this->nextChild.isNull() ); + if ( this->parent.isNull() ) { + assert( id.head == thisLoc ); + id.head.writing() = this->nextChild; + } + else { + DiskLoc ll = this->parent; + ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild; + } + BTREE(this->nextChild)->parent.writing() = this->parent; + ClientCursor::informAboutToDeleteBucket( thisLoc ); + deallocBucket( thisLoc, id ); + } + + template< class V > + bool BtreeBucket<V>::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { + assert( leftIndex >= 0 && leftIndex < this->n ); + DiskLoc leftNodeLoc = this->childForPos( leftIndex ); + DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 ); + if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) { + // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway + return false; + } + int pos = 0; + { + const BtreeBucket *l = leftNodeLoc.btree<V>(); + const BtreeBucket *r = rightNodeLoc.btree<V>(); + if ( ( this->headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.dataSize() + sizeof(_KeyNode) > unsigned( V::BucketSize ) ) ) { + return false; + } + } + return true; + } + + /** + * This implementation must respect the meaning and value of lowWaterMark. + * Also see comments in splitPos(). + */ + template< class V > + int BtreeBucket<V>::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { + int split = -1; + int rightSize = 0; + const BtreeBucket *l = BTREE(this->childForPos( leftIndex )); + const BtreeBucket *r = BTREE(this->childForPos( leftIndex + 1 )); + + int KNS = sizeof( _KeyNode ); + int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.dataSize() + KNS + r->topSize + r->n * KNS ) / 2; + // This constraint should be ensured by only calling this function + // if we go below the low water mark. 
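+ // e.g. (illustrative): with l->n == 4 and r->n == 2 the candidate separator
+ // positions run 0..6; 0..3 index l's keys, 4 is the existing separator key
+ // in this bucket, and 5..6 index r's keys.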
+ assert( rightSizeLimit < BtreeBucket<V>::bodySize() ); + for( int i = r->n - 1; i > -1; --i ) { + rightSize += r->keyNode( i ).key.dataSize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = l->n + 1 + i; + break; + } + } + if ( split == -1 ) { + rightSize += keyNode( leftIndex ).key.dataSize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = l->n; + } + } + if ( split == -1 ) { + for( int i = l->n - 1; i > -1; --i ) { + rightSize += l->keyNode( i ).key.dataSize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = i; + break; + } + } + } + // safeguards - we must not create an empty bucket + if ( split < 1 ) { + split = 1; + } + else if ( split > l->n + 1 + r->n - 2 ) { + split = l->n + 1 + r->n - 2; + } + + return split; + } + + template< class V > + void BtreeBucket<V>::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc leftNodeLoc = this->childForPos( leftIndex ); + DiskLoc rightNodeLoc = this->childForPos( leftIndex + 1 ); + BtreeBucket *l = leftNodeLoc.btreemod<V>(); + BtreeBucket *r = rightNodeLoc.btreemod<V>(); + int pos = 0; + l->_packReadyForMod( order, pos ); + r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys + + // We know the additional keys below will fit in l because canMergeChildren() + // must be true. + int oldLNum = l->n; + { + KeyNode kn = keyNode( leftIndex ); + l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child + } + for( int i = 0; i < r->n; ++i ) { + KeyNode kn = r->keyNode( i ); + l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); + } + l->nextChild = r->nextChild; + l->fixParentPtrs( leftNodeLoc, oldLNum ); + r->delBucket( rightNodeLoc, id ); + this->childForPos( leftIndex + 1 ) = leftNodeLoc; + this->childForPos( leftIndex ) = DiskLoc(); + this->_delKeyAtPos( leftIndex, true ); + if ( this->n == 0 ) { + // will trash this and thisLoc + // TODO To ensure all leaves are of equal height, we should ensure + // this is only called on the root. + replaceWithNextChild( thisLoc, id ); + } + else { + // balance recursively - maybe we should do this even when n == 0? + mayBalanceWithNeighbors( thisLoc, id, order ); + } + } + + template< class V > + int BtreeBucket<V>::indexInParent( const DiskLoc &thisLoc ) const { + assert( !this->parent.isNull() ); + const BtreeBucket *p = BTREE(this->parent); + if ( p->nextChild == thisLoc ) { + return p->n; + } + else { + for( int i = 0; i < p->n; ++i ) { + if ( p->k( i ).prevChildBucket == thisLoc ) { + return i; + } + } + } + out() << "ERROR: can't find ref to child bucket.\n"; + out() << "child: " << thisLoc << "\n"; + dump(); + out() << "Parent: " << this->parent << "\n"; + p->dump(); + assert(false); + return -1; // just to compile + } + + template< class V > + bool BtreeBucket<V>::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const { + // If we can merge, then we must merge rather than balance to preserve + // bucket utilization constraints. 
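+ // (illustrative) balancing only redistributes keys between the two children
+ // around a new separator; merging, which the caller performs when this
+ // returns false, collapses both children and the separator into one bucket.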
+ if ( canMergeChildren( thisLoc, leftIndex ) ) { + return false; + } + thisLoc.btreemod<V>()->doBalanceChildren( thisLoc, leftIndex, id, order ); + return true; + } + + template< class V > + void BtreeBucket<V>::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ) { + // TODO maybe do some audits the same way pushBack() does? + // As a precondition, rchild + the old separator are <= half a body size, + // and lchild is at most completely full. Based on the value of split, + // rchild will get <= half of the total bytes which is at most 75% + // of a full body. So rchild will have room for the following keys: + int rAdd = l->n - split; + r->reserveKeysFront( rAdd ); + for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) { + KeyNode kn = l->keyNode( i ); + r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket ); + } + { + KeyNode kn = keyNode( leftIndex ); + r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child + } + r->fixParentPtrs( rchild, 0, rAdd - 1 ); + { + KeyNode kn = l->keyNode( split ); + l->nextChild = kn.prevChildBucket; + // Because lchild is a descendant of thisLoc, updating thisLoc will + // not affect packing or keys of lchild and kn will be stable + // during the following setInternalKey() + setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); + } + int zeropos = 0; + // lchild and rchild cannot be merged, so there must be >0 (actually more) + // keys to the left of split. + l->truncateTo( split, order, zeropos ); + } + + template< class V > + void BtreeBucket<V>::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ) { + // As a precondition, lchild + the old separator are <= half a body size, + // and rchild is at most completely full. Based on the value of split, + // lchild will get less than half of the total bytes which is at most 75% + // of a full body. So lchild will have room for the following keys: + int lN = l->n; + { + KeyNode kn = keyNode( leftIndex ); + l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child + } + for( int i = 0; i < split - lN - 1; ++i ) { + KeyNode kn = r->keyNode( i ); + l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); + } + { + KeyNode kn = r->keyNode( split - lN - 1 ); + l->nextChild = kn.prevChildBucket; + // Child lN was lchild's old nextChild, and don't need to fix that one. + l->fixParentPtrs( lchild, lN + 1, l->n ); + // Because rchild is a descendant of thisLoc, updating thisLoc will + // not affect packing or keys of rchild and kn will be stable + // during the following setInternalKey() + setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); + } + int zeropos = 0; + // lchild and rchild cannot be merged, so there must be >0 (actually more) + // keys to the right of split. 
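+ // (illustrative) l absorbed the old separator plus r's first
+ // (split - lN - 1) keys, and r->keyNode( split - lN - 1 ) became the new
+ // separator, so exactly (split - lN) leading slots of r are consumed here.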
+ r->dropFront( split - lN, order, zeropos ); + } + + template< class V > + void BtreeBucket<V>::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = this->childForPos( leftIndex ); + DiskLoc rchild = this->childForPos( leftIndex + 1 ); + int zeropos = 0; + BtreeBucket *l = lchild.btreemod<V>(); + l->_packReadyForMod( order, zeropos ); + BtreeBucket *r = rchild.btreemod<V>(); + r->_packReadyForMod( order, zeropos ); + int split = rebalancedSeparatorPos( thisLoc, leftIndex ); + + // By definition, if we are below the low water mark and cannot merge + // then we must actively balance. + assert( split != l->n ); + if ( split < l->n ) { + doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); + } + else { + doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); + } + } + + template< class V > + bool BtreeBucket<V>::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const { + if ( this->parent.isNull() ) { // we are root, there are no neighbors + return false; + } + + if ( this->packedDataSize( 0 ) >= this->lowWaterMark() ) { + return false; + } + + const BtreeBucket *p = BTREE(this->parent); + int parentIdx = indexInParent( thisLoc ); + + // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so? + bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() ); + bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() ); + + // Balance if possible on one side - we merge only if absolutely necessary + // to preserve btree bucket utilization constraints since that's a more + // heavy duty operation (especially if we must re-split later). + if ( mayBalanceRight && + p->tryBalanceChildren( this->parent, parentIdx, id, order ) ) { + return true; + } + if ( mayBalanceLeft && + p->tryBalanceChildren( this->parent, parentIdx - 1, id, order ) ) { + return true; + } + + BtreeBucket *pm = BTREEMOD(this->parent); + if ( mayBalanceRight ) { + pm->doMergeChildren( this->parent, parentIdx, id, order ); + return true; + } + else if ( mayBalanceLeft ) { + pm->doMergeChildren( this->parent, parentIdx - 1, id, order ); + return true; + } + + return false; + } + + /** remove a key from the index */ + template< class V > + bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const { + int pos; + bool found; + const Ordering ord = Ordering::make(id.keyPattern()); + DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1); + if ( found ) { + if ( key.objsize() > this->KeyMax ) { + OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl; + } + loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord); + return true; + } + return false; + } + + template< class V > + inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) { + if ( !child.isNull() ) { + if ( insert_debug ) + out() << " fix " << child.toString() << ".parent=" << thisLoc.toString() << endl; + child.btree<V>()->parent.writing() = thisLoc; + } + } + + /** + * This can cause a lot of additional page writes when we assign buckets to + * different parents. Maybe get rid of parent ptrs? 
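+ * (illustrative) each child shifted into a different bucket needs its parent
+ * pointer rewritten via fix(), i.e. one declared write per child moved.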
+ */ + template< class V > + void BtreeBucket<V>::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { + VERIFYTHISLOC + if ( lastIndex == -1 ) { + lastIndex = this->n; + } + for ( int i = firstIndex; i <= lastIndex; i++ ) { + fix(thisLoc, this->childForPos(i)); + } + } + + template< class V > + void BtreeBucket<V>::setInternalKey( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const Key &key, const Ordering &order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) { + this->childForPos( keypos ).Null(); + + // This may leave the bucket empty (n == 0) which is ok only as a + // transient state. In the instant case, the implementation of + // insertHere behaves correctly when n == 0 and as a side effect + // increments n. + this->_delKeyAtPos( keypos, true ); + + // Ensure we do not orphan neighbor's old child. + assert( this->childForPos( keypos ) == rchild ); + + // Just set temporarily - required to pass validation in insertHere() + this->childForPos( keypos ) = lchild; + + insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx ); + } + + /** + * insert a key in this bucket, splitting if necessary. + * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. + * NOTE this function may free some data, and as a result the value passed for keypos may + * be invalid after calling insertHere() + * + * Some of the write intent signaling below relies on the implementation of + * the optimized write intent code in basicInsert(). + */ + template< class V > + void BtreeBucket<V>::insertHere( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const Key& key, const Ordering& order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const { + if ( insert_debug ) + out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' ' + << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; + + if ( !this->basicInsert(thisLoc, keypos, recordLoc, key, order) ) { + // If basicInsert() fails, the bucket will be packed as required by split(). 
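+ // (illustrative) even after packing, the bucket cannot hold the key:
+ // split() divides the bucket, promotes the split key to the parent, and
+ // re-runs the insert in whichever half now covers keypos.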
+ thisLoc.btreemod<V>()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx); + return; + } + + { + const _KeyNode *_kn = &k(keypos); + _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert() + if ( keypos+1 == this->n ) { // last key + if ( this->nextChild != lchild ) { + out() << "ERROR nextChild != lchild" << endl; + out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; + out() << " keyPos: " << keypos << " n:" << this->n << endl; + out() << " nextChild: " << this->nextChild.toString() << " lchild: " << lchild.toString() << endl; + out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; + out() << " key: " << key.toString() << endl; + dump(); + assert(false); + } + kn->prevChildBucket = this->nextChild; + assert( kn->prevChildBucket == lchild ); + this->nextChild.writing() = rchild; + if ( !rchild.isNull() ) + BTREE(rchild)->parent.writing() = thisLoc; + } + else { + kn->prevChildBucket = lchild; + if ( k(keypos+1).prevChildBucket != lchild ) { + out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl; + out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; + out() << " keyPos: " << keypos << " n:" << this->n << endl; + out() << " k(keypos+1).pcb: " << k(keypos+1).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl; + out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; + out() << " key: " << key.toString() << endl; + dump(); + assert(false); + } + const Loc *pc = &k(keypos+1).prevChildBucket; + *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert() + if ( !rchild.isNull() ) + rchild.btree<V>()->parent.writing() = thisLoc; + } + return; + } + } + + template< class V > + void BtreeBucket<V>::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const Key& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) { + this->assertWritable(); + + if ( split_debug ) + out() << " " << thisLoc.toString() << ".split" << endl; + + int split = this->splitPos( keypos ); + DiskLoc rLoc = addBucket(idx); + BtreeBucket *r = rLoc.btreemod<V>(); + if ( split_debug ) + out() << " split:" << split << ' ' << keyNode(split).key.toString() << " n:" << this->n << endl; + for ( int i = split+1; i < this->n; i++ ) { + KeyNode kn = keyNode(i); + r->pushBack(kn.recordLoc, kn.key, order, kn.prevChildBucket); + } + r->nextChild = this->nextChild; + r->assertValid( order ); + + if ( split_debug ) + out() << " new rLoc:" << rLoc.toString() << endl; + r = 0; + rLoc.btree<V>()->fixParentPtrs(rLoc); + + { + KeyNode splitkey = keyNode(split); + this->nextChild = splitkey.prevChildBucket; // splitkey key gets promoted, its children will be thisLoc (l) and rLoc (r) + if ( split_debug ) { + out() << " splitkey key:" << splitkey.key.toString() << endl; + } + + // Because thisLoc is a descendant of parent, updating parent will + // not affect packing or keys of thisLoc and splitkey will be stable + // during the following: + + // promote splitkey to a parent this->node + if ( this->parent.isNull() ) { + // make a new parent if we were the root + DiskLoc L = addBucket(idx); + BtreeBucket *p = L.btreemod<V>(); + p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc); + p->nextChild = rLoc; + p->assertValid( order ); + this->parent = idx.head.writing() = L; + if ( split_debug ) + out() << " we were root, making 
new root:" << hex << this->parent.getOfs() << dec << endl; + rLoc.btree<V>()->parent.writing() = this->parent; + } + else { + // set this before calling _insert - if it splits it will do fixParent() logic and change the value. + rLoc.btree<V>()->parent.writing() = this->parent; + if ( split_debug ) + out() << " promoting splitkey key " << splitkey.key.toString() << endl; + BTREE(this->parent)->_insert(this->parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx); + } + } + + int newpos = keypos; + // note this may trash splitkey.key. thus we had to promote it before finishing up here. + this->truncateTo(split, order, newpos); + + // add our this->new key, there is room this->now + { + if ( keypos <= split ) { + if ( split_debug ) + out() << " keypos<split, insertHere() the new key" << endl; + insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx); + } + else { + int kp = keypos-split-1; + assert(kp>=0); + BTREE(rLoc)->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx); + } + } + + if ( split_debug ) + out() << " split end " << hex << thisLoc.getOfs() << dec << endl; + } + + /** start a new index off, empty */ + template< class V > + DiskLoc BtreeBucket<V>::addBucket(const IndexDetails& id) { + string ns = id.indexNamespace(); + DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, V::BucketSize, true); + BtreeBucket *b = BTREEMOD(loc); + b->init(); + return loc; + } + + void renameIndexNamespace(const char *oldNs, const char *newNs) { + renameNamespace( oldNs, newNs ); + } + + template< class V > + const DiskLoc BtreeBucket<V>::getHead(const DiskLoc& thisLoc) const { + DiskLoc p = thisLoc; + while ( !BTREE(p)->isHead() ) + p = BTREE(p)->parent; + return p; + } + + template< class V > + DiskLoc BtreeBucket<V>::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { + if ( keyOfs < 0 || keyOfs >= this->n ) { + out() << "ASSERT failure BtreeBucket<V>::advance, caller: " << caller << endl; + out() << " thisLoc: " << thisLoc.toString() << endl; + out() << " keyOfs: " << keyOfs << " n:" << this->n << " direction: " << direction << endl; + out() << bucketSummary() << endl; + assert(false); + } + int adj = direction < 0 ? 1 : 0; + int ko = keyOfs + direction; + DiskLoc nextDown = this->childForPos(ko+adj); + if ( !nextDown.isNull() ) { + while ( 1 ) { + keyOfs = direction>0 ? 0 : BTREE(nextDown)->n - 1; + DiskLoc loc = BTREE(nextDown)->childForPos(keyOfs + adj); + if ( loc.isNull() ) + break; + nextDown = loc; + } + return nextDown; + } + + if ( ko < this->n && ko >= 0 ) { + keyOfs = ko; + return thisLoc; + } + + // end of bucket. traverse back up. 
+ DiskLoc childLoc = thisLoc; + DiskLoc ancestor = this->parent; + while ( 1 ) { + if ( ancestor.isNull() ) + break; + const BtreeBucket *an = BTREE(ancestor); + for ( int i = 0; i < an->n; i++ ) { + if ( an->childForPos(i+adj) == childLoc ) { + keyOfs = i; + return ancestor; + } + } + assert( direction<0 || an->nextChild == childLoc ); + // parent exhausted also, keep going up + childLoc = ancestor; + ancestor = an->parent; + } + + return DiskLoc(); + } + + template< class V > + DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { + KeyOwned k(key); + return locate(idx, thisLoc, k, order, pos, found, recordLoc, direction); + } + + template< class V > + DiskLoc BtreeBucket<V>::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const Key& key, const Ordering &order, int& pos, bool& found, const DiskLoc &recordLoc, int direction) const { + int p; + found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false); + if ( found ) { + pos = p; + return thisLoc; + } + + DiskLoc child = this->childForPos(p); + + if ( !child.isNull() ) { + DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction); + if ( !l.isNull() ) + return l; + } + + pos = p; + if ( direction < 0 ) + return --pos == -1 ? DiskLoc() /*theend*/ : thisLoc; + else + return pos == this->n ? DiskLoc() /*theend*/ : thisLoc; + } + + template< class V > + bool BtreeBucket<V>::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) { + const BtreeBucket<V> * bucket = BTREE(thisLoc); + while( 1 ) { + if ( l + 1 == h ) { + keyOfs = ( direction > 0 ) ? 
h : l; + DiskLoc next = bucket->k( h ).prevChildBucket; + if ( !next.isNull() ) { + bestParent = make_pair( thisLoc, keyOfs ); + thisLoc = next; + return true; + } + else { + return false; + } + } + int m = l + ( h - l ) / 2; + int cmp = customBSONCmp( bucket->keyNode( m ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); + if ( cmp < 0 ) { + l = m; + } + else if ( cmp > 0 ) { + h = m; + } + else { + if ( direction < 0 ) { + l = m; + } + else { + h = m; + } + } + } + } + + /** + * find smallest/biggest value greater-equal/less-equal than specified + * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd + * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient + */ + template< class V > + void BtreeBucket<V>::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const { + int l,h; + bool dontGoUp; + if ( direction > 0 ) { + l = keyOfs; + h = this->n - 1; + dontGoUp = ( customBSONCmp( keyNode( h ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + } + else { + l = 0; + h = keyOfs; + dontGoUp = ( customBSONCmp( keyNode( l ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); + } + pair< DiskLoc, int > bestParent; + if ( dontGoUp ) { + // this comparison result assures h > l + if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { + return; + } + } + else { + // go up parents until rightmost/leftmost node is >=/<= target or at top + while( !BTREE(thisLoc)->parent.isNull() ) { + thisLoc = BTREE(thisLoc)->parent; + if ( direction > 0 ) { + if ( customBSONCmp( BTREE(thisLoc)->keyNode( BTREE(thisLoc)->n - 1 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { + break; + } + } + else { + if ( customBSONCmp( BTREE(thisLoc)->keyNode( 0 ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { + break; + } + } + } + } + customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent ); + } + + /** @param thisLoc in/out param. perhaps thisLoc isn't the best name given that. It is used by advanceTo, which skips + from one key to another key without necessarily checking all the keys + between them in the btree (it can skip to different btree buckets). + The advanceTo function can get called a lot, and for the different targets + we want to advance to, we don't want to create a bson obj in a new + buffer each time we call that function. The + customLocate function is necessary for advanceTo, and does the same thing + as the normal locate function but takes basically the same arguments + as advanceTo.
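+ e.g. (illustrative) an index scan that needs to skip a range of keys can
+ call advanceTo() to jump directly toward the next qualifying key, possibly
+ landing in a different bucket, without materializing each intermediate key.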
+ */ + template< class V > + void BtreeBucket<V>::customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, + const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, + const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) { + dassert( direction == 1 || direction == -1 ); + const BtreeBucket<V> *bucket = BTREE(locInOut); + if ( bucket->n == 0 ) { + locInOut = DiskLoc(); + return; + } + // go down until find smallest/biggest >=/<= target + while( 1 ) { + int l = 0; + int h = bucket->n - 1; + + // +direction: 0, -direction: h + int z = (1-direction)/2*h; + + // leftmost/rightmost key may possibly be >=/<= search key + int res = customBSONCmp( bucket->keyNode( z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); + bool firstCheck = direction*res >= 0; + + if ( firstCheck ) { + DiskLoc next; + keyOfs = z; + if ( direction > 0 ) { + dassert( z == 0 ); + next = bucket->k( 0 ).prevChildBucket; + } + else { + next = bucket->nextChild; + } + if ( !next.isNull() ) { + bestParent = pair< DiskLoc, int >( locInOut, keyOfs ); + locInOut = next; + bucket = BTREE(locInOut); + continue; + } + else { + return; + } + } + + res = customBSONCmp( bucket->keyNode( h-z ).key.toBson(), keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); + bool secondCheck = direction*res < 0; + + if ( secondCheck ) { + DiskLoc next; + if ( direction > 0 ) { + next = bucket->nextChild; + } + else { + next = bucket->k( 0 ).prevChildBucket; + } + if ( next.isNull() ) { + // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc() + locInOut = bestParent.first; + keyOfs = bestParent.second; + return; + } + else { + locInOut = next; + bucket = BTREE(locInOut); + continue; + } + } + + if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, locInOut, keyOfs, bestParent ) ) { + return; + } + bucket = BTREE(locInOut); + } + } + + /** @thisLoc disk location of *this */ + template< class V > + void BtreeBucket<V>::insertStepOne(DiskLoc thisLoc, + Continuation<V>& c, + bool dupsAllowed) const { + dassert( c.key.dataSize() <= this->KeyMax ); + assert( c.key.dataSize() > 0 ); + + int pos; + bool found = find(c.idx, c.key, c.recordLoc, c.order, pos, !dupsAllowed); + + if ( found ) { + const _KeyNode& kn = k(pos); + if ( kn.isUnused() ) { + log(4) << "btree _insert: reusing unused key" << endl; + c.b = this; + c.pos = pos; + c.op = Continuation<V>::SetUsed; + return; + } + + DEV { + log() << "_insert(): key already exists in index (ok for background:true)\n"; + log() << " " << c.idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; + log() << " " << c.key.toString() << '\n'; + log() << " " << "recordLoc:" << c.recordLoc.toString() << " pos:" << pos << endl; + log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl; + } + alreadyInIndex(); + } + + Loc ch = this->childForPos(pos); + DiskLoc child = ch; + + if ( child.isNull() ) { + // A this->new key will be inserted at the same tree height as an adjacent existing key. 
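+        // (How the two steps pair up — a hypothetical caller, illustration
+        //  only: step one merely records the landing bucket/pos/op in the
+        //  Continuation; Continuation::stepTwo() later applies it under a
+        //  write lock:
+        //      Continuation<V1> cont( idx.head, recordLoc, keyBson, order, idx );
+        //      idx.head.btree<V1>()->twoStepInsert( idx.head, cont, dupsAllowed );
+        //      cont.stepTwo();
+        //  )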
+ c.bLoc = thisLoc; + c.b = this; + c.pos = pos; + c.op = Continuation<V>::InsertHere; + return; + } + + child.btree<V>()->insertStepOne(child, c, dupsAllowed); + } + + /** @thisLoc disk location of *this */ + template< class V > + int BtreeBucket<V>::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const Key& key, const Ordering &order, bool dupsAllowed, + const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const { + if ( key.dataSize() > this->KeyMax ) { + problem() << "ERROR: key too large len:" << key.dataSize() << " max:" << this->KeyMax << ' ' << key.dataSize() << ' ' << idx.indexNamespace() << endl; + return 2; + } + assert( key.dataSize() > 0 ); + + int pos; + bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed); + if ( insert_debug ) { + out() << " " << thisLoc.toString() << '.' << "_insert " << + key.toString() << '/' << recordLoc.toString() << + " l:" << lChild.toString() << " r:" << rChild.toString() << endl; + out() << " found:" << found << " pos:" << pos << " n:" << this->n << endl; + } + + if ( found ) { + const _KeyNode& kn = k(pos); + if ( kn.isUnused() ) { + log(4) << "btree _insert: reusing unused key" << endl; + massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull()); + massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull()); + kn.writing().setUsed(); + return 0; + } + + DEV { + log() << "_insert(): key already exists in index (ok for background:true)\n"; + log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; + log() << " " << key.toString() << '\n'; + log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; + log() << " old l r: " << this->childForPos(pos).toString() << ' ' << this->childForPos(pos+1).toString() << endl; + log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; + } + alreadyInIndex(); + } + + DEBUGGING out() << "TEMP: key: " << key.toString() << endl; + Loc ch = this->childForPos(pos); + DiskLoc child = ch; + if ( insert_debug ) + out() << " getChild(" << pos << "): " << child.toString() << endl; + // In current usage, rChild isNull() for a new key and false when we are + // promoting a split key. These are the only two cases where _insert() + // is called currently. + if ( child.isNull() || !rChild.isNull() ) { + // A new key will be inserted at the same tree height as an adjacent existing key. 
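+        // (The two call shapes currently reaching _insert(), per the comment
+        //  above — illustration only:
+        //      _insert(loc, rec, key, order, dups, DiskLoc(), DiskLoc(), idx)   // fresh key
+        //      _insert(loc, rec, key, order, dups, lchild, rchild, idx)         // promoting a split key
+        //  only the first form descends into a non-null child.)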
+ insertHere(thisLoc, pos, recordLoc, key, order, lChild, rChild, idx); + return 0; + } + + return child.btree<V>()->_insert(child, recordLoc, key, order, dupsAllowed, /*lchild*/DiskLoc(), /*rchild*/DiskLoc(), idx); + } + + template< class V > + void BtreeBucket<V>::dump(unsigned depth) const { + string indent = string(depth, ' '); + _log() << "BUCKET n:" << this->n; + _log() << " parent:" << hex << this->parent.getOfs() << dec; + for ( int i = 0; i < this->n; i++ ) { + _log() << '\n' << indent; + KeyNode k = keyNode(i); + string ks = k.key.toString(); + _log() << " " << hex << k.prevChildBucket.getOfs() << '\n'; + _log() << indent << " " << i << ' ' << ks.substr(0, 30) << " Loc:" << k.recordLoc.toString() << dec; + if ( this->k(i).isUnused() ) + _log() << " UNUSED"; + } + _log() << "\n" << indent << " " << hex << this->nextChild.getOfs() << dec << endl; + } + + template< class V > + void BtreeBucket<V>::twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const + { + + if ( c.key.dataSize() > this->KeyMax ) { + problem() << "ERROR: key too large len:" << c.key.dataSize() << " max:" << this->KeyMax << ' ' << c.key.dataSize() << ' ' << c.idx.indexNamespace() << endl; + return; // op=Nothing + } + insertStepOne(thisLoc, c, dupsAllowed); + } + + /** todo: meaning of return code unclear clean up */ + template< class V > + int BtreeBucket<V>::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const BSONObj& _key, const Ordering &order, bool dupsAllowed, + IndexDetails& idx, bool toplevel) const + { + guessIncreasing = _key.firstElementType() == jstOID && idx.isIdIndex(); + KeyOwned key(_key); + + dassert(toplevel); + if ( toplevel ) { + if ( key.dataSize() > this->KeyMax ) { + problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.dataSize() << ' ' << key.toString() << endl; + return 3; + } + } + + int x; + try { + x = _insert(thisLoc, recordLoc, key, order, dupsAllowed, DiskLoc(), DiskLoc(), idx); + this->assertValid( order ); + } + catch( ... ) { + guessIncreasing = false; + throw; + } + guessIncreasing = false; + return x; + } + + template< class V > + void BtreeBucket<V>::shape(stringstream& ss) const { + this->_shape(0, ss); + } + + template< class V > + int BtreeBucket<V>::getKeyMax() { + return V::KeyMax; + } + + template< class V > + DiskLoc BtreeBucket<V>::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const { + int pos; + bool found; + // TODO: is it really ok here that the order is a default? + // for findById() use, yes. for checkNoIndexConflicts, no? 
+ Ordering o = Ordering::make(BSONObj()); + DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc ); + if ( bucket.isNull() ) + return bucket; + + const BtreeBucket<V> *b = bucket.btree<V>(); + while ( 1 ) { + const _KeyNode& knraw = b->k(pos); + if ( knraw.isUsed() ) + break; + bucket = b->advance( bucket , pos , 1 , "findSingle" ); + if ( bucket.isNull() ) + return bucket; + b = bucket.btree<V>(); + } + KeyNode kn = b->keyNode( pos ); + if ( KeyOwned(key).woCompare( kn.key, o ) != 0 ) + return DiskLoc(); + return kn.recordLoc; + } + +} // namespace mongo + +#include "db.h" +#include "dbhelpers.h" + +namespace mongo { + + template< class V > + void BtreeBucket<V>::a_test(IndexDetails& id) { + BtreeBucket *b = id.head.btreemod<V>(); + + // record locs for testing + DiskLoc A(1, 20); + DiskLoc B(1, 30); + DiskLoc C(1, 40); + + DiskLoc rl; + BSONObj key = fromjson("{x:9}"); + BSONObj orderObj = fromjson("{}"); + Ordering order = Ordering::make(orderObj); + + b->bt_insert(id.head, A, key, order, true, id); + A.GETOFS() += 2; + b->bt_insert(id.head, A, key, order, true, id); + A.GETOFS() += 2; + b->bt_insert(id.head, A, key, order, true, id); + A.GETOFS() += 2; + b->bt_insert(id.head, A, key, order, true, id); + A.GETOFS() += 2; + assert( b->k(0).isUsed() ); +// b->k(0).setUnused(); + b->k(1).setUnused(); + b->k(2).setUnused(); + b->k(3).setUnused(); + + b->dumpTree(id.head, orderObj); + + /* b->bt_insert(id.head, B, key, order, false, id); + b->k(1).setUnused(); + b->dumpTree(id.head, order); + b->bt_insert(id.head, A, key, order, false, id); + b->dumpTree(id.head, order); + */ + + // this should assert. does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing) + b->bt_insert(id.head, C, key, order, false, id); + + // b->dumpTree(id.head, order); + } + + template class BucketBasics<V0>; + template class BucketBasics<V1>; + template class BtreeBucket<V0>; + template class BtreeBucket<V1>; + template struct __KeyNode<DiskLoc>; + template struct __KeyNode<DiskLoc56Bit>; + + struct BTUnitTest : public UnitTest { + void run() { + DiskLoc big(0xf12312, 0x70001234); + DiskLoc56Bit bigl; + { + bigl = big; + assert( big == bigl ); + DiskLoc e = bigl; + assert( big == e ); + } + { + DiskLoc d; + assert( d.isNull() ); + DiskLoc56Bit l; + l = d; + assert( l.isNull() ); + d = l; + assert( d.isNull() ); + assert( l < bigl ); + } + } + } btunittest; + +} diff --git a/src/mongo/db/btree.h b/src/mongo/db/btree.h new file mode 100644 index 00000000000..85e5172d163 --- /dev/null +++ b/src/mongo/db/btree.h @@ -0,0 +1,1174 @@ +// btree.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "../pch.h" +#include "jsobj.h" +#include "diskloc.h" +#include "pdfile.h" +#include "key.h" + +namespace mongo { + + /** + * Our btree implementation generally follows the standard btree algorithm, + * which is described in many places. The nodes of our btree are referred to + * as buckets below. These buckets are of size BucketSize and their body is + * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk + * location of a document and bson key is a projection of this document into + * the schema of the index for this btree. Ordering is determined on the + * basis of bson key first and then disk loc in case of a tie. All bson keys + * for a btree have identical schemas with empty string field names and may + * not have an objsize() exceeding KeyMax. The btree's buckets are + * themselves organized into an ordered tree. Although there are exceptions, + * generally buckets with n keys have n+1 children and the body of a bucket is + * at least lowWaterMark bytes. A more strictly enforced requirement is that + * a non root bucket must have at least one key except in certain transient + * states. + * + * Our btrees support the following primary read operations: finding a + * specified key; iterating from a starting key to the next or previous + * ordered key; and skipping from a starting key to another specified key + * without checking every intermediate key. The primary write operations + * are insertion and deletion of keys. Insertion may trigger a bucket split + * if necessary to avoid bucket overflow. In such a case, subsequent splits + * will occur recursively as necessary. Deletion may trigger a bucket + * rebalance, in which a size deficient bucket is filled with keys from an + * adjacent bucket. In this case, splitting may potentially occur in the + * parent. Deletion may alternatively trigger a merge, in which the keys + * from two buckets and a key from their shared parent are combined into the + * same bucket. In such a case, rebalancing or merging may proceed + * recursively from the parent. + * + * While the btree data format has been relatively constant over time, btrees + * initially created by versions of mongo earlier than the current version + * may embody different properties than freshly created btrees (while + * following the same data format). These older btrees are referred to + * below as legacy btrees. + */ + + const int OldBucketSize = 8192; + +#pragma pack(1) + template< class Version > class BucketBasics; + + /** + * This is the fixed width data component for storage of a key within a + * bucket. It contains an offset pointer to the variable width bson + * data component. A _KeyNode may be 'unused', please see below. + */ + template< class Loc > + struct __KeyNode { + /** Signals that we are writing this _KeyNode and casts away const */ + __KeyNode<Loc> & writing() const; + /** + * The 'left' child bucket of this key. If this is the i-th key, it + * points to the i index child bucket. + */ + Loc prevChildBucket; + /** The location of the record associated with this key. */ + Loc recordLoc; + short keyDataOfs() const { return (short) _kdo; } + + /** Offset within current bucket of the variable width bson key for this _KeyNode. */ + unsigned short _kdo; + void setKeyDataOfs(short s) { + _kdo = s; + assert(s>=0); + } + /** Seems to be redundant. */ + void setKeyDataOfsSavingUse(short s) { + _kdo = s; + assert(s>=0); + } + /** + * Unused keys are not returned by read operations. 
Keys may be marked + * as unused in cases where it is difficult to delete them while + * maintaining the constraints required of a btree. + * + * Setting ofs to odd is the sentinel for unused, as real recordLoc's + * are always even numbers. Note we need to keep its value basically + * the same as we use the recordLoc as part of the key in the index + * (to handle duplicate keys efficiently). + * + * Flagging keys as unused is a feature that is being phased out in favor + * of deleting the keys outright. The current btree implementation is + * not expected to mark a key as unused in a non legacy btree. + */ + void setUnused() { + recordLoc.GETOFS() |= 1; + } + void setUsed() { recordLoc.GETOFS() &= ~1; } + int isUnused() const { + return recordLoc.getOfs() & 1; + } + int isUsed() const { + return !isUnused(); + } + }; + + /** + * This structure represents header data for a btree bucket. An object of + * this type is typically allocated inside of a buffer of size BucketSize, + * resulting in a full bucket with an appropriate header. + * + * The body of a btree bucket contains an array of _KeyNode objects starting + * from its lowest indexed bytes and growing to higher indexed bytes. The + * body also contains variable width bson keys, which are allocated from the + * highest indexed bytes toward lower indexed bytes. + * + * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb| + * h = header data + * k = KeyNode data + * - = empty space + * b = bson key data + * u = unused (old) bson key data, that may be garbage collected + */ + class BtreeData_V0 { + protected: + /** Parent bucket of this bucket, which isNull() for the root bucket. */ + DiskLoc parent; + /** Given that there are n keys, this is the n index child. */ + DiskLoc nextChild; + /** can be reused, value is 8192 in current pdfile version Apr2010 */ + unsigned short _wasSize; + /** zero */ + unsigned short _reserved1; + int flags; + + void _init() { + _reserved1 = 0; + _wasSize = BucketSize; + reserved = 0; + } + + /** basicInsert() assumes the next three members are consecutive and in this order: */ + + /** Size of the empty region. */ + int emptySize; + /** Size used for bson storage, including storage of old keys. */ + int topSize; + /* Number of keys in the bucket. */ + int n; + + int reserved; + /* Beginning of the bucket's body */ + char data[4]; + + public: + typedef __KeyNode<DiskLoc> _KeyNode; + typedef DiskLoc Loc; + typedef KeyBson Key; + typedef KeyBson KeyOwned; + enum { BucketSize = 8192 }; + + // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. + static const int KeyMax = OldBucketSize / 10; + }; + + // a a a ofs ofs ofs ofs + class DiskLoc56Bit { + int ofs; + unsigned char _a[3]; + unsigned long long Z() const { + // endian + return *((unsigned long long*)this) & 0x00ffffffffffffffULL; + } + enum { + // first bit of offsets used in _KeyNode we don't use -1 here. 
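+            // (reasoning: _KeyNode borrows the low offset bit as its 'unused'
+            //  flag — setUnused() does GETOFS() |= 1 — so a null sentinel of
+            //  -1, which is odd, could read back as an unused key; -2 is the
+            //  nearest negative value that keeps bit 0 clear.)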
+ OurNullOfs = -2 + }; + public: + template< class V > + const BtreeBucket<V> * btree() const { + return DiskLoc(*this).btree<V>(); + } + template< class V > + BtreeBucket<V> * btreemod() const { + return DiskLoc(*this).btreemod<V>(); + } + operator const DiskLoc() const { + // endian + if( isNull() ) return DiskLoc(); + unsigned a = *((unsigned *) (_a-1)); + return DiskLoc(a >> 8, ofs); + } + int& GETOFS() { return ofs; } + int getOfs() const { return ofs; } + bool operator<(const DiskLoc56Bit& rhs) const { + // the orderering of dup keys in btrees isn't too critical, but we'd like to put items that are + // close together on disk close together in the tree, so we do want the file # to be the most significant + // bytes + return Z() < rhs.Z(); + } + int compare(const DiskLoc56Bit& rhs) const { + unsigned long long a = Z(); + unsigned long long b = rhs.Z(); + if( a < b ) return -1; + return a == b ? 0 : 1; + } + bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); } + bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); } + bool operator==(const DiskLoc& rhs) const { + return DiskLoc(*this) == rhs; + } + bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); } + bool isNull() const { return ofs < 0; } + void Null() { + ofs = OurNullOfs; + _a[0] = _a[1] = _a[2] = 0; + } + string toString() const { return DiskLoc(*this).toString(); } + void operator=(const DiskLoc& loc) { + ofs = loc.getOfs(); + int la = loc.a(); + assert( la <= 0xffffff ); // must fit in 3 bytes + if( la < 0 ) { + assert( la == -1 ); + la = 0; + ofs = OurNullOfs; + } + memcpy(_a, &la, 3); // endian + dassert( ofs != 0 ); + } + DiskLoc56Bit& writing() const { + return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7)); + } + }; + + class BtreeData_V1 { + public: + typedef DiskLoc56Bit Loc; + //typedef DiskLoc Loc; + typedef __KeyNode<Loc> _KeyNode; + typedef KeyV1 Key; + typedef KeyV1Owned KeyOwned; + enum { BucketSize = 8192-16 }; // leave room for Record header + // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. + static const int KeyMax = 1024; + protected: + /** Parent bucket of this bucket, which isNull() for the root bucket. */ + Loc parent; + /** Given that there are n keys, this is the n index child. */ + Loc nextChild; + + unsigned short flags; + + /** basicInsert() assumes the next three members are consecutive and in this order: */ + + /** Size of the empty region. */ + unsigned short emptySize; + /** Size used for bson storage, including storage of old keys. */ + unsigned short topSize; + /* Number of keys in the bucket. */ + unsigned short n; + + /* Beginning of the bucket's body */ + char data[4]; + + void _init() { } + }; + + typedef BtreeData_V0 V0; + typedef BtreeData_V1 V1; + + /** + * This class adds functionality to BtreeData for managing a single bucket. + * The following policies are used in an attempt to encourage simplicity: + * + * Const member functions of this class are those which may be called on + * an object for which writing has not been signaled. Non const member + * functions may only be called on objects for which writing has been + * signaled. Note that currently some const functions write to the + * underlying memory representation of this bucket using optimized methods + * to signal write operations. 
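+     *
+     * (A hypothetical example of the hazard the next paragraph guards
+     * against: a callee handed a reference aliasing this->nextChild could
+     * see its argument rewritten when it packs or splits the bucket;
+     * passing the DiskLoc by value snapshots it first.)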
+ * + * DiskLoc parameters that may shadow references within the btree should + * be passed by value rather than by reference to non const member + * functions or to const member functions which may perform writes. This way + * a callee need not worry that write operations will change or invalidate + * its arguments. + * + * The current policy for dealing with bson arguments is the opposite of + * what is described above for DiskLoc arguments. We do not want to copy + * bson into memory as an intermediate step for btree changes, and if bson + * is to be moved it must be copied to the new location before the old + * location is invalidated. Care should be taken in cases where that invalid + * memory may be implicitly referenced by function arguments. + * + * A number of functions below require a thisLoc argument, which must be the + * disk location of the bucket mapped to 'this'. + */ + template< class Version > + class BucketBasics : public Version { + public: + template <class U> friend class BtreeBuilder; + typedef typename Version::Key Key; + typedef typename Version::_KeyNode _KeyNode; + typedef typename Version::Loc Loc; + + int getN() const { return this->n; } + + /** + * This is an in memory wrapper for a _KeyNode, and not itself part of btree + * storage. This object and its BSONObj 'key' will become invalid if the + * _KeyNode data that generated it is moved within the btree. In general, + * a KeyNode should not be expected to be valid after a write. + */ + class KeyNode { + public: + KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k); + const Loc& prevChildBucket; + const Loc& recordLoc; + /* Points to the bson key storage for a _KeyNode */ + Key key; + }; + friend class KeyNode; + + /** Assert write intent declared for this bucket already. */ + void assertWritable(); + + void assertValid(const Ordering &order, bool force = false) const; + void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); } + + /** + * @return KeyNode for key at index i. The KeyNode will become invalid + * if the key is moved or reassigned, or if the node is packed. In general + * a KeyNode should not be expected to be valid after a write. + */ + const KeyNode keyNode(int i) const { + if ( i >= this->n ) { + massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n ); + } + return KeyNode(*this, k(i)); + } + + static int headerSize() { + const BucketBasics *d = 0; + return (char*)&(d->data) - (char*)&(d->parent); + } + static int bodySize() { return Version::BucketSize - headerSize(); } + static int lowWaterMark() { return bodySize() / 2 - Version::KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp + + // for testing + int nKeys() const { return this->n; } + const DiskLoc getNextChild() const { return this->nextChild; } + + protected: + char * dataAt(short ofs) { return this->data + ofs; } + + /** Initialize the header for a new node. */ + void init(); + + /** + * Preconditions: + * - 0 <= keypos <= n + * - If key is inserted at position keypos, the bucket's keys will still be + * in order. + * Postconditions: + * - If key can fit in the bucket, the bucket may be packed and keypos + * may be decreased to reflect deletion of earlier indexed keys during + * packing, the key will be inserted at the updated keypos index with + * a null prevChildBucket, the subsequent keys shifted to the right, + * and the function will return true. 
+ * - If key cannot fit in the bucket, the bucket will be packed and + * the function will return false. + * Although this function is marked const, it modifies the underlying + * btree representation through an optimized write intent mechanism. + */ + bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const; + + /** + * Preconditions: + * - key / recordLoc are > all existing keys + * - The keys in prevChild and their descendents are between all existing + * keys and 'key'. + * Postconditions: + * - If there is space for key without packing, it is inserted as the + * last key with specified prevChild and true is returned. + * Importantly, nextChild is not updated! + * - Otherwise false is returned and there is no change. + */ + bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild); + void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) { + bool ok = _pushBack( recordLoc , key , order , prevChild ); + assert(ok); + } + + /** + * This is a special purpose function used by BtreeBuilder. The + * interface is quite dangerous if you're not careful. The bson key + * returned here points to bucket memory that has been invalidated but + * not yet reclaimed. + * + * TODO Maybe this could be replaced with two functions, one which + * returns the last key without deleting it and another which simply + * deletes the last key. Then the caller would have enough control to + * ensure proper memory integrity. + * + * Preconditions: + * - bucket is not empty + * - last key of bucket is used (not unused) + * - nextChild isNull() + * - _unalloc will work correctly as used - see code + * Postconditions: + * - The last key of the bucket is removed, and its key and recLoc are + * returned. As mentioned above, the key points to unallocated memory. + */ + void popBack(DiskLoc& recLoc, Key &key); + + /** + * Preconditions: + * - 0 <= keypos < n + * - there is no child bucket at keypos + * - n > 1 + * - if mayEmpty == false or nextChild.isNull(), n > 0 + * Postconditions: + * - The key at keypos is removed, and remaining keys are shifted over. + * - The bucket becomes unpacked. + * - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys. + */ + void _delKeyAtPos(int keypos, bool mayEmpty = false); + + /* !Packed means there is deleted fragment space within the bucket. + We "repack" when we run out of space before considering the node + to be full. + */ + enum Flags { Packed=1 }; + + /** n == 0 is ok */ + const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; } + Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; } + + /** Same as bodySize(). */ + int totalDataSize() const; + /** + * @return true when a key may be dropped by pack() + * @param index index of the key that may be dropped + * @param refPos index of a particular key of interest, which must not + * be dropped; = 0 to safely ignore + */ + bool mayDropKey( int index, int refPos ) const; + + /** + * Pack the bucket to reclaim space from invalidated memory. + * @refPos is an index in the bucket which may be updated if we + * delete keys from the bucket + * This function may cast away const and perform a write. 
+         * Preconditions: none
+         * Postconditions:
+         *   - Bucket will be packed
+         *   - Some unused nodes may be dropped, but not ones at index 0 or refPos
+         *   - Some used nodes may be moved
+         *   - If refPos is the index of an existing key, it will be updated to that
+         *     key's new index if the key is moved.
+         */
+        void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
+        /** Pack when already writable */
+        void _packReadyForMod(const Ordering &order, int &refPos);
+
+        /** @return the size the bucket's body would have if we were to call pack() */
+        int packedDataSize( int refPos ) const;
+        void setNotPacked() { this->flags &= ~Packed; }
+        void setPacked() { this->flags |= Packed; }
+        /**
+         * Preconditions: 'bytes' is <= emptySize
+         * Postconditions: A buffer of size 'bytes' is allocated on the top side,
+         *   and its offset is returned.
+         */
+        int _alloc(int bytes);
+        /**
+         * This function can be used to deallocate the lowest byte index bson
+         * buffer in the top region, which in some but not all cases is for the
+         * n - 1 index key.  This function only works correctly in certain
+         * special cases, please be careful.
+         * Preconditions: 'bytes' <= topSize
+         * Postconditions: The top region is decreased
+         */
+        void _unalloc(int bytes);
+        /**
+         * Preconditions: 'N' <= n
+         * Postconditions:
+         *   - All keys after the N index key are dropped.
+         *   - The bucket is then packed, without dropping refPos if refPos < N.
+         */
+        void truncateTo(int N, const Ordering &order, int &refPos);
+        /**
+         * Preconditions:
+         *   - 'nDrop' < n
+         *   - for now, refPos should be zero.
+         * Postconditions:
+         *   - All keys before the nDrop index key are dropped.
+         *   - The bucket is packed.
+         */
+        void dropFront(int nDrop, const Ordering &order, int &refPos);
+        /**
+         * Preconditions: 0 <= keypos < n
+         * Postconditions: the keypos indexed key is marked unused.
+         */
+        void markUnused(int keypos);
+
+        /**
+         * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+         *   we use tempNext() when we do that to be less confusing. (one might have written a union in C)
+         */
+        DiskLoc tempNext() const { return this->parent; }
+        void setTempNext(DiskLoc l) { this->parent = l; }
+
+        void _shape(int level, stringstream&) const;
+        int Size() const;
+
+        /** @return i-indexed _KeyNode, without bounds checking */
+    public:
+        const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
+        _KeyNode& _k(int i)            { return ((_KeyNode*)this->data)[i]; }
+    protected:
+        _KeyNode& k(int i)             { return ((_KeyNode*)this->data)[i]; }
+
+        /**
+         * Preconditions: 'this' is packed
+         * @return the key index to be promoted on split
+         * @param keypos The requested index of a key to insert, which may affect
+         *   the choice of split position.
+         */
+        int splitPos( int keypos ) const;
+
+        /**
+         * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
+         * Postconditions:
+         *   - Increases indexes of existing _KeyNode objects by nAdd, reserving
+         *     space for additional _KeyNode objects at front.
+         *   - Does not initialize ofs values for the bson data of these
+         *     _KeyNode objects.
+         */
+        void reserveKeysFront( int nAdd );
+
+        /**
+         * Preconditions:
+         *   - 0 <= i < n
+         *   - The bson 'key' must fit in the bucket without packing.
+         *   - If 'key' and 'prevChildBucket' are set at index i, the btree
+         *     ordering properties will be maintained.
+         * Postconditions:
+         *   - The specified key is set at index i, replacing the existing
+         *     _KeyNode data and without shifting any other _KeyNode objects.
+         */
+        void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
+    };
+
+    template< class V>
+    struct Continuation;
+
+    /**
+     * This class adds functionality for manipulating buckets that are assembled
+     * in a tree.  The requirements for const and non const functions and
+     * arguments are generally the same as in BtreeBucket.  Because this class
+     * deals with tree structure, some functions that are marked const may
+     * trigger modification of another node in the btree or potentially of the
+     * current node.  In such cases, the function's implementation explicitly
+     * casts away const when indicating an intent to write to the durability
+     * layer.  The DiskLocs provided to such functions should be passed by
+     * value if they shadow pointers within the btree.
+     *
+     * To clarify enforcement of referential integrity in this implementation,
+     * we use the following pattern when deleting data we have a persistent
+     * pointer to.  The pointer is cleared or removed explicitly, then the data
+     * it pointed to is cleaned up with a helper function.
+     *
+     * TODO It might make sense to put some of these functions in a class
+     * representing a full btree instead of a single btree bucket.  That would
+     * allow us to use the const qualifier in a manner more consistent with
+     * standard usage.  Right now the interface is for both a node and a tree,
+     * so assignment of const is sometimes nonideal.
+     *
+     * TODO There are several cases in which the 'this' pointer is invalidated
+     * as a result of deallocation.  A separate class representing a btree would
+     * alleviate some fragile cases where the implementation must currently
+     * behave correctly if the 'this' pointer is suddenly invalidated by a
+     * callee.
+     */
+    template< class V >
+    class BtreeBucket : public BucketBasics<V> {
+        friend class BtreeCursor;
+        friend struct Continuation<V>;
+    public:
+        // make compiler happy:
+        typedef typename V::Key Key;
+        typedef typename V::KeyOwned KeyOwned;
+        typedef typename BucketBasics<V>::KeyNode KeyNode;
+        typedef typename BucketBasics<V>::_KeyNode _KeyNode;
+        typedef typename BucketBasics<V>::Loc Loc;
+        const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); }
+    protected:
+        _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); }
+    public:
+        const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); }
+
+        bool isHead() const { return this->parent.isNull(); }
+        void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
+        long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false, unsigned depth=0) const; /* traverses everything */
+
+        bool isUsed( int i ) const { return this->k(i).isUsed(); }
+        string bucketSummary() const;
+        void dump(unsigned depth=0) const;
+
+        /**
+         * @return true if key exists in index
+         *
+         * @order - indicates order of keys in the index.  this is basically the index's key pattern, e.g.:
+         *    BSONObj order = ((IndexDetails&)idx).keyPattern();
+         * likewise below in bt_insert() etc.
+         */
+    private:
+        bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
+    public:
+
+        /**
+         * @param self - don't complain if the duplicate found is ourself, i.e. this key is already in the index.
+         * @return true = There is a duplicate used key.
+         */
+        bool wouldCreateDup(
+            const IndexDetails& idx, const DiskLoc &thisLoc,
+            const Key& key, const Ordering& order,
+            const DiskLoc &self) const;
+
+        /**
+         * Preconditions: none
+         * Postconditions: @return a new bucket allocated from pdfile storage
+         *   and init()-ed.  This bucket is suitable for use as a new root
+         *   or any other new node in the tree.
+         */
+        static DiskLoc addBucket(const IndexDetails&);
+
+        /**
+         * Preconditions: none
+         * Postconditions:
+         *   - Some header values in this bucket are cleared, and the bucket is
+         *     deallocated from pdfile storage.
+         *   - The memory at thisLoc is invalidated, and 'this' is invalidated.
+         */
+        void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
+
+        /**
+         * Preconditions:
+         *   - 'key' has a valid schema for this index.
+         *   - All other parameters are valid and consistent with this index if applicable.
+         * Postconditions:
+         *   - If key is bigger than KeyMax, @return 2 or 3 and no change.
+         *   - If key / recordLoc exist in the btree as an unused key, set them
+         *     as used and @return 0
+         *   - If key / recordLoc exist in the btree as a used key, @throw
+         *     exception 10287 and no change.
+         *   - If key / recordLoc do not exist in the btree, they are inserted
+         *     and @return 0.  The root of the btree may be changed, so
+         *     'this'/thisLoc may no longer be the root upon return.
+         */
+        int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+                      const BSONObj& key, const Ordering &order, bool dupsAllowed,
+                      IndexDetails& idx, bool toplevel = true) const;
+
+        /** does the insert in two steps - step 1 can then use an upgradable lock, as it
+            is the part which may page fault and is most of the computational work.
+        */
+        void twoStepInsert(DiskLoc thisLoc, Continuation<V> &c, bool dupsAllowed) const;
+
+        /**
+         * Preconditions:
+         *   - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
+         * Postconditions:
+         *   - If key / recordLoc are in the btree, they are removed (possibly
+         *     by being marked as an unused key), @return true, and potentially
+         *     invalidate 'this' / thisLoc and change the head.
+         *   - If key / recordLoc are not in the btree, @return false and do nothing.
+         */
+        bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
+
+        /**
+         * locate may return an "unused" key that is just a marker.  so be careful.
+         *   looks for a key:recordloc pair.
+         *
+         * @found - returns true if exact match found.  note you can get back a position
+         *          result even if found is false.
+         */
+        DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+                       int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+        DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
+                       int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
+
+        /**
+         * find the first instance of the key
+         * does not handle dups
+         * WARNING: findSingle may not be compound index safe.  this may need to change.  see notes in
+         *          findSingle code.
+         * @return the record location of the first match
+         */
+        DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
+
+        /**
+         * Advance to next or previous key in the index.
+         * @param direction to advance.
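+         *
+         * (hypothetical sketch of a forward scan built on this primitive,
+         *  once locate() has set loc/keyOfs on a starting key — note unused
+         *  keys are returned too and must be skipped by the caller:
+         *      while( !loc.isNull() ) {
+         *          const BtreeBucket<V1> *b = loc.btree<V1>();
+         *          ... visit b->keyNode(keyOfs) ...
+         *          loc = b->advance(loc, keyOfs, 1, "example-scan");
+         *      }
+         *  )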
+ */ + DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const; + + /** Advance in specified direction to the specified key */ + void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const; + + /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */ + static void customLocate(DiskLoc &locInOut, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) ; + + /** @return head of the btree by traversing from current bucket. */ + const DiskLoc getHead(const DiskLoc& thisLoc) const; + + /** get tree shape */ + void shape(stringstream&) const; + + static void a_test(IndexDetails&); + + static int getKeyMax(); + + protected: + /** + * Preconditions: + * - 0 <= firstIndex <= n + * - -1 <= lastIndex <= n ( -1 is equivalent to n ) + * Postconditions: + * - Any children at indexes firstIndex through lastIndex (inclusive) + * will have their parent pointers set to thisLoc. + */ + void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const; + + /** + * Preconditions: + * - thisLoc is not the btree head. + * - n == 0 is ok + * Postconditions: + * - All cursors pointing to this bucket will be updated. + * - This bucket's parent's child pointer is set to null. + * - This bucket is deallocated from pdfile storage. + * - 'this' and thisLoc are invalidated. + */ + void delBucket(const DiskLoc thisLoc, const IndexDetails&); + + /** + * Preconditions: 0 <= p < n + * Postconditions: + * - The key at index p is removed from the btree. + * - 'this' and thisLoc may be invalidated. + * - The tree head may change. + */ + void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order); + + /** + * Preconditions: + * - n == 0 is ok + * Postconditions: + * - If thisLoc is head, or if its body has at least lowWaterMark bytes, + * return false and do nothing. + * - Otherwise, if thisLoc has left or right neighbors, either balance + * or merge with them and return true. Also, 'this' and thisLoc may + * be invalidated and the tree head may change. + */ + bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const; + + /** + * Preconditions: + * - 0 <= leftIndex < n + * - The child at leftIndex or the child at leftIndex + 1 contains + * fewer than lowWaterMark bytes. + * Postconditions: + * - If the child bucket at leftIndex can merge with the child index + * at leftIndex + 1, do nothing and return false. + * - Otherwise, balance keys between the leftIndex child and the + * leftIndex + 1 child, return true, and possibly change the tree head. + */ + bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const; + + /** + * Preconditions: + * - All preconditions of tryBalanceChildren. + * - The leftIndex child and leftIndex + 1 child cannot be merged. + * Postconditions: + * - Keys are moved between the leftIndex child and the leftIndex + 1 + * child such that neither child has fewer than lowWaterMark bytes. + * The tree head may change. 
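+         *
+         * (Illustration with hypothetical keys: children [a b c d e] | f | [z],
+         *  where f is the separator in this bucket, might rebalance to
+         *  [a b c] | d | [e f z] — d is promoted and the old separator f
+         *  moves down into the right child.)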
+ */ + void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ); + + /** + * Preconditions: + * - All preconditions of doBalanceChildren + * - The leftIndex and leftIndex + 1 children are packed. + * - The leftIndex + 1 child has fewer than lowWaterMark bytes. + * - split returned by rebalancedSeparatorPos() + * Postconditions: + * - The key in lchild at index split is set as thisLoc's key at index + * leftIndex, which may trigger a split and change the tree head. + * The previous key in thisLoc at index leftIndex and all keys with + * indexes greater than split in lchild are moved to rchild. + */ + void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket<V> *l, const DiskLoc lchild, + BtreeBucket<V> *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ); + /** + * Preconditions: + * - All preconditions of doBalanceChildren + * - The leftIndex and leftIndex + 1 children are packed. + * - The leftIndex child has fewer than lowWaterMark bytes. + * - split returned by rebalancedSeparatorPos() + * Postconditions: + * - The key in rchild at index split - l->n - 1 is set as thisLoc's key + * at index leftIndex, which may trigger a split and change the tree + * head. The previous key in thisLoc at index leftIndex and all keys + * with indexes less than split - l->n - 1 in rchild are moved to + * lchild. + */ + void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket<V> *l, const DiskLoc lchild, + BtreeBucket<V> *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ); + + /** + * Preconditions: + * - 0 <= leftIndex < n + * - this->canMergeChildren( thisLoc, leftIndex ) == true + * Postconditions: + * - All of the above mentioned keys will be placed in the left child. + * - The tree may be updated recursively, resulting in 'this' and + * thisLoc being invalidated and the tree head being changed. + */ + void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order); + + /** + * Preconditions: + * - n == 0 + * - !nextChild.isNull() + * Postconditions: + * - 'this' and thisLoc are deallocated (and invalidated), any cursors + * to them are updated, and the tree head may change. + * - nextChild replaces thisLoc in the btree structure. + */ + void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ); + + /** + * @return true iff the leftIndex and leftIndex + 1 children both exist, + * and if their body sizes when packed and the thisLoc key at leftIndex + * would fit in a single bucket body. + */ + bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const; + + /** + * Preconditions: + * - leftIndex and leftIndex + 1 children are packed + * - leftIndex or leftIndex + 1 child is below lowWaterMark + * @return index of the rebalanced separator; the index value is + * determined as if we had a bucket with body + * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> ) + * and called splitPos( 0 ) on it. + */ + int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const; + + /** + * Preconditions: thisLoc has a parent + * @return parent's index of thisLoc. + */ + int indexInParent( const DiskLoc &thisLoc ) const; + + public: + Key keyAt(int i) const { + if( i >= this->n ) + return Key(); + return Key(this->data + k(i).keyDataOfs()); + } + protected: + + /** + * Preconditions: + * - This bucket is packed. + * - Cannot add a key of size KeyMax to this bucket. 
+         *   - 0 <= keypos <= n is the position of a new key that will be inserted
+         *   - lchild is equal to the existing child at index keypos.
+         * Postconditions:
+         *   - The thisLoc bucket is split into two packed buckets, possibly
+         *     invalidating the initial position of keypos, with a split key
+         *     promoted to the parent.  The new key key/recordLoc will be inserted
+         *     into one of the split buckets, and lchild/rchild set appropriately.
+         *     Splitting may occur recursively, possibly changing the tree head.
+         */
+        void split(const DiskLoc thisLoc, int keypos,
+                   const DiskLoc recordLoc, const Key& key,
+                   const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+
+        /**
+         * Preconditions:
+         *   - 0 <= keypos <= n
+         *   - If key / recordLoc are inserted at position keypos, with provided
+         *     lchild and rchild, the btree ordering requirements will be
+         *     maintained.
+         *   - lchild is equal to the existing child at index keypos.
+         *   - n == 0 is ok.
+         * Postconditions:
+         *   - The key / recordLoc are inserted at position keypos, and the
+         *     bucket is split if necessary, which may change the tree head.
+         *   - The bucket may be packed or split, invalidating the specified value
+         *     of keypos.
+         *   This function will always modify thisLoc, but it's marked const because
+         *   it commonly relies on the specialized write intent mechanism of basicInsert().
+         */
+        void insertHere(const DiskLoc thisLoc, int keypos,
+                        const DiskLoc recordLoc, const Key& key, const Ordering &order,
+                        const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+
+        /** bt_insert() is basically just a wrapper around this. */
+        int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+                    const Key& key, const Ordering &order, bool dupsAllowed,
+                    const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
+
+        void insertStepOne(DiskLoc thisLoc, Continuation<V>& c, bool dupsAllowed) const;
+
+        bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+        static bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) ;
+        static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
+        static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+
+        /** If child is non null, set its parent to thisLoc */
+        static void fix(const DiskLoc thisLoc, const DiskLoc child);
+
+        /**
+         * Preconditions:
+         *   - 0 <= keypos < n
+         *   - If the specified key and recordLoc are placed in keypos of thisLoc,
+         *     and lchild and rchild are set, the btree ordering properties will
+         *     be maintained.
+         *   - rchild == childForPos( keypos + 1 )
+         *   - childForPos( keypos ) is referenced elsewhere if nonnull.
+         * Postconditions:
+         *   - The key at keypos will be replaced with the specified key and
+         *     lchild, potentially splitting this bucket and changing the tree
+         *     head.
+         *   - childForPos( keypos ) will be orphaned.
+ */ + void setInternalKey( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const Key &key, const Ordering &order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx); + + /** + * Preconditions: + * - 0 <= keypos < n + * - The keypos or keypos+1 indexed child is non null. + * Postconditions: + * - The specified key is deleted by replacing it with another key if + * possible. This replacement may cause a split and change the tree + * head. The replacement key will be deleted from its original + * location, potentially causing merges and splits that may invalidate + * 'this' and thisLoc and change the tree head. + * - If the key cannot be replaced, it will be marked as unused. This + * is only expected in legacy btrees. + */ + void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ); + public: + /** simply builds and returns a dup key error message string */ + static string dupKeyError( const IndexDetails& idx , const Key& key ); + }; +#pragma pack() + + class FieldRangeVector; + class FieldRangeVectorIterator; + + class BtreeCursor : public Cursor { + protected: + BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); + BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); + public: + virtual ~BtreeCursor(); + /** makes an appropriate subclass depending on the index version */ + static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); + static BtreeCursor* make( NamespaceDetails *_d, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); + static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction ); + static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ); + + virtual bool ok() { return !bucket.isNull(); } + virtual bool advance(); + virtual void noteLocation(); // updates keyAtKeyOfs... + virtual void checkLocation() = 0; + virtual bool supportGetMore() { return true; } + virtual bool supportYields() { return true; } + + /** + * used for multikey index traversal to avoid sending back dups. see Matcher::matches(). + * if a multikey index traversal: + * if loc has already been sent, returns true. + * otherwise, marks loc as sent. + * @return false if the loc has not been seen + */ + virtual bool getsetdup(DiskLoc loc) { + if( _multikey ) { + pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc); + return !p.second; + } + return false; + } + + virtual bool modifiedKeys() const { return _multikey; } + virtual bool isMultiKey() const { return _multikey; } + + /*const _KeyNode& _currKeyNode() const { + assert( !bucket.isNull() ); + const _KeyNode& kn = keyNode(keyOfs); + assert( kn.isUsed() ); + return kn; + }*/ + + /** returns BSONObj() if ofs is out of range */ + virtual BSONObj keyAt(int ofs) const = 0; + + virtual BSONObj currKey() const = 0; + virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); } + + virtual void aboutToDeleteBucket(const DiskLoc& b) { + if ( bucket == b ) + keyOfs = -1; + } + + virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? 
_currKeyNode().recordLoc : DiskLoc(); } + virtual DiskLoc refLoc() { return currLoc(); } + virtual Record* _current() { return currLoc().rec(); } + virtual BSONObj current() { return BSONObj(_current()); } + virtual string toString(); + + BSONObj prettyKey( const BSONObj &key ) const { + return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable(); + } + + virtual BSONObj prettyIndexBounds() const; + + virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); } + virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; } + + virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; } + + virtual long long nscanned() { return _nscanned; } + + /** for debugging only */ + const DiskLoc getBucket() const { return bucket; } + int getKeyOfs() const { return keyOfs; } + + // just for unit tests + virtual bool curKeyHasChild() = 0; + + protected: + /** + * Our btrees may (rarely) have "unused" keys when items are deleted. + * Skip past them. + */ + virtual bool skipUnusedKeys() = 0; + + bool skipOutOfRangeKeysAndCheckEnd(); + void skipAndCheck(); + void checkEnd(); + + /** selective audits on construction */ + void audit(); + + virtual void _audit() = 0; + virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0; + virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0; + virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0; + + /** set initial bucket */ + void initWithoutIndependentFieldRanges(); + + /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */ + void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive ); + + set<DiskLoc> _dups; + NamespaceDetails * const d; + const int idxNo; + BSONObj startKey; + BSONObj endKey; + bool _endKeyInclusive; + bool _multikey; // this must be updated every getmore batch in case someone added a multikey + const IndexDetails& indexDetails; + const BSONObj _order; + const Ordering _ordering; + DiskLoc bucket; + int keyOfs; + const int _direction; // 1=fwd,-1=reverse + BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call + DiskLoc locAtKeyOfs; + const shared_ptr< FieldRangeVector > _bounds; + auto_ptr< FieldRangeVectorIterator > _boundsIterator; + shared_ptr< CoveredIndexMatcher > _matcher; + bool _independentFieldRanges; + long long _nscanned; + }; + + template< class V > + struct Continuation { + //Continuation(const typename V::Key & k); + Continuation(DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key, + Ordering _order, IndexDetails& _idx) : + bLoc(thisLoc), recordLoc(_recordLoc), key(_key), order(_order), idx(_idx) { + op = Nothing; + } + + DiskLoc bLoc; + DiskLoc recordLoc; + typename V::KeyOwned key; + const Ordering order; + IndexDetails& idx; + enum Op { Nothing, SetUsed, InsertHere } op; + + int pos; + const BtreeBucket<V> *b; + + void stepTwo() { + if( op == Nothing ) + return; + else if( op == SetUsed ) { + const typename V::_KeyNode& kn = b->k(pos); + kn.writing().setUsed(); + } + else { + b->insertHere(bLoc, pos, recordLoc, key, order, DiskLoc(), DiskLoc(), idx); + } + } + }; + + /** Renames the index namespace for this 
btree's index. */ + void renameIndexNamespace(const char *oldNs, const char *newNs); + + /** + * give us a writable version of the btree bucket (declares write intent). + * note it is likely more efficient to declare write intent on something smaller when you can. + */ + template< class V > + BtreeBucket<V> * DiskLoc::btreemod() const { + assert( _a != -1 ); + BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() ); + return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) ); + } + + template< class V > + BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) : + prevChildBucket(k.prevChildBucket), + recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) + { } + +} // namespace mongo; diff --git a/src/mongo/db/btreebuilder.cpp b/src/mongo/db/btreebuilder.cpp new file mode 100644 index 00000000000..0ec587a1958 --- /dev/null +++ b/src/mongo/db/btreebuilder.cpp @@ -0,0 +1,184 @@ +// btreebuilder.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "db.h" +#include "btree.h" +#include "pdfile.h" +#include "json.h" +#include "clientcursor.h" +#include "client.h" +#include "dbhelpers.h" +#include "curop-inl.h" +#include "stats/counters.h" +#include "dur_commitjob.h" +#include "btreebuilder.h" + +namespace mongo { + + /* --- BtreeBuilder --- */ + + template<class V> + BtreeBuilder<V>::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) : + dupsAllowed(_dupsAllowed), + idx(_idx), + n(0), + order( idx.keyPattern() ), + ordering( Ordering::make(idx.keyPattern()) ) { + first = cur = BtreeBucket<V>::addBucket(idx); + b = cur.btreemod<V>(); + committed = false; + } + + template<class V> + void BtreeBuilder<V>::newBucket() { + DiskLoc L = BtreeBucket<V>::addBucket(idx); + b->setTempNext(L); + cur = L; + b = cur.btreemod<V>(); + } + + template<class V> + void BtreeBuilder<V>::mayCommitProgressDurably() { + if ( getDur().commitIfNeeded() ) { + b = cur.btreemod<V>(); + } + } + + template<class V> + void BtreeBuilder<V>::addKey(BSONObj& _key, DiskLoc loc) { + + auto_ptr< KeyOwned > key( new KeyOwned(_key) ); + if ( key->dataSize() > BtreeBucket<V>::KeyMax ) { + problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() + << ' ' << key->dataSize() << ' ' << key->toString() << endl; + return; + } + + if( !dupsAllowed ) { + if( n > 0 ) { + int cmp = keyLast->woCompare(*key, ordering); + massert( 10288 , "bad key order in BtreeBuilder - server internal error", cmp <= 0 ); + if( cmp == 0 ) { + //if( !dupsAllowed ) + uasserted( ASSERT_ID_DUPKEY , BtreeBucket<V>::dupKeyError( idx , *keyLast ) ); + } + } + } + + if ( ! 
b->_pushBack(loc, *key, ordering, DiskLoc()) ) { + // bucket was full + newBucket(); + b->pushBack(loc, *key, ordering, DiskLoc()); + } + keyLast = key; + n++; + mayCommitProgressDurably(); + } + + template<class V> + void BtreeBuilder<V>::buildNextLevel(DiskLoc loc) { + int levels = 1; + while( 1 ) { + if( loc.btree<V>()->tempNext().isNull() ) { + // only 1 bucket at this level. we are done. + getDur().writingDiskLoc(idx.head) = loc; + break; + } + levels++; + + DiskLoc upLoc = BtreeBucket<V>::addBucket(idx); + DiskLoc upStart = upLoc; + BtreeBucket<V> *up = upLoc.btreemod<V>(); + + DiskLoc xloc = loc; + while( !xloc.isNull() ) { + if ( getDur().commitIfNeeded() ) { + b = cur.btreemod<V>(); + up = upLoc.btreemod<V>(); + } + + BtreeBucket<V> *x = xloc.btreemod<V>(); + Key k; + DiskLoc r; + x->popBack(r,k); + bool keepX = ( x->n != 0 ); + DiskLoc keepLoc = keepX ? xloc : x->nextChild; + + if ( ! up->_pushBack(r, k, ordering, keepLoc) ) { + // current bucket full + DiskLoc n = BtreeBucket<V>::addBucket(idx); + up->setTempNext(n); + upLoc = n; + up = upLoc.btreemod<V>(); + up->pushBack(r, k, ordering, keepLoc); + } + + DiskLoc nextLoc = x->tempNext(); // get next in chain at current level + if ( keepX ) { + x->parent = upLoc; + } + else { + if ( !x->nextChild.isNull() ) { + DiskLoc ll = x->nextChild; + ll.btreemod<V>()->parent = upLoc; + //(x->nextChild.btreemod<V>())->parent = upLoc; + } + x->deallocBucket( xloc, idx ); + } + xloc = nextLoc; + } + + loc = upStart; + mayCommitProgressDurably(); + } + + if( levels > 1 ) + log(2) << "btree levels: " << levels << endl; + } + + /** when all addKeys are done, we then build the higher levels of the tree */ + template<class V> + void BtreeBuilder<V>::commit() { + buildNextLevel(first); + committed = true; + } + + template<class V> + BtreeBuilder<V>::~BtreeBuilder() { + DESTRUCTOR_GUARD( + if( !committed ) { + log(2) << "Rolling back partially built index space" << endl; + DiskLoc x = first; + while( !x.isNull() ) { + DiskLoc next = x.btree<V>()->tempNext(); + string ns = idx.indexNamespace(); + theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x); + x = next; + getDur().commitIfNeeded(); + } + assert( idx.head.isNull() ); + log(2) << "done rollback" << endl; + } + ) + } + + template class BtreeBuilder<V0>; + template class BtreeBuilder<V1>; + +} diff --git a/src/mongo/db/btreebuilder.h b/src/mongo/db/btreebuilder.h new file mode 100644 index 00000000000..6de55d89299 --- /dev/null +++ b/src/mongo/db/btreebuilder.h @@ -0,0 +1,53 @@ +#pragma once + +#include "btree.h" + +namespace mongo { + + /** + * build btree from the bottom up + */ + template< class V > + class BtreeBuilder { + typedef typename V::KeyOwned KeyOwned; + typedef typename V::Key Key; + + bool dupsAllowed; + IndexDetails& idx; + /** Number of keys added to btree. */ + unsigned long long n; + /** Last key passed to addKey(). */ + auto_ptr< typename V::KeyOwned > keyLast; + BSONObj order; + Ordering ordering; + /** true iff commit() completed successfully. */ + bool committed; + + DiskLoc cur, first; + BtreeBucket<V> *b; + + void newBucket(); + void buildNextLevel(DiskLoc); + void mayCommitProgressDurably(); + + public: + ~BtreeBuilder(); + + BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx); + + /** + * Preconditions: 'key' is > or >= last key passed to this function (depends on _dupsAllowed) + * Postconditions: 'key' is added to intermediate storage. + */ + void addKey(BSONObj& key, DiskLoc loc); + + /** + * commit work. 
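The builder above is strictly bottom-up: addKey() appends pre-sorted keys into a chain of leaf buckets (rolling to a fresh bucket when one fills), and buildNextLevel() then promotes a key from each bucket into a newly allocated parent level, repeating until a single root remains. Below is a minimal self-contained model of that two-phase build, with a toy Bucket and integer keys in place of the mongod types; where the real popBack() moves a key up, this sketch simply copies each child's largest key as a routing key:

    #include <memory>
    #include <vector>

    struct Bucket {
        std::vector<int> keys;
        std::vector<std::unique_ptr<Bucket> > children;
    };

    std::unique_ptr<Bucket> bulkBuild(const std::vector<int>& sortedKeys, size_t cap) {
        // Phase 1 (addKey): fill leaf buckets left to right, starting a new one when full.
        std::vector<std::unique_ptr<Bucket> > level;
        level.emplace_back(new Bucket());
        for (int k : sortedKeys) {
            if (level.back()->keys.size() == cap)
                level.emplace_back(new Bucket());
            level.back()->keys.push_back(k);
        }
        // Phase 2 (buildNextLevel): group buckets under parents until one root remains.
        while (level.size() > 1) {
            std::vector<std::unique_ptr<Bucket> > upper;
            for (size_t i = 0; i < level.size(); i += cap) {
                std::unique_ptr<Bucket> p(new Bucket());
                for (size_t j = i; j < level.size() && j < i + cap; ++j) {
                    p->keys.push_back(level[j]->keys.back()); // routing key: max of subtree
                    p->children.push_back(std::move(level[j]));
                }
                upper.push_back(std::move(p));
            }
            level.swap(upper);
        }
        return std::move(level.front());
    }

For example, bulkBuild with keys 1..10 and cap 4 produces leaves {1,2,3,4}, {5,6,7,8}, {9,10} under one root with routing keys 4, 8, 10.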
if not called, destructor will clean up partially completed work + * (in case exception has happened). + */ + void commit(); + + unsigned long long getn() { return n; } + }; + +} diff --git a/src/mongo/db/btreecursor.cpp b/src/mongo/db/btreecursor.cpp new file mode 100644 index 00000000000..7ddd4874ef6 --- /dev/null +++ b/src/mongo/db/btreecursor.cpp @@ -0,0 +1,457 @@ +// btreecursor.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "btree.h" +#include "pdfile.h" +#include "jsobj.h" +#include "curop-inl.h" +#include "queryutil.h" + +namespace mongo { + + template< class V > + class BtreeCursorImpl : public BtreeCursor { + public: + typedef typename BucketBasics<V>::KeyNode KeyNode; + typedef typename V::Key Key; + typedef typename V::_KeyNode _KeyNode; + + BtreeCursorImpl(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) : + BtreeCursor(a,b,c,d,e,f,g) { } + BtreeCursorImpl(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ) : + BtreeCursor(_d,_idxNo,_id,_bounds,_direction ) + { + pair< DiskLoc, int > noBestParent; + indexDetails.head.btree<V>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent ); + skipAndCheck(); + dassert( _dups.size() == 0 ); + } + + virtual DiskLoc currLoc() { + if( bucket.isNull() ) return DiskLoc(); + return currKeyNode().recordLoc; + } + + virtual BSONObj keyAt(int ofs) const { + assert( !bucket.isNull() ); + const BtreeBucket<V> *b = bucket.btree<V>(); + int n = b->getN(); + if( n == 0xffff ) { + throw UserException(15850, "keyAt bucket deleted"); + } + dassert( n >= 0 && n < 10000 ); + return ofs >= n ? BSONObj() : b->keyNode(ofs).key.toBson(); + } + + virtual BSONObj currKey() const { + assert( !bucket.isNull() ); + return bucket.btree<V>()->keyNode(keyOfs).key.toBson(); + } + + virtual bool curKeyHasChild() { + return !currKeyNode().prevChildBucket.isNull(); + } + + bool skipUnusedKeys() { + int u = 0; + while ( 1 ) { + if ( !ok() ) + break; + const _KeyNode& kn = keyNode(keyOfs); + if ( kn.isUsed() ) + break; + bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys"); + u++; + //don't include unused keys in nscanned + //++_nscanned; + } + if ( u > 10 ) + OCCASIONALLY log() << "btree unused skipped:" << u << '\n'; + return u; + } + + /* Since the last noteLocation(), our key may have moved around, and that old cached + information may thus be stale and wrong (although often it is right). We check + that here; if we have moved, we have to search back for where we were at. + + i.e., after operations on the index, the BtreeCursor's cached location info may + be invalid. This function ensures validity, so you should call it before using + the cursor if other writers have used the database since the last noteLocation + call. 
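Put differently: noteLocation() snapshots just enough state to recognize the old position, and checkLocation() tries the cheap test first (is the cached key still at the cached offset?) before falling back to a full re-locate. A compact model of that protocol over a plain sorted vector, with invented names, purely to show the shape:

    #include <algorithm>
    #include <vector>

    struct CursorModel {
        const std::vector<int>* keys; // stand-in for the btree bucket
        size_t ofs;                   // cached position in the bucket
        int keyAtOfs;                 // cached key, captured by noteLocation()

        void noteLocation() { keyAtOfs = (*keys)[ofs]; } // assumes not at eof

        void checkLocation() {
            // Cheap path: nothing moved underneath us.
            if (ofs < keys->size() && (*keys)[ofs] == keyAtOfs)
                return;
            // Expensive path: refind where we left off.
            ofs = std::lower_bound(keys->begin(), keys->end(), keyAtOfs) - keys->begin();
        }
    };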
+ */ + void checkLocation() { + if ( eof() ) + return; + + _multikey = d->isMultikey(idxNo); + + if ( keyOfs >= 0 ) { + assert( !keyAtKeyOfs.isEmpty() ); + + try { + // Note keyAt() returns an empty BSONObj if keyOfs is now out of range, + // which is possible as keys may have been deleted. + int x = 0; + while( 1 ) { + // if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) && + // b->k(keyOfs).recordLoc == locAtKeyOfs ) { + if ( keyAt(keyOfs).binaryEqual(keyAtKeyOfs) ) { + const _KeyNode& kn = keyNode(keyOfs); + if( kn.recordLoc == locAtKeyOfs ) { + if ( !kn.isUsed() ) { + // we were deleted but still exist as an unused + // marker key. advance. + skipUnusedKeys(); + } + return; + } + } + + // we check one key earlier too, in case a key was just deleted. this is + // important so that multi updates are reasonably fast. + if( keyOfs == 0 || x++ ) + break; + keyOfs--; + } + } + catch(UserException& e) { + if( e.getCode() != 15850 ) + throw; + // hack: fall through if bucket was just deleted. should only happen under deleteObjects() + DEV log() << "debug info: bucket was deleted" << endl; + } + } + + /* normally we don't get to here. when we do, old position is no longer + valid and we must refind where we left off (which is expensive) + */ + + /* TODO: Switch to keep indexdetails and do idx.head! */ + bucket = _locate(keyAtKeyOfs, locAtKeyOfs); + RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl; + if ( ! bucket.isNull() ) + skipUnusedKeys(); + + } + + protected: + virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) { + thisLoc.btree<V>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction); + } + virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) { + return thisLoc.btree<V>()->advance(thisLoc, keyOfs, direction, caller); + } + virtual void _audit() { + out() << "BtreeCursor(). 
dumping head bucket" << endl; + indexDetails.head.btree<V>()->dump(); + } + virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) { + bool found; + return indexDetails.head.btree<V>()-> + locate(indexDetails, indexDetails.head, key, _ordering, keyOfs, found, loc, _direction); + } + + const _KeyNode& keyNode(int keyOfs) const { + return bucket.btree<V>()->k(keyOfs); + } + + private: + const KeyNode currKeyNode() const { + assert( !bucket.isNull() ); + const BtreeBucket<V> *b = bucket.btree<V>(); + return b->keyNode(keyOfs); + } + }; + + template class BtreeCursorImpl<V0>; + template class BtreeCursorImpl<V1>; + + /* + class BtreeCursorV1 : public BtreeCursor { + public: + typedef BucketBasics<V1>::KeyNode KeyNode; + typedef V1::Key Key; + + BtreeCursorV1(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) : + BtreeCursor(a,b,c,d,e,f,g) { } + BtreeCursorV1(NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction) : + BtreeCursor(_d,_idxNo,_id,_bounds,_direction) + { + pair< DiskLoc, int > noBestParent; + indexDetails.head.btree<V1>()->customLocate( bucket, keyOfs, startKey, 0, false, _boundsIterator->cmp(), _boundsIterator->inc(), _ordering, _direction, noBestParent ); + skipAndCheck(); + dassert( _dups.size() == 0 ); + } + + virtual DiskLoc currLoc() { + if( bucket.isNull() ) return DiskLoc(); + return currKeyNode().recordLoc; + } + + virtual BSONObj currKey() const { + assert( !bucket.isNull() ); + return bucket.btree<V1>()->keyNode(keyOfs).key.toBson(); + } + + protected: + virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) { + thisLoc.btree<V1>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction); + } + virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) { + return thisLoc.btree<V1>()->advance(thisLoc, keyOfs, direction, caller); + } + virtual void _audit() { + out() << "BtreeCursor(). 
dumping head bucket" << endl; + indexDetails.head.btree<V1>()->dump(); + } + virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc); + virtual const _KeyNode& keyNode(int keyOfs) { + return bucket.btree<V1>()->k(keyOfs); + } + + private: + const KeyNode currKeyNode() const { + assert( !bucket.isNull() ); + const BtreeBucket<V1> *b = bucket.btree<V1>(); + return b->keyNode(keyOfs); + } + };*/ + + BtreeCursor* BtreeCursor::make( + NamespaceDetails *_d, const IndexDetails& _id, + const shared_ptr< FieldRangeVector > &_bounds, int _direction ) + { + return make( _d, _d->idxNo( (IndexDetails&) _id), _id, _bounds, _direction ); + } + + BtreeCursor* BtreeCursor::make( + NamespaceDetails *_d, const IndexDetails& _id, + const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction) + { + return make( _d, _d->idxNo( (IndexDetails&) _id), _id, startKey, endKey, endKeyInclusive, direction ); + } + + + BtreeCursor* BtreeCursor::make( + NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, + const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction) + { + int v = _id.version(); + BtreeCursor *c = 0; + if( v == 1 ) { + c = new BtreeCursorImpl<V1>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction); + } + else if( v == 0 ) { + c = new BtreeCursorImpl<V0>(_d,_idxNo,_id,startKey,endKey,endKeyInclusive,direction); + } + else { + uasserted(14800, str::stream() << "unsupported index version " << v); + } + c->initWithoutIndependentFieldRanges(); + dassert( c->_dups.size() == 0 ); + return c; + } + + BtreeCursor* BtreeCursor::make( + NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, + const shared_ptr< FieldRangeVector > &_bounds, int _direction ) + { + int v = _id.version(); + if( v == 1 ) + return new BtreeCursorImpl<V1>(_d,_idxNo,_id,_bounds,_direction); + if( v == 0 ) + return new BtreeCursorImpl<V0>(_d,_idxNo,_id,_bounds,_direction); + uasserted(14801, str::stream() << "unsupported index version " << v); + + // just check we are in sync with this method + dassert( IndexDetails::isASupportedIndexVersionNumber(v) ); + + return 0; + } + + BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails &_id, + const BSONObj &_startKey, const BSONObj &_endKey, bool endKeyInclusive, int _direction ) : + d(_d), idxNo(_idxNo), + startKey( _startKey ), + endKey( _endKey ), + _endKeyInclusive( endKeyInclusive ), + _multikey( d->isMultikey( idxNo ) ), + indexDetails( _id ), + _order( _id.keyPattern() ), + _ordering( Ordering::make( _order ) ), + _direction( _direction ), + _independentFieldRanges( false ), + _nscanned( 0 ) { + audit(); + } + + BtreeCursor::BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction ) + : + d(_d), idxNo(_idxNo), + _endKeyInclusive( true ), + _multikey( d->isMultikey( idxNo ) ), + indexDetails( _id ), + _order( _id.keyPattern() ), + _ordering( Ordering::make( _order ) ), + _direction( _direction ), + _bounds( ( assert( _bounds.get() ), _bounds ) ), + _boundsIterator( new FieldRangeVectorIterator( *_bounds ) ), + _independentFieldRanges( true ), + _nscanned( 0 ) { + audit(); + startKey = _bounds->startKey(); + _boundsIterator->advance( startKey ); // handles initialization + _boundsIterator->prepDive(); + bucket = indexDetails.head; + keyOfs = 0; + } + + /** Properly destroy forward declared class members. 
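All of the make() overloads above funnel through the same pattern: read the on-disk index version at runtime, then construct the matching template instantiation behind the version-neutral BtreeCursor interface. The pattern in isolation, with toy stand-ins for the V0/V1 key formats:

    #include <memory>
    #include <stdexcept>

    struct CursorBase { virtual ~CursorBase() {} };
    struct V0 {}; struct V1 {}; // stand-ins for the on-disk key formats
    template <class V> struct CursorImpl : CursorBase { /* format-specific logic */ };

    std::unique_ptr<CursorBase> makeCursor(int version) {
        switch (version) {
        case 0: return std::unique_ptr<CursorBase>(new CursorImpl<V0>());
        case 1: return std::unique_ptr<CursorBase>(new CursorImpl<V1>());
        default: throw std::runtime_error("unsupported index version");
        }
    }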
*/ + BtreeCursor::~BtreeCursor() {} + + void BtreeCursor::audit() { + dassert( d->idxNo((IndexDetails&) indexDetails) == idxNo ); + } + + void BtreeCursor::initWithoutIndependentFieldRanges() { + if ( indexDetails.getSpec().getType() ) { + startKey = indexDetails.getSpec().getType()->fixKey( startKey ); + endKey = indexDetails.getSpec().getType()->fixKey( endKey ); + } + bucket = _locate(startKey, _direction > 0 ? minDiskLoc : maxDiskLoc); + if ( ok() ) { + _nscanned = 1; + } + skipUnusedKeys(); + checkEnd(); + } + + void BtreeCursor::skipAndCheck() { + long long startNscanned = _nscanned; + skipUnusedKeys(); + while( 1 ) { + if ( !skipOutOfRangeKeysAndCheckEnd() ) { + break; + } + do { + if ( _nscanned > startNscanned + 20 ) { + skipUnusedKeys(); + return; + } + } while( skipOutOfRangeKeysAndCheckEnd() ); + if ( !skipUnusedKeys() ) { + break; + } + } + } + + bool BtreeCursor::skipOutOfRangeKeysAndCheckEnd() { + if ( !ok() ) { + return false; + } + int ret = _boundsIterator->advance( currKey() ); + if ( ret == -2 ) { + bucket = DiskLoc(); + return false; + } + else if ( ret == -1 ) { + ++_nscanned; + return false; + } + ++_nscanned; + advanceTo( currKey(), ret, _boundsIterator->after(), _boundsIterator->cmp(), _boundsIterator->inc() ); + return true; + } + + // Return a value in the set {-1, 0, 1} to represent the sign of parameter i. + int sgn( int i ) { + if ( i == 0 ) + return 0; + return i > 0 ? 1 : -1; + } + + // Check if the current key is beyond endKey. + void BtreeCursor::checkEnd() { + if ( bucket.isNull() ) + return; + if ( !endKey.isEmpty() ) { + int cmp = sgn( endKey.woCompare( currKey(), _order ) ); + if ( ( cmp != 0 && cmp != _direction ) || + ( cmp == 0 && !_endKeyInclusive ) ) + bucket = DiskLoc(); + } + } + + void BtreeCursor::advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive) { + _advanceTo( bucket, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, _ordering, _direction ); + } + + bool BtreeCursor::advance() { + killCurrentOp.checkForInterrupt(); + if ( bucket.isNull() ) + return false; + + bucket = _advance(bucket, keyOfs, _direction, "BtreeCursor::advance"); + + if ( !_independentFieldRanges ) { + skipUnusedKeys(); + checkEnd(); + if ( ok() ) { + ++_nscanned; + } + } + else { + skipAndCheck(); + } + return ok(); + } + + void BtreeCursor::noteLocation() { + if ( !eof() ) { + BSONObj o = currKey().getOwned(); + keyAtKeyOfs = o; + locAtKeyOfs = currLoc(); + } + } + + string BtreeCursor::toString() { + string s = string("BtreeCursor ") + indexDetails.indexName(); + if ( _direction < 0 ) s += " reverse"; + if ( _bounds.get() && _bounds->size() > 1 ) s += " multi"; + return s; + } + + BSONObj BtreeCursor::prettyIndexBounds() const { + if ( !_independentFieldRanges ) { + return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) ); + } + else { + return _bounds->obj(); + } + } + + /* ----------------------------------------------------------------------------- */ + + struct BtreeCursorUnitTest { + BtreeCursorUnitTest() { + assert( minDiskLoc.compare(maxDiskLoc) < 0 ); + } + } btut; + +} // namespace mongo diff --git a/src/mongo/db/cap.cpp b/src/mongo/db/cap.cpp new file mode 100644 index 00000000000..a8be2383115 --- /dev/null +++ b/src/mongo/db/cap.cpp @@ -0,0 +1,457 @@ +// @file cap.cpp capped collection related +// the "old" version (<= v1.6) + +/** +* Copyright (C) 2008 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "pdfile.h" +#include "db.h" +#include "../util/mmap.h" +#include "../util/hashtab.h" +#include "../scripting/engine.h" +#include "btree.h" +#include <algorithm> +#include <list> +#include "json.h" +#include "clientcursor.h" + +/* + capped collection layout + + d's below won't exist if things align perfectly: + + extent1 -> extent2 -> extent3 + ------------------- ----------------------- --------------------- + d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d + ^ ^ + oldest newest + + ^cappedFirstDeletedInCurExtent() + ^cappedLastDelRecLastExtent() + ^cappedListOfAllDeletedRecords() +*/ + + +namespace mongo { + + /* combine adjacent deleted records *for the current extent* of the capped collection + + this is O(n^2) but we call it for capped tables where typically n==1 or 2! + (or 3...there will be a little unused sliver at the end of the extent.) + */ + void NamespaceDetails::compact() { + assert(capped); + + list<DiskLoc> drecs; + + // Pull out capExtent's DRs from deletedList + DiskLoc i = cappedFirstDeletedInCurExtent(); + for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted ) + drecs.push_back( i ); + + getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i; + + // This is the O(n^2) part. + drecs.sort(); + + list<DiskLoc>::iterator j = drecs.begin(); + assert( j != drecs.end() ); + DiskLoc a = *j; + while ( 1 ) { + j++; + if ( j == drecs.end() ) { + DEBUGGING out() << "TEMP: compact adddelrec\n"; + addDeletedRec(a.drec(), a); + break; + } + DiskLoc b = *j; + while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) { + // a & b are adjacent. merge. 
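The merge test above, same file (a.a() == b.a()) plus the end offset of one deleted record equal to the start offset of the next, is ordinary free-list coalescing. The same step on plain (offset, length) pairs within a single file, as a self-contained sketch:

    #include <algorithm>
    #include <vector>

    struct DRec { int ofs, len; }; // a deleted region: [ofs, ofs + len)

    std::vector<DRec> coalesce(std::vector<DRec> drecs) {
        std::sort(drecs.begin(), drecs.end(),
                  [](const DRec& a, const DRec& b) { return a.ofs < b.ofs; });
        std::vector<DRec> out;
        for (const DRec& d : drecs) {
            if (!out.empty() && out.back().ofs + out.back().len == d.ofs)
                out.back().len += d.len; // adjacent: fold into the previous region
            else
                out.push_back(d);
        }
        return out;
    }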
+ getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders; + j++; + if ( j == drecs.end() ) { + DEBUGGING out() << "temp: compact adddelrec2\n"; + addDeletedRec(a.drec(), a); + return; + } + b = *j; + } + DEBUGGING out() << "temp: compact adddelrec3\n"; + addDeletedRec(a.drec(), a); + a = b; + } + } + + DiskLoc &NamespaceDetails::cappedFirstDeletedInCurExtent() { + if ( cappedLastDelRecLastExtent().isNull() ) + return cappedListOfAllDeletedRecords(); + else + return cappedLastDelRecLastExtent().drec()->nextDeleted; + } + + void NamespaceDetails::cappedCheckMigrate() { + // migrate old NamespaceDetails format + assert( capped ); + if ( capExtent.a() == 0 && capExtent.getOfs() == 0 ) { + //capFirstNewRecord = DiskLoc(); + capFirstNewRecord.writing().setInvalid(); + // put all the DeletedRecords in cappedListOfAllDeletedRecords() + for ( int i = 1; i < Buckets; ++i ) { + DiskLoc first = deletedList[ i ]; + if ( first.isNull() ) + continue; + DiskLoc last = first; + for (; !last.drec()->nextDeleted.isNull(); last = last.drec()->nextDeleted ); + last.drec()->nextDeleted.writing() = cappedListOfAllDeletedRecords(); + cappedListOfAllDeletedRecords().writing() = first; + deletedList[i].writing() = DiskLoc(); + } + // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above + + // Last, in case we're killed before getting here + capExtent.writing() = firstExtent; + } + } + + bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const { + assert( !dl.isNull() ); + // We could have a rec or drec, doesn't matter. + bool res = dl.drec()->myExtentLoc(dl) == capExtent; + DEV { + // old implementation. this check is temp to test works the same. new impl should be a little faster. + assert( res == (dl.drec()->myExtent( dl ) == capExtent.ext()) ); + } + return res; + } + + bool NamespaceDetails::nextIsInCapExtent( const DiskLoc &dl ) const { + assert( !dl.isNull() ); + DiskLoc next = dl.drec()->nextDeleted; + if ( next.isNull() ) + return false; + return inCapExtent( next ); + } + + void NamespaceDetails::advanceCapExtent( const char *ns ) { + // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent + // (or DiskLoc() if new capExtent == firstExtent) + if ( capExtent == lastExtent ) + getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc(); + else { + DiskLoc i = cappedFirstDeletedInCurExtent(); + for (; !i.isNull() && nextIsInCapExtent( i ); i = i.drec()->nextDeleted ); + getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = i; + } + + getDur().writingDiskLoc( capExtent ) = theCapExtent()->xnext.isNull() ? firstExtent : theCapExtent()->xnext; + + /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */ + //dassert( theCapExtent()->ns == ns ); + + theCapExtent()->assertOk(); + getDur().writingDiskLoc( capFirstNewRecord ) = DiskLoc(); + } + + DiskLoc NamespaceDetails::__capAlloc( int len ) { + DiskLoc prev = cappedLastDelRecLastExtent(); + DiskLoc i = cappedFirstDeletedInCurExtent(); + DiskLoc ret; + for (; !i.isNull() && inCapExtent( i ); prev = i, i = i.drec()->nextDeleted ) { + // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(), + // so make sure there's space to create a DR at the end. 
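In other words, a deleted record is only usable if the requested length plus a minimal residual DeletedRecord still fits; the 24 bytes below play that headroom role. The same first-fit-with-headroom scan in isolation (toy types, not the mongod ones):

    #include <vector>

    struct Free { int ofs, len; };

    // Index of the first free chunk that can hold len plus headroom, else -1.
    int firstFit(const std::vector<Free>& freeList, int len, int headroom = 24) {
        for (size_t i = 0; i < freeList.size(); ++i)
            if (freeList[i].len >= len + headroom)
                return static_cast<int>(i);
        return -1;
    }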
if ( i.drec()->lengthWithHeaders >= len + 24 ) { + ret = i; + break; + } + } + + /* unlink ourselves from the deleted list */ + if ( !ret.isNull() ) { + if ( prev.isNull() ) + cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted; + else + prev.drec()->nextDeleted.writing() = ret.drec()->nextDeleted; + ret.drec()->nextDeleted.writing().setInvalid(); // defensive. + assert( ret.drec()->extentOfs < ret.getOfs() ); + } + + return ret; + } + + DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) { + // signal done allocating new extents. + if ( !cappedLastDelRecLastExtent().isValid() ) + getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc(); + + assert( len < 400000000 ); + int passes = 0; + int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog + if ( maxPasses < 5000 ) { + // this is for backwards safety since 5000 was the old value + maxPasses = 5000; + } + DiskLoc loc; + + // delete records until we have room and the max # objects limit is achieved. + + /* this fails on a rename -- that is ok but must keep commented out */ + //assert( theCapExtent()->ns == ns ); + + theCapExtent()->assertOk(); + DiskLoc firstEmptyExtent; + while ( 1 ) { + if ( stats.nrecords < max ) { + loc = __capAlloc( len ); + if ( !loc.isNull() ) + break; + } + + // If on first iteration through extents, don't delete anything. + if ( !capFirstNewRecord.isValid() ) { + advanceCapExtent( ns ); + + if ( capExtent != firstExtent ) + capFirstNewRecord.writing().setInvalid(); + // else signal done with first iteration through extents. + continue; + } + + if ( !capFirstNewRecord.isNull() && + theCapExtent()->firstRecord == capFirstNewRecord ) { + // We've deleted all records that were allocated on the previous + // iteration through this extent. + advanceCapExtent( ns ); + continue; + } + + if ( theCapExtent()->firstRecord.isNull() ) { + if ( firstEmptyExtent.isNull() ) + firstEmptyExtent = capExtent; + advanceCapExtent( ns ); + if ( firstEmptyExtent == capExtent ) { + maybeComplain( ns, len ); + return DiskLoc(); + } + continue; + } + + DiskLoc fr = theCapExtent()->firstRecord; + theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); // ZZZZZZZZZZZZ + compact(); + if( ++passes > maxPasses ) { + log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n'; + log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl; + massert( 10345 , "passes >= maxPasses in capped collection alloc", false ); + } + } + + // Remember first record allocated on this iteration through capExtent. 
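At heart, the cappedAlloc() loop above is ring-buffer allocation: evict from the oldest end until the new record fits, with maxPasses as a safety valve against runaway deletion. A stripped-down model over a deque of record sizes (extents, durability and the max-documents check all omitted):

    #include <deque>
    #include <stdexcept>

    // Returns the new byte count in use, or -1 if nothing is left to evict.
    int allocCapped(std::deque<int>& records, int capacity, int used, int len, int maxPasses) {
        int passes = 0;
        while (used + len > capacity) {
            if (records.empty())
                return -1; // cf. the firstEmptyExtent bail-out above
            if (++passes > maxPasses)
                throw std::runtime_error("passes >= maxPasses in capped collection alloc");
            used -= records.front(); // delete the oldest record, as
            records.pop_front();     // deleteRecord() + compact() do above
        }
        records.push_back(len);
        return used + len;
    }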
+ if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() ) + getDur().writingDiskLoc(capFirstNewRecord) = loc; + + return loc; + } + + void NamespaceDetails::dumpExtents() { + cout << "dumpExtents:" << endl; + for ( DiskLoc i = firstExtent; !i.isNull(); i = i.ext()->xnext ) { + Extent *e = i.ext(); + stringstream ss; + e->dump(ss); + cout << ss.str() << endl; + } + } + + void NamespaceDetails::cappedDumpDelInfo() { + cout << "dl[0]: " << deletedList[0].toString() << endl; + for( DiskLoc z = deletedList[0]; !z.isNull(); z = z.drec()->nextDeleted ) { + cout << " drec:" << z.toString() << " dreclen:" << hex << z.drec()->lengthWithHeaders << + " ext:" << z.drec()->myExtent(z)->myLoc.toString() << endl; + } + cout << "dl[1]: " << deletedList[1].toString() << endl; + } + + void NamespaceDetails::cappedTruncateLastDelUpdate() { + if ( capExtent == firstExtent ) { + // Only one extent of the collection is in use, so there + // is no deleted record in a previous extent, so nullify + // cappedLastDelRecLastExtent(). + cappedLastDelRecLastExtent().writing() = DiskLoc(); + } + else { + // Scan through all deleted records in the collection + // until the last deleted record for the extent prior + // to the new capExtent is found. Then set + // cappedLastDelRecLastExtent() to that deleted record. + DiskLoc i = cappedListOfAllDeletedRecords(); + for( ; + !i.drec()->nextDeleted.isNull() && + !inCapExtent( i.drec()->nextDeleted ); + i = i.drec()->nextDeleted ); + // In our capped storage model, every extent must have at least one + // deleted record. Here we check that 'i' is not the last deleted + // record. (We expect that there will be deleted records in the new + // capExtent as well.) + assert( !i.drec()->nextDeleted.isNull() ); + cappedLastDelRecLastExtent().writing() = i; + } + } + + void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) { + DEV assert( this == nsdetails(ns) ); + assert( cappedLastDelRecLastExtent().isValid() ); + + // We iteratively remove the newest document until the newest document + // is 'end', then we remove 'end' if requested. + bool foundLast = false; + while( 1 ) { + if ( foundLast ) { + // 'end' has been found and removed, so break. + break; + } + getDur().commitIfNeeded(); + // 'curr' will point to the newest document in the collection. + DiskLoc curr = theCapExtent()->lastRecord; + assert( !curr.isNull() ); + if ( curr == end ) { + if ( inclusive ) { + // 'end' has been found, so break next iteration. + foundLast = true; + } + else { + // 'end' has been found, so break. + break; + } + } + + // TODO The algorithm used in this function cannot generate an + // empty collection, but we could call emptyCappedCollection() in + // this case instead of asserting. + uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 ); + + // Delete the newest record, and coalesce the new deleted + // record with existing deleted records. + theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true); + compact(); + + // This is the case where we have not yet had to remove any + // documents to make room for other documents, and we are allocating + // documents from free space in fresh extents instead of reusing + // space from familiar extents. + if ( !capLooped() ) { + + // We just removed the last record from the 'capExtent', and + // the 'capExtent' can't be empty, so we set 'capExtent' to + // capExtent's prev extent. 
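Stripped of the extent bookkeeping, cappedTruncateAfter() is a guarded pop-from-the-newest-end loop. Its observable effect on a simple sequence of record ids, as a sketch (throwing where the real code uasserts that the collection may not be emptied):

    #include <deque>
    #include <stdexcept>

    void truncateAfter(std::deque<int>& records, int end, bool inclusive) {
        while (!records.empty() && records.back() != end) {
            if (records.size() <= 1)
                throw std::runtime_error("emptying the collection is not allowed");
            records.pop_back(); // delete the newest record
        }
        if (inclusive && !records.empty()) {
            if (records.size() <= 1)
                throw std::runtime_error("emptying the collection is not allowed");
            records.pop_back(); // remove 'end' itself
        }
    }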
+ if ( theCapExtent()->lastRecord.isNull() ) { + assert( !theCapExtent()->xprev.isNull() ); + // NOTE Because we didn't delete the last document, and + // capLooped() is false, capExtent is not the first extent + // so xprev will be nonnull. + capExtent.writing() = theCapExtent()->xprev; + theCapExtent()->assertOk(); + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(); + } + continue; + } + + // This is the case where capLooped() is true, and we just deleted + // from capExtent, and we just deleted capFirstNewRecord, which was + // the last record on the fresh side of capExtent. + // NOTE In this comparison, curr and potentially capFirstNewRecord + // may point to invalid data, but we can still compare the + // references themselves. + if ( curr == capFirstNewRecord ) { + + // Set 'capExtent' to the first nonempty extent prior to the + // initial capExtent. There must be such an extent because we + // have not deleted the last document in the collection. It is + // possible that all extents other than the capExtent are empty. + // In this case we will keep the initial capExtent and specify + // that all records contained within are on the fresh rather than + // stale side of the extent. + DiskLoc newCapExtent = capExtent; + do { + // Find the previous extent, looping if necessary. + newCapExtent = ( newCapExtent == firstExtent ) ? lastExtent : newCapExtent.ext()->xprev; + newCapExtent.ext()->assertOk(); + } + while ( newCapExtent.ext()->firstRecord.isNull() ); + capExtent.writing() = newCapExtent; + + // Place all documents in the new capExtent on the fresh side + // of the capExtent by setting capFirstNewRecord to the first + // document in the new capExtent. + capFirstNewRecord.writing() = theCapExtent()->firstRecord; + + // update cappedLastDelRecLastExtent() + cappedTruncateLastDelUpdate(); + } + } + } + + void NamespaceDetails::emptyCappedCollection( const char *ns ) { + DEV assert( this == nsdetails(ns) ); + massert( 13424, "collection must be capped", capped ); + massert( 13425, "background index build in progress", !indexBuildInProgress ); + massert( 13426, "indexes present", nIndexes == 0 ); + + // Clear all references to this namespace. + ClientCursor::invalidate( ns ); + NamespaceDetailsTransient::clearForPrefix( ns ); + + // Get a writeable reference to 'this' and reset all pertinent + // attributes. + NamespaceDetails *t = writingWithoutExtra(); + + t->cappedLastDelRecLastExtent() = DiskLoc(); + t->cappedListOfAllDeletedRecords() = DiskLoc(); + + // preserve firstExtent/lastExtent + t->capExtent = firstExtent; + t->stats.datasize = stats.nrecords = 0; + // lastExtentSize preserve + // nIndexes preserve 0 + // capped preserve true + // max preserve + t->paddingFactor = 1.0; + t->flags = 0; + t->capFirstNewRecord = DiskLoc(); + t->capFirstNewRecord.setInvalid(); + t->cappedLastDelRecLastExtent().setInvalid(); + // dataFileVersion preserve + // indexFileVersion preserve + t->multiKeyIndexBits = 0; + t->reservedA = 0; + t->extraOffset = 0; + // indexBuildInProgress preserve 0 + memset(t->reserved, 0, sizeof(t->reserved)); + + // Reset all existing extents and recreate the deleted list. 
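The loop that follows has to thread carefully: Extent::reuse() reinitializes the extent header, chain links included, so the links are saved first and written back afterwards. The same save/restore pattern on a toy doubly linked list:

    struct Ext {
        Ext *xprev, *xnext;
        int firstRecord; // stand-in for the extent's contents
        void reuse() { xprev = xnext = 0; firstRecord = -1; } // wipes the links too
    };

    void resetChain(Ext* first) {
        for (Ext* e = first; e; ) {
            Ext* prev = e->xprev; // save the links before reuse() clears them
            Ext* next = e->xnext;
            e->reuse();
            e->xprev = prev;      // stitch the chain back together
            e->xnext = next;
            e = next;
        }
    }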
+ for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) { + DiskLoc prev = ext.ext()->xprev; + DiskLoc next = ext.ext()->xnext; + DiskLoc empty = ext.ext()->reuse( ns, true ); + ext.ext()->xprev.writing() = prev; + ext.ext()->xnext.writing() = next; + addDeletedRec( empty.drec(), empty ); + } + } + +} diff --git a/src/mongo/db/client.cpp b/src/mongo/db/client.cpp new file mode 100644 index 00000000000..92b78d87ee5 --- /dev/null +++ b/src/mongo/db/client.cpp @@ -0,0 +1,697 @@ +// client.cpp + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* Client represents a connection to the database (the server-side) and corresponds + to an open socket (or logical connection if pooling on sockets) from a client. +*/ + +#include "pch.h" +#include "db.h" +#include "client.h" +#include "curop-inl.h" +#include "json.h" +#include "security.h" +#include "commands.h" +#include "instance.h" +#include "../s/d_logic.h" +#include "dbwebserver.h" +#include "../util/mongoutils/html.h" +#include "../util/mongoutils/checksum.h" +#include "../util/file_allocator.h" +#include "repl/rs.h" +#include "../scripting/engine.h" + +namespace mongo { + + Client* Client::syncThread; + mongo::mutex Client::clientsMutex("clientsMutex"); + set<Client*> Client::clients; // always be in clientsMutex when manipulating this + + TSP_DEFINE(Client, currentClient) + +#if defined(_DEBUG) + struct StackChecker; + ThreadLocalValue<StackChecker *> checker; + + struct StackChecker { + enum { SZ = 256 * 1024 }; + char buf[SZ]; + StackChecker() { + checker.set(this); + } + void init() { + memset(buf, 42, sizeof(buf)); + } + static void check(const char *tname) { + static int max; + StackChecker *sc = checker.get(); + const char *p = sc->buf; + int i = 0; + for( ; i < SZ; i++ ) { + if( p[i] != 42 ) + break; + } + int z = SZ-i; + if( z > max ) { + max = z; + log() << "thread " << tname << " stack usage was " << z << " bytes" << endl; + } + wassert( i > 16000 ); + } + }; +#endif + + /* each thread which does db operations has a Client object in TLS. + call this when your thread starts. + */ +#if defined _DEBUG + static unsigned long long nThreads = 0; + void assertStartingUp() { + assert( nThreads <= 1 ); + } +#else + void assertStartingUp() { } +#endif + + Client& Client::initThread(const char *desc, AbstractMessagingPort *mp) { +#if defined(_DEBUG) + { + nThreads++; // never decremented. 
this is for casi class asserts + if( sizeof(void*) == 8 ) { + StackChecker sc; + sc.init(); + } + } +#endif + assert( currentClient.get() == 0 ); + Client *c = new Client(desc, mp); + currentClient.reset(c); + mongo::lastError.initThread(); + return *c; + } + + Client::Client(const char *desc, AbstractMessagingPort *p) : + _context(0), + _shutdown(false), + _desc(desc), + _god(0), + _lastOp(0), + _mp(p), + _sometimes(0) + { + _hasWrittenThisPass = false; + _pageFaultRetryableSection = 0; + _connectionId = setThreadName(desc); + _curOp = new CurOp( this ); +#ifndef _WIN32 + stringstream temp; + temp << hex << showbase << pthread_self(); + _threadId = temp.str(); +#endif + scoped_lock bl(clientsMutex); + clients.insert(this); + } + + Client::~Client() { + _god = 0; + + if ( _context ) + error() << "Client::~Client _context should be null but is not; client:" << _desc << endl; + + if ( ! _shutdown ) { + error() << "Client::shutdown not called: " << _desc << endl; + } + + if ( ! inShutdown() ) { + // we can't clean up safely once we're in shutdown + scoped_lock bl(clientsMutex); + if ( ! _shutdown ) + clients.erase(this); + delete _curOp; + } + } + + bool Client::shutdown() { +#if defined(_DEBUG) + { + if( sizeof(void*) == 8 ) { + StackChecker::check( desc() ); + } + } +#endif + _shutdown = true; + if ( inShutdown() ) + return false; + { + scoped_lock bl(clientsMutex); + clients.erase(this); + if ( isSyncThread() ) { + syncThread = 0; + } + } + + return false; + } + + BSONObj CachedBSONObj::_tooBig = fromjson("{\"$msg\":\"query not recording (too large)\"}"); + Client::Context::Context( string ns , Database * db, bool doauth ) : + _client( currentClient.get() ), + _oldContext( _client->_context ), + _path( mongo::dbpath ), // is this right? could be a different db? may need a dassert for this + _justCreated(false), + _ns( ns ), + _db(db) + { + assert( db == 0 || db->isOk() ); + _client->_context = this; + checkNsAccess( doauth ); + _client->checkLocks(); + } + + Client::Context::Context(const string& ns, string path , bool doauth ) : + _client( currentClient.get() ), + _oldContext( _client->_context ), + _path( path ), + _justCreated(false), // set for real in finishInit + _ns( ns ), + _db(0) + { + _finishInit( doauth ); + _client->checkLocks(); + } + + /** "read lock, and set my context, all in one operation" + * This handles (if not recursively locked) opening an unopened database. + */ + Client::ReadContext::ReadContext(const string& ns, string path, bool doauth ) { + { + lk.reset( new _LockCollectionForReading(ns) ); + Database *db = dbHolder().get(ns, path); + if( db ) { + c.reset( new Context(path, ns, db, doauth) ); + return; + } + } + + // we usually don't get here, so doesn't matter how fast this part is + { + int x = d.dbMutex.getState(); + if( x > 0 ) { + // write locked already + DEV RARELY log() << "write locked on ReadContext construction " << ns << endl; + c.reset( new Context(ns, path, doauth) ); + } + else if( x == -1 ) { + lk.reset(0); + { + writelock w; + Context c(ns, path, doauth); + } + // db could be closed at this interim point -- that is ok, we will throw, and don't mind throwing. + lk.reset( new _LockCollectionForReading(ns) ); + c.reset( new Context(ns, path, doauth) ); + } + else { + assert( x < -1 ); + uasserted(15928, str::stream() << "can't open a database from a nested read lock " << ns); + } + } + + // todo: are receipts of thousands of queries for a nonexisting database a potential + // cause of bad performance due to the write lock acquisition above? 
let's fix that. + // it would be easy to first check that there is at least a .ns file, or something similar. + } + + void Client::Context::checkNotStale() const { + switch ( _client->_curOp->getOp() ) { + case dbGetMore: // getMore's are special and should be handled else where + case dbUpdate: // update & delete check shard version in instance.cpp, so don't check here as well + case dbDelete: + break; + default: { + string errmsg; + if ( ! shardVersionOk( _ns , errmsg ) ) { + ostringstream os; + os << "[" << _ns << "] shard version not ok in Client::Context: " << errmsg; + throw SendStaleConfigException( _ns, os.str() ); + } + } + } + } + + // invoked from ReadContext + Client::Context::Context(const string& path, const string& ns, Database *db , bool doauth) : + _client( currentClient.get() ), + _oldContext( _client->_context ), + _path( path ), + _justCreated(false), + _ns( ns ), + _db(db) + { + assert(_db); + checkNotStale(); + _client->_context = this; + _client->_curOp->enter( this ); + checkNsAccess( doauth, d.dbMutex.getState() ); + _client->checkLocks(); + } + + void Client::Context::_finishInit( bool doauth ) { + int lockState = d.dbMutex.getState(); + assert( lockState ); + if ( lockState > 0 && FileAllocator::get()->hasFailed() ) { + uassert(14031, "Can't take a write lock while out of disk space", false); + } + + _db = dbHolderUnchecked().getOrCreate( _ns , _path , _justCreated ); + assert(_db); + checkNotStale(); + _client->_context = this; + _client->_curOp->enter( this ); + checkNsAccess( doauth, lockState ); + } + + void Client::Context::_auth( int lockState ) { + if ( _client->_ai.isAuthorizedForLock( _db->name , lockState ) ) + return; + + // before we assert, do a little cleanup + _client->_context = _oldContext; // note: _oldContext may be null + + stringstream ss; + ss << "unauthorized db:" << _db->name << " lock type:" << lockState << " client:" << _client->clientAddress(); + uasserted( 10057 , ss.str() ); + } + + Client::Context::~Context() { + DEV assert( _client == currentClient.get() ); + _client->_curOp->leave( this ); + _client->_context = _oldContext; // note: _oldContext may be null + } + + bool Client::Context::inDB( const string& db , const string& path ) const { + if ( _path != path ) + return false; + + if ( db == _ns ) + return true; + + string::size_type idx = _ns.find( db ); + if ( idx != 0 ) + return false; + + return _ns[db.size()] == '.'; + } + + void Client::Context::checkNsAccess( bool doauth, int lockState ) { + if ( 0 ) { // SERVER-4276 + uassert( 15929, "client access to index backing namespace prohibited", NamespaceString::normal( _ns.c_str() ) ); + } + if ( doauth ) { + _auth( lockState ); + } + } + + void Client::appendLastOp( BSONObjBuilder& b ) const { + // _lastOp is never set if replication is off + if( theReplSet || ! 
_lastOp.isNull() ) { + b.appendTimestamp( "lastOp" , _lastOp.asDate() ); + } + } + + string Client::clientAddress(bool includePort) const { + if( _curOp ) + return _curOp->getRemoteString(includePort); + return ""; + } + + string Client::toString() const { + stringstream ss; + if ( _curOp ) + ss << _curOp->infoNoauth().jsonString(); + return ss.str(); + } + + string sayClientState() { + Client* c = currentClient.get(); + if ( !c ) + return "no client"; + return c->toString(); + } + + Client* curopWaitingForLock( int type ) { + Client * c = currentClient.get(); + assert( c ); + CurOp * co = c->curop(); + if ( co ) { + co->waitingForLock( type ); + } + return c; + } + void curopGotLock(Client *c) { + assert(c); + CurOp * co = c->curop(); + if ( co ) + co->gotLock(); + } + + void KillCurrentOp::interruptJs( AtomicUInt *op ) { + if ( !globalScriptEngine ) + return; + if ( !op ) { + globalScriptEngine->interruptAll(); + } + else { + globalScriptEngine->interrupt( *op ); + } + } + + void KillCurrentOp::killAll() { + _globalKill = true; + interruptJs( 0 ); + } + + void KillCurrentOp::kill(AtomicUInt i) { + bool found = false; + { + scoped_lock l( Client::clientsMutex ); + for( set< Client* >::const_iterator j = Client::clients.begin(); !found && j != Client::clients.end(); ++j ) { + for( CurOp *k = ( *j )->curop(); !found && k; k = k->parent() ) { + if ( k->opNum() == i ) { + k->kill(); + for( CurOp *l = ( *j )->curop(); l != k; l = l->parent() ) { + l->kill(); + } + found = true; + } + } + } + } + if ( found ) { + interruptJs( &i ); + } + } + + void Client::gotHandshake( const BSONObj& o ) { + BSONObjIterator i(o); + + { + BSONElement id = i.next(); + assert( id.type() ); + _remoteId = id.wrap( "_id" ); + } + + BSONObjBuilder b; + while ( i.more() ) + b.append( i.next() ); + + b.appendElementsUnique( _handshake ); + + _handshake = b.obj(); + + if (theReplSet && o.hasField("member")) { + theReplSet->ghost->associateSlave(_remoteId, o["member"].Int()); + } + } + + ClientBasic* ClientBasic::getCurrent() { + return currentClient.get(); + } + + class HandshakeCmd : public Command { + public: + void help(stringstream& h) const { h << "internal"; } + HandshakeCmd() : Command( "handshake" ) {} + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return false; } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + Client& c = cc(); + c.gotHandshake( cmdObj ); + return 1; + } + + } handshakeCmd; + + class ClientListPlugin : public WebStatusPlugin { + public: + ClientListPlugin() : WebStatusPlugin( "clients" , 20 ) {} + virtual void init() {} + + virtual void run( stringstream& ss ) { + using namespace mongoutils::html; + + ss << "\n<table border=1 cellpadding=2 cellspacing=0>"; + ss << "<tr align='left'>" + << th( a("", "Connections to the database, both internal and external.", "Client") ) + << th( a("http://www.mongodb.org/display/DOCS/Viewing+and+Terminating+Current+Operation", "", "OpId") ) + << "<th>Active</th>" + << "<th>LockType</th>" + << "<th>Waiting</th>" + << "<th>SecsRunning</th>" + << "<th>Op</th>" + << th( a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "", "Namespace") ) + << "<th>Query</th>" + << "<th>client</th>" + << "<th>msg</th>" + << "<th>progress</th>" + + << "</tr>\n"; + { + scoped_lock bl(Client::clientsMutex); + for( set<Client*>::iterator i = Client::clients.begin(); i != 
Client::clients.end(); i++ ) { + Client *c = *i; + CurOp& co = *(c->curop()); + ss << "<tr><td>" << c->desc() << "</td>"; + + tablecell( ss , co.opNum() ); + tablecell( ss , co.active() ); + { + int lt = co.getLockType(); + if( lt == -1 ) tablecell(ss, "R"); + else if( lt == 1 ) tablecell(ss, "W"); + else + tablecell( ss , lt); + } + tablecell( ss , co.isWaitingForLock() ); + if ( co.active() ) + tablecell( ss , co.elapsedSeconds() ); + else + tablecell( ss , "" ); + tablecell( ss , co.getOp() ); + tablecell( ss , co.getNS() ); + if ( co.haveQuery() ) { + tablecell( ss , co.query() ); + } + else + tablecell( ss , "" ); + tablecell( ss , co.getRemoteString() ); + + tablecell( ss , co.getMessage() ); + tablecell( ss , co.getProgressMeter().toString() ); + + + ss << "</tr>\n"; + } + } + ss << "</table>\n"; + + } + + } clientListPlugin; + + int Client::recommendedYieldMicros( int * writers , int * readers ) { + int num = 0; + int w = 0; + int r = 0; + { + scoped_lock bl(clientsMutex); + for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) { + Client* c = *i; + if ( c->curop()->isWaitingForLock() ) { + num++; + if ( c->curop()->getLockType() > 0 ) + w++; + else + r++; + } + } + } + + if ( writers ) + *writers = w; + if ( readers ) + *readers = r; + + int time = r * 100; + time += w * 500; + + time = min( time , 1000000 ); + + // if there has been a kill request for this op - we should yield to allow the op to stop + // This function returns empty string if we aren't interrupted + if ( *killCurrentOp.checkForInterruptNoAssert() ) { + return 100; + } + + return time; + } + + int Client::getActiveClientCount( int& writers, int& readers ) { + writers = 0; + readers = 0; + + scoped_lock bl(clientsMutex); + for ( set<Client*>::iterator i=clients.begin(); i!=clients.end(); ++i ) { + Client* c = *i; + if ( ! c->curop()->active() ) + continue; + + int l = c->curop()->getLockType(); + if ( l > 0 ) + writers++; + else if ( l < 0 ) + readers++; + + } + + return writers + readers; + } + + void OpDebug::reset() { + extra.reset(); + + op = 0; + iscommand = false; + ns = ""; + query = BSONObj(); + updateobj = BSONObj(); + + cursorid = -1; + ntoreturn = -1; + ntoskip = -1; + exhaust = false; + + nscanned = -1; + idhack = false; + scanAndOrder = false; + moved = false; + fastmod = false; + fastmodinsert = false; + upsert = false; + keyUpdates = 0; // unsigned, so -1 not possible + + exceptionInfo.reset(); + + executionTime = 0; + nreturned = -1; + responseLength = -1; + } + + +#define OPDEBUG_TOSTRING_HELP(x) if( x >= 0 ) s << " " #x ":" << (x) +#define OPDEBUG_TOSTRING_HELP_BOOL(x) if( x ) s << " " #x ":" << (x) + string OpDebug::toString() const { + StringBuilder s( ns.size() + 64 ); + if ( iscommand ) + s << "command "; + else + s << opToString( op ) << ' '; + s << ns.toString(); + + if ( ! query.isEmpty() ) { + if ( iscommand ) + s << " command: "; + else + s << " query: "; + s << query.toString(); + } + + if ( ! 
updateobj.isEmpty() ) { + s << " update: "; + updateobj.toString( s ); + } + + OPDEBUG_TOSTRING_HELP( cursorid ); + OPDEBUG_TOSTRING_HELP( ntoreturn ); + OPDEBUG_TOSTRING_HELP( ntoskip ); + OPDEBUG_TOSTRING_HELP_BOOL( exhaust ); + + OPDEBUG_TOSTRING_HELP( nscanned ); + OPDEBUG_TOSTRING_HELP_BOOL( idhack ); + OPDEBUG_TOSTRING_HELP_BOOL( scanAndOrder ); + OPDEBUG_TOSTRING_HELP_BOOL( moved ); + OPDEBUG_TOSTRING_HELP_BOOL( fastmod ); + OPDEBUG_TOSTRING_HELP_BOOL( fastmodinsert ); + OPDEBUG_TOSTRING_HELP_BOOL( upsert ); + OPDEBUG_TOSTRING_HELP( keyUpdates ); + + if ( extra.len() ) + s << " " << extra.str(); + + if ( ! exceptionInfo.empty() ) { + s << " exception: " << exceptionInfo.msg; + if ( exceptionInfo.code ) + s << " code:" << exceptionInfo.code; + } + + OPDEBUG_TOSTRING_HELP( nreturned ); + if ( responseLength ) + s << " reslen:" << responseLength; + s << " " << executionTime << "ms"; + + return s.str(); + } + +#define OPDEBUG_APPEND_NUMBER(x) if( x != -1 ) b.append( #x , (x) ) +#define OPDEBUG_APPEND_BOOL(x) if( x ) b.appendBool( #x , (x) ) + void OpDebug::append( const CurOp& curop, BSONObjBuilder& b ) const { + b.append( "op" , iscommand ? "command" : opToString( op ) ); + b.append( "ns" , ns.toString() ); + if ( ! query.isEmpty() ) + b.append( iscommand ? "command" : "query" , query ); + else if ( ! iscommand && curop.haveQuery() ) + curop.appendQuery( b , "query" ); + + if ( ! updateobj.isEmpty() ) + b.append( "updateobj" , updateobj ); + + OPDEBUG_APPEND_NUMBER( cursorid ); + OPDEBUG_APPEND_NUMBER( ntoreturn ); + OPDEBUG_APPEND_NUMBER( ntoskip ); + OPDEBUG_APPEND_BOOL( exhaust ); + + OPDEBUG_APPEND_NUMBER( nscanned ); + OPDEBUG_APPEND_BOOL( idhack ); + OPDEBUG_APPEND_BOOL( scanAndOrder ); + OPDEBUG_APPEND_BOOL( moved ); + OPDEBUG_APPEND_BOOL( fastmod ); + OPDEBUG_APPEND_BOOL( fastmodinsert ); + OPDEBUG_APPEND_BOOL( upsert ); + OPDEBUG_APPEND_NUMBER( keyUpdates ); + + if ( ! exceptionInfo.empty() ) + exceptionInfo.append( b , "exception" , "exceptionCode" ); + + OPDEBUG_APPEND_NUMBER( nreturned ); + OPDEBUG_APPEND_NUMBER( responseLength ); + b.append( "millis" , executionTime ); + + } + +} diff --git a/src/mongo/db/client.h b/src/mongo/db/client.h new file mode 100644 index 00000000000..6aa8bc00f02 --- /dev/null +++ b/src/mongo/db/client.h @@ -0,0 +1,286 @@ +/* @file db/client.h + + "Client" represents a connection to the database (the server-side) and corresponds + to an open socket (or logical connection if pooling on sockets) from a client. + + todo: switch to asio...this will fit nicely with that. +*/ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "../pch.h" +#include "security.h" +#include "namespace-inl.h" +#include "lasterror.h" +#include "stats/top.h" +#include "../db/client_common.h" +#include "../util/concurrency/threadlocal.h" +#include "../util/net/message_port.h" +#include "../util/concurrency/rwlock.h" +#include "d_concurrency.h" + +namespace mongo { + + extern class ReplSet *theReplSet; + class AuthenticationInfo; + class Database; + class CurOp; + class Command; + class Client; + class AbstractMessagingPort; + class LockCollectionForReading; + class PageFaultRetryableSection; + +#if defined(CLC) + typedef LockCollectionForReading _LockCollectionForReading; +#else + typedef readlock _LockCollectionForReading; +#endif + + TSP_DECLARE(Client, currentClient) + + typedef long long ConnectionId; + + /** the database's concept of an outside "client" */ + class Client : public ClientBasic { + static Client *syncThread; + public: + // always be in clientsMutex when manipulating this. killop stuff uses these. + static set<Client*> clients; + static mongo::mutex clientsMutex; + static int getActiveClientCount( int& writers , int& readers ); + class Context; + ~Client(); + static int recommendedYieldMicros( int * writers = 0 , int * readers = 0 ); + + /** each thread which does db operations has a Client object in TLS. + * call this when your thread starts. + */ + static Client& initThread(const char *desc, AbstractMessagingPort *mp = 0); + + static void initThreadIfNotAlready(const char *desc) { + if( currentClient.get() ) + return; + initThread(desc); + } + + /** this has to be called as the client goes away, but before thread termination + * @return true if anything was done + */ + bool shutdown(); + + /** set so isSyncThread() works */ + void iAmSyncThread() { + wassert( syncThread == 0 ); + syncThread = this; + } + /** @return true if this client is the replication secondary pull thread. not used much, is used in create index sync code. */ + bool isSyncThread() const { return this == syncThread; } + + string clientAddress(bool includePort=false) const; + const AuthenticationInfo * getAuthenticationInfo() const { return &_ai; } + AuthenticationInfo * getAuthenticationInfo() { return &_ai; } + bool isAdmin() { return _ai.isAuthorized( "admin" ); } + CurOp* curop() const { return _curOp; } + Context* getContext() const { return _context; } + Database* database() const { return _context ? _context->db() : 0; } + const char *ns() const { return _context->ns(); } + const char *desc() const { return _desc; } + void setLastOp( OpTime op ) { _lastOp = op; } + OpTime getLastOp() const { return _lastOp; } + + /** caution -- use Context class instead */ + void setContext(Context *c) { _context = c; } + + /* report what the last operation was. 
used by getlasterror */ + void appendLastOp( BSONObjBuilder& b ) const; + + bool isGod() const { return _god; } /* this is for map/reduce writes */ + string toString() const; + void gotHandshake( const BSONObj& o ); + bool hasRemote() const { return _mp; } + HostAndPort getRemote() const { assert( _mp ); return _mp->remote(); } + BSONObj getRemoteID() const { return _remoteId; } + BSONObj getHandshake() const { return _handshake; } + AbstractMessagingPort * port() const { return _mp; } + ConnectionId getConnectionId() const { return _connectionId; } + private: + Client(const char *desc, AbstractMessagingPort *p = 0); + friend class CurOp; + ConnectionId _connectionId; // > 0 for things "conn", 0 otherwise + string _threadId; // "" on unsupported systems + CurOp * _curOp; + Context * _context; + bool _shutdown; // to track if Client::shutdown() gets called + const char * const _desc; + bool _god; + AuthenticationInfo _ai; + OpTime _lastOp; + BSONObj _handshake; + BSONObj _remoteId; + AbstractMessagingPort * const _mp; + unsigned _sometimes; + public: + bool _hasWrittenThisPass; + PageFaultRetryableSection *_pageFaultRetryableSection; + + /** the concept here is the same as MONGO_SOMETIMES. however that + macro uses a static that will be shared by all threads, and each + time incremented it might eject that line from the other cpu caches (?), + so the idea is that this is better. + */ + bool sometimes(unsigned howOften) { return ++_sometimes % howOften == 0; } + + /* set _god=true temporarily, safely */ + class GodScope { + bool _prev; + public: + GodScope(); + ~GodScope(); + }; + + //static void assureDatabaseIsOpen(const string& ns, string path=dbpath); + + /** "read lock, and set my context, all in one operation" + * This handles (if not recursively locked) opening an unopened database. + */ + class ReadContext : boost::noncopyable { + public: + ReadContext(const string& ns, string path=dbpath, bool doauth=true ); + Context& ctx() { return *c.get(); } + private: + scoped_ptr<_LockCollectionForReading> lk; + scoped_ptr<Context> c; + }; + + /* Set the database we want to use, then restore the previous one when we finish (go out of scope). + Note this is also helpful if an exception happens, as the state is fixed up. + */ + class Context : boost::noncopyable { + public: + /** this is probably what you want */ + Context(const string& ns, string path=dbpath, bool doauth=true ); + + /** note: this does not call finishInit -- i.e., does not call + shardVersionOk() for example. + see also: reset(). 
+ */ + Context( string ns , Database * db, bool doauth=true ); + + // used by ReadContext + Context(const string& path, const string& ns, Database *db, bool doauth); + + ~Context(); + Client* getClient() const { return _client; } + Database* db() const { return _db; } + const char * ns() const { return _ns.c_str(); } + bool equals( const string& ns , const string& path=dbpath ) const { return _ns == ns && _path == path; } + + /** @return if the db was created by this Context */ + bool justCreated() const { return _justCreated; } + + /** @return true iff the current Context is using db/path */ + bool inDB( const string& db , const string& path=dbpath ) const; + + void _clear() { // this is sort of an "early destruct" indication, _ns can never be uncleared + const_cast<string&>(_ns).empty(); + _db = 0; + } + + /** call before unlocking, so clear any non-thread safe state + * _db gets restored on the relock + */ + void unlocked() { _db = 0; } + + /** call after going back into the lock, will re-establish non-thread safe stuff */ + void relocked() { _finishInit(); } + + private: + friend class CurOp; + void _finishInit( bool doauth=true); + void _auth( int lockState ); + void checkNotStale() const; + void checkNsAccess( bool doauth, int lockState = d.dbMutex.getState() ); + Client * const _client; + Context * const _oldContext; + const string _path; + bool _justCreated; + const string _ns; + Database * _db; + }; // class Client::Context + + struct LockStatus { + LockStatus(); + string whichCollection; + unsigned excluder, global, collection; + string toString() const; + } lockStatus; + +#if defined(CLC) + void checkLocks() const; +#else + void checkLocks() const { } +#endif + + }; // class Client + + /** get the Client object for this thread. */ + inline Client& cc() { + Client * c = currentClient.get(); + assert( c ); + return *c; + } + + inline Client::GodScope::GodScope() { + _prev = cc()._god; + cc()._god = true; + } + inline Client::GodScope::~GodScope() { cc()._god = _prev; } + + /* this unreadlocks and then writelocks; i.e. it does NOT upgrade inside the + lock (and is thus wrong to use if you need that, which is usually). + that said we use it today for a specific case where the usage is correct. + */ +#if 0 + inline void mongolock::releaseAndWriteLock() { + if( !_writelock ) { + +#if BOOST_VERSION >= 103500 + int s = d.dbMutex.getState(); + if( s != -1 ) { + log() << "error: releaseAndWriteLock() s == " << s << endl; + msgasserted( 12600, "releaseAndWriteLock: unlock_shared failed, probably recursive" ); + } +#endif + + _writelock = true; + d.dbMutex.unlock_shared(); + d.dbMutex.lock(); + + // todo: unlocked() method says to call it before unlocking, not after. so fix this here, + // or fix the doc there. + if ( cc().getContext() ) + cc().getContext()->unlocked(); + } + } +#endif + + inline bool haveClient() { return currentClient.get() > 0; } + +}; diff --git a/src/mongo/db/client_common.h b/src/mongo/db/client_common.h new file mode 100644 index 00000000000..eb70105ef99 --- /dev/null +++ b/src/mongo/db/client_common.h @@ -0,0 +1,47 @@ +// client_common.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +//#include "../pch.h" +//#include "security.h" +#include "../util/net/hostandport.h" + +namespace mongo { + + class AuthenticationInfo; + + /** + * this is the base class for Client and ClientInfo + * Client is for mongod + * Client is for mongos + * They should converge slowly + * The idea is this has the basic api so that not all code has to be duplicated + */ + class ClientBasic : boost::noncopyable { + public: + virtual ~ClientBasic(){} + virtual const AuthenticationInfo * getAuthenticationInfo() const = 0; + virtual AuthenticationInfo * getAuthenticationInfo() = 0; + + virtual bool hasRemote() const = 0; + virtual HostAndPort getRemote() const = 0; + + static ClientBasic* getCurrent(); + }; +} diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp new file mode 100644 index 00000000000..dc04ec38f63 --- /dev/null +++ b/src/mongo/db/clientcursor.cpp @@ -0,0 +1,747 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* clientcursor.cpp + + ClientCursor is a wrapper that represents a cursorid from our database + application's perspective. + + Cursor -- and its derived classes -- are our internal cursors. +*/ + +#include "pch.h" +#include "clientcursor.h" +#include "introspect.h" +#include <time.h> +#include "db.h" +#include "commands.h" +#include "repl_block.h" +#include "../util/processinfo.h" +#include "../util/timer.h" +#include "../server.h" + +namespace mongo { + + CCById ClientCursor::clientCursorsById; + boost::recursive_mutex& ClientCursor::ccmutex( *(new boost::recursive_mutex()) ); + long long ClientCursor::numberTimedOut = 0; + + void aboutToDeleteForSharding( const Database* db , const DiskLoc& dl ); // from s/d_logic.h + + /*static*/ void ClientCursor::assertNoCursors() { + recursive_scoped_lock lock(ccmutex); + if( clientCursorsById.size() ) { + log() << "ERROR clientcursors exist but should not at this point" << endl; + ClientCursor *cc = clientCursorsById.begin()->second; + log() << "first one: " << cc->_cursorid << ' ' << cc->_ns << endl; + clientCursorsById.clear(); + assert(false); + } + } + + + void ClientCursor::setLastLoc_inlock(DiskLoc L) { + assert( _pos != -2 ); // defensive - see ~ClientCursor + + if ( L == _lastLoc ) + return; + + CCByLoc& bl = byLoc(); + + if ( !_lastLoc.isNull() ) { + bl.erase( ByLocKey( _lastLoc, _cursorid ) ); + } + + if ( !L.isNull() ) + bl[ByLocKey(L,_cursorid)] = this; + _lastLoc = L; + } + + /* ------------------------------------------- */ + + /* must call this when a btree node is updated */ + //void removedKey(const DiskLoc& btreeLoc, int keyPos) { + //} + + // ns is either a full namespace or "dbname." 
when invalidating for a whole db + void ClientCursor::invalidate(const char *ns) { + d.dbMutex.assertWriteLocked(); + int len = strlen(ns); + const char* dot = strchr(ns, '.'); + assert( len > 0 && dot); + + bool isDB = (dot == &ns[len-1]); // first (and only) dot is the last char + + { + //cout << "\nTEMP invalidate " << ns << endl; + recursive_scoped_lock lock(ccmutex); + + Database *db = cc().database(); + assert(db); + assert( str::startsWith(ns, db->name) ); + + for( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); /*++i*/ ) { + ClientCursor *cc = i->second; + + ++i; // we may be removing this node + + if( cc->_db != db ) + continue; + + if (isDB) { + // already checked that db matched above + dassert( str::startsWith(cc->_ns.c_str(), ns) ); + delete cc; //removes self from ccByID + } + else { + if ( str::equals(cc->_ns.c_str(), ns) ) + delete cc; //removes self from ccByID + } + } + + /* + note : we can't iterate byloc because clientcursors may exist with a loc of null in which case + they are not in the map. perhaps they should not exist though in the future? something to + change??? + + CCByLoc& bl = db->ccByLoc; + for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); ++i ) { + ClientCursor *cc = i->second; + if ( strncmp(ns, cc->ns.c_str(), len) == 0 ) { + assert( cc->_db == db ); + toDelete.push_back(i->second); + } + }*/ + + /*cout << "TEMP after invalidate " << endl; + for( auto i = clientCursorsById.begin(); i != clientCursorsById.end(); ++i ) { + cout << " " << i->second->ns << endl; + } + cout << "TEMP after invalidate done" << endl;*/ + } + } + + /* note called outside of locks (other than ccmutex) so care must be exercised */ + bool ClientCursor::shouldTimeout( unsigned millis ) { + _idleAgeMillis += millis; + return _idleAgeMillis > 600000 && _pinValue == 0; + } + + /* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */ + void ClientCursor::idleTimeReport(unsigned millis) { + bool foundSomeToTimeout = false; + + // two passes so that we don't need to readlock unless we really do some timeouts + // we assume here that incrementing _idleAgeMillis outside readlock is ok. + { + recursive_scoped_lock lock(ccmutex); + { + unsigned sz = clientCursorsById.size(); + static time_t last; + if( sz >= 100000 ) { + if( time(0) - last > 300 ) { + last = time(0); + log() << "warning number of open cursors is very large: " << sz << endl; + } + } + } + for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) { + CCById::iterator j = i; + i++; + if( j->second->shouldTimeout( millis ) ) { + foundSomeToTimeout = true; + break; + } + } + } + + if( foundSomeToTimeout ) { + // todo: ideally all readlocks automatically note what we are locking for so this + // can be reported in currentop command. e.g. something like: + // readlock lk("", "timeout cursors"); + readlock lk(""); + recursive_scoped_lock lock(ccmutex); + for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) { + CCById::iterator j = i; + i++; + if( j->second->shouldTimeout(0) ) { + numberTimedOut++; + LOG(1) << "killing old cursor " << j->second->_cursorid << ' ' << j->second->_ns + << " idle:" << j->second->idleTime() << "ms\n"; + delete j->second; + } + } + } + } + + /* must call when a btree bucket going away. 
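       (Aside on invalidate() above: the whole-db case is selected purely by the
       trailing-dot convention on the ns argument. That check, restated as a
       standalone sketch:

           #include <cstring>

           // "test."    -> invalidate every cursor in db "test"
           // "test.foo" -> invalidate only cursors on collection "test.foo"
           bool isWholeDbNs(const char* ns) {
               size_t len = strlen(ns);
               const char* dot = strchr(ns, '.');
               return dot == &ns[len - 1];  // the only dot is the last char
           }
       )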
+       Note that this is potentially slow.
+    */
+    void ClientCursor::informAboutToDeleteBucket(const DiskLoc& b) {
+        recursive_scoped_lock lock(ccmutex);
+        Database *db = cc().database();
+        CCByLoc& bl = db->ccByLoc;
+        RARELY if ( bl.size() > 70 ) {
+            log() << "perf warning: byLoc.size=" << bl.size() << " in aboutToDeleteBucket\n";
+        }
+        if( bl.size() == 0 ) {
+            DEV tlog() << "debug warning: no cursors found in informAboutToDeleteBucket()" << endl;
+        }
+        for ( CCByLoc::iterator i = bl.begin(); i != bl.end(); i++ )
+            i->second->_c->aboutToDeleteBucket(b);
+    }
+    void aboutToDeleteBucket(const DiskLoc& b) {
+        ClientCursor::informAboutToDeleteBucket(b);
+    }
+
+    /* must call this on a delete so we clean up the cursors. */
+    void ClientCursor::aboutToDelete(const DiskLoc& dl) {
+        recursive_scoped_lock lock(ccmutex);
+
+        Database *db = cc().database();
+        assert(db);
+
+        aboutToDeleteForSharding( db , dl );
+
+        CCByLoc& bl = db->ccByLoc;
+        CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl));
+        CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl));
+        if ( j == stop )
+            return;
+
+        vector<ClientCursor*> toAdvance;
+
+        while ( 1 ) {
+            toAdvance.push_back(j->second);
+            DEV assert( j->first.loc == dl );
+            ++j;
+            if ( j == stop )
+                break;
+        }
+
+        if( toAdvance.size() >= 3000 ) {
+            log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc "
+                  << dl.toString()
+                  << ' ' << toAdvance[1000]->_ns
+                  << ' ' << toAdvance[2000]->_ns
+                  << ' ' << toAdvance[1000]->_pinValue
+                  << ' ' << toAdvance[2000]->_pinValue
+                  << ' ' << toAdvance[1000]->_pos
+                  << ' ' << toAdvance[2000]->_pos
+                  << ' ' << toAdvance[1000]->_idleAgeMillis
+                  << ' ' << toAdvance[2000]->_idleAgeMillis
+                  << ' ' << toAdvance[1000]->_doingDeletes
+                  << ' ' << toAdvance[2000]->_doingDeletes
+                  << endl;
+            //wassert( toAdvance.size() < 5000 );
+        }
+
+        for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) {
+            ClientCursor* cc = *i;
+            wassert(cc->_db == db);
+
+            if ( cc->_doingDeletes ) continue;
+
+            Cursor *c = cc->_c.get();
+            if ( c->capped() ) {
+                /* note we cannot advance here. if this condition occurs, writes to the oplog
+                   have "caught" the reader. skipping ahead, the reader would miss potentially
+                   important data.
+                */
+                delete cc;
+                continue;
+            }
+
+            c->checkLocation();
+            DiskLoc tmp1 = c->refLoc();
+            if ( tmp1 != dl ) {
+                // This might indicate a failure to call ClientCursor::updateLocation() but it can
+                // also happen during correct operation, see SERVER-2009.
+                problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
+            }
+            else {
+                c->advance();
+            }
+            while (!c->eof() && c->refLoc() == dl) {
+                /* We don't delete at EOF because we want to return "no more results" rather than "no such cursor".
+                 * The loop is to handle MultiKey indexes where the deleted record is pointed to by multiple adjacent keys.
+                 * In that case we need to advance until we get to the next distinct record or EOF.
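       (Aside: the lower_bound/upper_bound pair above collects every cursor
       parked on one DiskLoc because ByLocKey sorts by location first and
       cursor id second, so those cursors form one contiguous run in the map.
       The idiom in isolation, with bl and dl as in the function above:

           CCByLoc::iterator it  = bl.lower_bound( ByLocKey::min(dl) );  // (dl, smallest id)
           CCByLoc::iterator end = bl.upper_bound( ByLocKey::max(dl) );  // (dl, largest id)
           for ( ; it != end; ++it )
               ;  // it->second is a ClientCursor currently positioned at dl
       )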
+ * SERVER-4154 + */ + c->advance(); + } + cc->updateLocation(); + } + } + void aboutToDelete(const DiskLoc& dl) { ClientCursor::aboutToDelete(dl); } + + ClientCursor::ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query ) : + _ns(ns), _db( cc().database() ), + _c(c), _pos(0), + _query(query), _queryOptions(queryOptions), + _idleAgeMillis(0), _pinValue(0), + _doingDeletes(false), _yieldSometimesTracker(128,10) { + + d.dbMutex.assertAtLeastReadLocked(); + + assert( _db ); + assert( str::startsWith(_ns, _db->name) ); + if( queryOptions & QueryOption_NoCursorTimeout ) + noTimeout(); + recursive_scoped_lock lock(ccmutex); + _cursorid = allocCursorId_inlock(); + clientCursorsById.insert( make_pair(_cursorid, this) ); + + if ( ! _c->modifiedKeys() ) { + // store index information so we can decide if we can + // get something out of the index key rather than full object + + int x = 0; + BSONObjIterator i( _c->indexKeyPattern() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( e.isNumber() ) { + // only want basic index fields, not "2d" etc + _indexedFields[e.fieldName()] = x; + } + x++; + } + } + + } + + + ClientCursor::~ClientCursor() { + if( _pos == -2 ) { + // defensive: destructor called twice + wassert(false); + return; + } + + { + recursive_scoped_lock lock(ccmutex); + setLastLoc_inlock( DiskLoc() ); // removes us from bylocation multimap + clientCursorsById.erase(_cursorid); + + // defensive: + (CursorId&)_cursorid = -1; + _pos = -2; + } + } + + bool ClientCursor::getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder ) { + + map<string,int>::const_iterator i = _indexedFields.find( name ); + if ( i == _indexedFields.end() ) { + current().getFieldsDotted( name , ret ); + return false; + } + + int x = i->second; + + holder = currKey(); + BSONObjIterator it( holder ); + while ( x && it.more() ) { + it.next(); + x--; + } + assert( x == 0 ); + ret.insert( it.next() ); + return true; + } + + BSONElement ClientCursor::getFieldDotted( const string& name , BSONObj& holder , bool * fromKey ) { + + map<string,int>::const_iterator i = _indexedFields.find( name ); + if ( i == _indexedFields.end() ) { + if ( fromKey ) + *fromKey = false; + holder = current(); + return holder.getFieldDotted( name ); + } + + int x = i->second; + + holder = currKey(); + BSONObjIterator it( holder ); + while ( x && it.more() ) { + it.next(); + x--; + } + assert( x == 0 ); + + if ( fromKey ) + *fromKey = true; + return it.next(); + } + + BSONObj ClientCursor::extractFields(const BSONObj &pattern , bool fillWithNull ) { + BSONObjBuilder b( pattern.objsize() * 2 ); + + BSONObj holder; + + BSONObjIterator i( pattern ); + while ( i.more() ) { + BSONElement key = i.next(); + BSONElement value = getFieldDotted( key.fieldName() , holder ); + + if ( value.type() ) { + b.appendAs( value , key.fieldName() ); + continue; + } + + if ( fillWithNull ) + b.appendNull( key.fieldName() ); + + } + + return b.obj(); + } + + + /* call when cursor's location changes so that we can update the + cursorsbylocation map. if you are locked and internally iterating, only + need to call when you are ready to "unlock". 
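       (Aside on the covered-field machinery set up in the constructor above:
       for an index key pattern { a : 1, b : 1 }, _indexedFields becomes
       { "a" -> 0, "b" -> 1 }, and a lookup walks the key instead of touching
       the full record. A small worked sketch with hypothetical values:

           BSONObj key = BSON( "" << 7 << "" << 9 );  // currKey() for doc { a:7, b:9 }
           BSONObjIterator it( key );
           it.next();                    // skip offset 0, the "a" entry
           BSONElement b = it.next();    // 9 -- the "b" value, straight from the key
       )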
+ */ + void ClientCursor::updateLocation() { + assert( _cursorid ); + _idleAgeMillis = 0; + DiskLoc cl = _c->refLoc(); + if ( lastLoc() == cl ) { + //log() << "info: lastloc==curloc " << ns << '\n'; + } + else { + recursive_scoped_lock lock(ccmutex); + setLastLoc_inlock(cl); + } + // may be necessary for MultiCursor even when cl hasn't changed + _c->noteLocation(); + } + + int ClientCursor::suggestYieldMicros() { + int writers = 0; + int readers = 0; + + int micros = Client::recommendedYieldMicros( &writers , &readers ); + + if ( micros > 0 && writers == 0 && d.dbMutex.getState() <= 0 ) { + // we have a read lock, and only reads are coming on, so why bother unlocking + return 0; + } + + wassert( micros < 10000000 ); + dassert( micros < 1000001 ); + return micros; + } + + Record* ClientCursor::_recordForYield( ClientCursor::RecordNeeds need ) { + if ( need == DontNeed ) { + return 0; + } + else if ( need == MaybeCovered ) { + // TODO + return 0; + } + else if ( need == WillNeed ) { + // no-op + } + else { + warning() << "don't understand RecordNeeds: " << (int)need << endl; + return 0; + } + + DiskLoc l = currLoc(); + if ( l.isNull() ) + return 0; + + Record * rec = l.rec(); + if ( rec->likelyInPhysicalMemory() ) + return 0; + + return rec; + } + + bool ClientCursor::yieldSometimes( RecordNeeds need, bool *yielded ) { + if ( yielded ) { + *yielded = false; + } + if ( ! _yieldSometimesTracker.intervalHasElapsed() ) { + Record* rec = _recordForYield( need ); + if ( rec ) { + // yield for page fault + if ( yielded ) { + *yielded = true; + } + return yield( suggestYieldMicros() , rec ); + } + return true; + } + + int micros = suggestYieldMicros(); + if ( micros > 0 ) { + if ( yielded ) { + *yielded = true; + } + return yield( micros , _recordForYield( need ) ); + } + return true; + } + + void ClientCursor::staticYield( int micros , const StringData& ns , Record * rec ) { + killCurrentOp.checkForInterrupt( false ); + { + auto_ptr<LockMongoFilesShared> lk; + if ( rec ) { + // need to lock this else rec->touch won't be safe file could disappear + lk.reset( new LockMongoFilesShared() ); + } + + dbtempreleasecond unlock; + if ( unlock.unlocked() ) { + if ( micros == -1 ) + micros = Client::recommendedYieldMicros(); + if ( micros > 0 ) + sleepmicros( micros ); + } + else { + CurOp * c = cc().curop(); + while ( c->parent() ) + c = c->parent(); + LOGSOME << "warning ClientCursor::yield can't unlock b/c of recursive lock" + << " ns: " << ns + << " top: " << c->info() + << endl; + } + + if ( rec ) + rec->touch(); + + lk.reset(0); // need to release this before dbtempreleasecond + } + } + + bool ClientCursor::prepareToYield( YieldData &data ) { + if ( ! _c->supportYields() ) + return false; + if ( ! _c->prepareToYield() ) { + return false; + } + // need to store in case 'this' gets deleted + data._id = _cursorid; + + data._doingDeletes = _doingDeletes; + _doingDeletes = false; + + updateLocation(); + + { + /* a quick test that our temprelease is safe. + todo: make a YieldingCursor class + and then make the following code part of a unit test. 
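       (Aside: a hedged sketch of the caller contract for yieldSometimes()
       above -- the scan loop itself is hypothetical. A false return means the
       cursor, and possibly the whole database, went away during the yield:

           while ( cc->ok() ) {
               bool yielded = false;
               if ( !cc->yieldSometimes( ClientCursor::WillNeed, &yielded ) ) {
                   cc = 0;           // deleted out from under us -- do not touch
                   break;
               }
               if ( yielded && !cc->ok() )
                   break;            // state changed while we slept
               BSONObj obj = cc->current();
               // ... process obj ...
               cc->advance();
           }
       )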
+            */
+            const int test = 0;
+            static bool inEmpty = false;
+            if( test && !inEmpty ) {
+                inEmpty = true;
+                log() << "TEST: manipulate collection during cc:yield" << endl;
+                if( test == 1 )
+                    Helpers::emptyCollection(_ns.c_str());
+                else if( test == 2 ) {
+                    BSONObjBuilder b; string m;
+                    dropCollection(_ns.c_str(), m, b);
+                }
+                else {
+                    dropDatabase(_ns.c_str());
+                }
+            }
+        }
+        return true;
+    }
+
+    bool ClientCursor::recoverFromYield( const YieldData &data ) {
+        ClientCursor *cc = ClientCursor::find( data._id , false );
+        if ( cc == 0 ) {
+            // id was deleted
+            return false;
+        }
+
+        cc->_doingDeletes = data._doingDeletes;
+        cc->_c->recoverFromYield();
+        return true;
+    }
+
+    /** @return true if cursor is still ok */
+    bool ClientCursor::yield( int micros , Record * recordToLoad ) {
+
+        if ( ! _c->supportYields() ) // some cursors (geo@oct2011) don't support yielding
+            return true;
+
+        YieldData data;
+        prepareToYield( data );
+        staticYield( micros , _ns , recordToLoad );
+        return ClientCursor::recoverFromYield( data );
+    }
+
+    long long ctmLast = 0; // cached so we don't have to call find(), which is a little slow, very often
+    long long ClientCursor::allocCursorId_inlock() {
+        long long ctm = curTimeMillis64();
+        dassert( ctm );
+        long long x;
+        while ( 1 ) {
+            x = (((long long)rand()) << 32);
+            x = x ^ ctm;
+            if ( ctm != ctmLast || ClientCursor::find_inlock(x, false) == 0 )
+                break;
+        }
+        ctmLast = ctm;
+        return x;
+    }
+
+    void ClientCursor::storeOpForSlave( DiskLoc last ) {
+        if ( ! ( _queryOptions & QueryOption_OplogReplay ))
+            return;
+
+        if ( last.isNull() )
+            return;
+
+        BSONElement e = last.obj()["ts"];
+        if ( e.type() == Date || e.type() == Timestamp )
+            _slaveReadTill = e._opTime();
+    }
+
+    void ClientCursor::updateSlaveLocation( CurOp& curop ) {
+        if ( _slaveReadTill.isNull() )
+            return;
+        mongo::updateSlaveLocation( curop , _ns.c_str() , _slaveReadTill );
+    }
+
+    void ClientCursor::appendStats( BSONObjBuilder& result ) {
+        recursive_scoped_lock lock(ccmutex);
+        result.appendNumber("totalOpen", clientCursorsById.size() );
+        result.appendNumber("clientCursors_size", (int) numCursors());
+        result.appendNumber("timedOut" , numberTimedOut);
+        unsigned pinned = 0;
+        unsigned notimeout = 0;
+        for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); i++ ) {
+            unsigned p = i->second->_pinValue;
+            if( p >= 100 )
+                pinned++;
+            else if( p > 0 )
+                notimeout++;
+        }
+        if( pinned )
+            result.append("pinned", pinned);
+        if( notimeout )
+            result.append("totalNoTimeout", notimeout);
+    }
+
+    // QUESTION: Restrict to the namespace from which this command was issued?
+    // Alternatively, make this command admin-only?
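    // Aside: callers that need to do their own work while unlocked can use the
    // three-step protocol that yield() composes above. A hedged sketch, with
    // cc and ns supplied by the caller:
    //
    //     ClientCursor::YieldData data;
    //     if ( cc->prepareToYield( data ) ) {          // saves state, keyed by cursor id
    //         ClientCursor::staticYield( ClientCursor::suggestYieldMicros(), ns, 0 );
    //         if ( !ClientCursor::recoverFromYield( data ) )
    //             cc = 0;                              // deleted while unlocked
    //     }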
+ class CmdCursorInfo : public Command { + public: + CmdCursorInfo() : Command( "cursorInfo", true ) {} + virtual bool slaveOk() const { return true; } + virtual void help( stringstream& help ) const { + help << " example: { cursorInfo : 1 }"; + } + virtual LockType locktype() const { return NONE; } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + ClientCursor::appendStats( result ); + return true; + } + } cmdCursorInfo; + + struct Mem { + Mem() { res = virt = mapped = 0; } + int res; + int virt; + int mapped; + bool grew(const Mem& r) { + return (r.res && (((double)res)/r.res)>1.1 ) || + (r.virt && (((double)virt)/r.virt)>1.1 ) || + (r.mapped && (((double)mapped)/r.mapped)>1.1 ); + } + }; + + /** called once a minute from killcursors thread */ + void sayMemoryStatus() { + static time_t last; + static Mem mlast; + try { + ProcessInfo p; + if ( !cmdLine.quiet && p.supported() ) { + Mem m; + m.res = p.getResidentSize(); + m.virt = p.getVirtualMemorySize(); + m.mapped = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 )); + if( time(0)-last >= 300 || m.grew(mlast) ) { + log() << "mem (MB) res:" << m.res << " virt:" << m.virt << " mapped:" << m.mapped << endl; + if( m.virt - (cmdLine.dur?2:1)*m.mapped > 5000 ) { + ONCE log() << "warning virtual/mapped memory differential is large. journaling:" << cmdLine.dur << endl; + } + last = time(0); + mlast = m; + } + } + } + catch(...) { + log() << "ProcessInfo exception" << endl; + } + } + + /** thread for timing out old cursors */ + void ClientCursorMonitor::run() { + Client::initThread("clientcursormon"); + Client& client = cc(); + Timer t; + const int Secs = 4; + unsigned n = 0; + while ( ! inShutdown() ) { + ClientCursor::idleTimeReport( t.millisReset() ); + sleepsecs(Secs); + if( ++n % (60/4) == 0 /*once a minute*/ ) { + sayMemoryStatus(); + } + } + client.shutdown(); + } + + void ClientCursor::find( const string& ns , set<CursorId>& all ) { + recursive_scoped_lock lock(ccmutex); + + for ( CCById::iterator i=clientCursorsById.begin(); i!=clientCursorsById.end(); ++i ) { + if ( i->second->_ns == ns ) + all.insert( i->first ); + } + } + + int ClientCursor::erase(int n, long long *ids) { + int found = 0; + for ( int i = 0; i < n; i++ ) { + if ( erase(ids[i]) ) + found++; + + if ( inShutdown() ) + break; + } + return found; + + } + + ClientCursorMonitor clientCursorMonitor; + +} // namespace mongo diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h new file mode 100644 index 00000000000..e570820f62c --- /dev/null +++ b/src/mongo/db/clientcursor.h @@ -0,0 +1,430 @@ +/* clientcursor.h */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* Cursor -- and its derived classes -- are our internal cursors. + + ClientCursor is a wrapper that represents a cursorid from our database + application's perspective. 
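       (Aside: the cursorid round trip this header implements, as a hedged
       sketch -- the query/getMore plumbing around it is elided, and "cursor"
       and "ns" are assumed to come from the caller:

           ClientCursor* cc = new ClientCursor( 0, cursor, ns );  // registers itself by id
           long long id = cc->cursorid();                         // handed back to the client
           // ... later, a getMore request arrives carrying that id ...
           ClientCursor::Pointer pin( id );                       // pins it while in use
           ClientCursor* again = pin.c();                         // 0 if timed out or killed
       )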
+*/ + +#pragma once + +#include "../pch.h" +#include "cursor.h" +#include "jsobj.h" +#include "../util/net/message.h" +#include "../util/net/listen.h" +#include "../util/background.h" +#include "diskloc.h" +#include "dbhelpers.h" +#include "matcher.h" +#include "../client/dbclient.h" +#include "projection.h" +#include "s/d_chunk_manager.h" + +namespace mongo { + + typedef long long CursorId; /* passed to the client so it can send back on getMore */ + class Cursor; /* internal server cursor base class */ + class ClientCursor; + class ParsedQuery; + + struct ByLocKey { + + ByLocKey( const DiskLoc & l , const CursorId& i ) : loc(l), id(i) {} + + static ByLocKey min( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::min() ); } + static ByLocKey max( const DiskLoc& l ) { return ByLocKey( l , numeric_limits<long long>::max() ); } + + bool operator<( const ByLocKey &other ) const { + int x = loc.compare( other.loc ); + if ( x ) + return x < 0; + return id < other.id; + } + + DiskLoc loc; + CursorId id; + + }; + + /* todo: make this map be per connection. this will prevent cursor hijacking security attacks perhaps. + * ERH: 9/2010 this may not work since some drivers send getMore over a different connection + */ + typedef map<CursorId, ClientCursor*> CCById; + typedef map<ByLocKey, ClientCursor*> CCByLoc; + + extern BSONObj id_obj; + + class ClientCursor { + friend class CmdCursorInfo; + public: + static void assertNoCursors(); + + /* use this to assure we don't in the background time out cursor while it is under use. + if you are using noTimeout() already, there is no risk anyway. + Further, this mechanism guards against two getMore requests on the same cursor executing + at the same time - which might be bad. That should never happen, but if a client driver + had a bug, it could (or perhaps some sort of attack situation). + */ + class Pointer : boost::noncopyable { + ClientCursor *_c; + public: + ClientCursor * c() { return _c; } + void release() { + if( _c ) { + assert( _c->_pinValue >= 100 ); + _c->_pinValue -= 100; + _c = 0; + } + } + /** + * call this if during a yield, the cursor got deleted + * if so, we don't want to use the point address + */ + void deleted() { + _c = 0; + } + ~Pointer() { release(); } + Pointer(long long cursorid) { + recursive_scoped_lock lock(ccmutex); + _c = ClientCursor::find_inlock(cursorid, true); + if( _c ) { + if( _c->_pinValue >= 100 ) { + _c = 0; + uasserted(12051, "clientcursor already in use? driver problem?"); + } + _c->_pinValue += 100; + } + } + }; + + // This object assures safe and reliable cleanup of the ClientCursor. + // The implementation assumes that there will be no duplicate ids among cursors + // (which is assured if cursors must last longer than 1 second). 
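        // For instance (a hedged sketch; the ClientCursor construction and its
        // arguments are hypothetical):
        //
        //     ClientCursor::CleanupPointer ccPointer;
        //     ccPointer.reset( new ClientCursor( queryOptions, cursor, ns ) );
        //     // ... work that may throw: the destructor erases the cursor by id,
        //     // which stays safe even if someone else already deleted it ...
        //     ccPointer.reset();  // done early: erase now rather than at scope exit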
+ class CleanupPointer : boost::noncopyable { + public: + CleanupPointer() : _c( 0 ), _id( -1 ) {} + void reset( ClientCursor *c = 0 ) { + if ( c == _c ) + return; + if ( _c ) { + // be careful in case cursor was deleted by someone else + ClientCursor::erase( _id ); + } + if ( c ) { + _c = c; + _id = c->_cursorid; + } + else { + _c = 0; + _id = -1; + } + } + ~CleanupPointer() { + DESTRUCTOR_GUARD ( reset(); ); + } + operator bool() { return _c; } + ClientCursor * operator-> () { return _c; } + private: + ClientCursor *_c; + CursorId _id; + }; + + ClientCursor(int queryOptions, const shared_ptr<Cursor>& c, const string& ns, BSONObj query = BSONObj() ); + + ~ClientCursor(); + + // *************** basic accessors ******************* + + CursorId cursorid() const { return _cursorid; } + string ns() const { return _ns; } + Database * db() const { return _db; } + const BSONObj& query() const { return _query; } + int queryOptions() const { return _queryOptions; } + + DiskLoc lastLoc() const { return _lastLoc; } + + /* Get rid of cursors for namespaces 'ns'. When dropping a db, ns is "dbname." + Used by drop, dropIndexes, dropDatabase. + */ + static void invalidate(const char *ns); + + /** + * @param microsToSleep -1 : ask client + * >=0 : sleep for that amount + * @param recordToLoad after yielding lock, load this record with only mmutex + * do a dbtemprelease + * note: caller should check matcher.docMatcher().atomic() first and not yield if atomic - + * we don't do herein as this->matcher (above) is only initialized for true queries/getmore. + * (ie not set for remote/update) + * @return if the cursor is still valid. + * if false is returned, then this ClientCursor should be considered deleted - + * in fact, the whole database could be gone. + */ + bool yield( int microsToSleep = -1 , Record * recordToLoad = 0 ); + + enum RecordNeeds { + DontNeed = -1 , MaybeCovered = 0 , WillNeed = 100 + }; + + /** + * @param needRecord whether or not the next record has to be read from disk for sure + * if this is true, will yield of next record isn't in memory + * @param yielded true if a yield occurred, and potentially if a yield did not occur + * @return same as yield() + */ + bool yieldSometimes( RecordNeeds need, bool *yielded = 0 ); + + static int suggestYieldMicros(); + static void staticYield( int micros , const StringData& ns , Record * rec ); + + struct YieldData { CursorId _id; bool _doingDeletes; }; + bool prepareToYield( YieldData &data ); + static bool recoverFromYield( const YieldData &data ); + + struct YieldLock : boost::noncopyable { + explicit YieldLock( ptr<ClientCursor> cc ) + : _canYield(cc->_c->supportYields()) { + if ( _canYield ) { + cc->prepareToYield( _data ); + _unlock.reset(new dbtempreleasecond()); + } + } + ~YieldLock() { + if ( _unlock ) { + log( LL_WARNING ) << "ClientCursor::YieldLock not closed properly" << endl; + relock(); + } + } + bool stillOk() { + if ( ! 
_canYield ) + return true; + relock(); + return ClientCursor::recoverFromYield( _data ); + } + void relock() { + _unlock.reset(); + } + private: + const bool _canYield; + YieldData _data; + scoped_ptr<dbtempreleasecond> _unlock; + }; + + // --- some pass through helpers for Cursor --- + + Cursor* c() const { return _c.get(); } + int pos() const { return _pos; } + + void incPos( int n ) { _pos += n; } // TODO: this is bad + void setPos( int n ) { _pos = n; } // TODO : this is bad too + + BSONObj indexKeyPattern() { return _c->indexKeyPattern(); } + bool modifiedKeys() const { return _c->modifiedKeys(); } + bool isMultiKey() const { return _c->isMultiKey(); } + + bool ok() { return _c->ok(); } + bool advance() { return _c->advance(); } + BSONObj current() { return _c->current(); } + DiskLoc currLoc() { return _c->currLoc(); } + BSONObj currKey() const { return _c->currKey(); } + + /** + * same as BSONObj::getFieldsDotted + * if it can be retrieved from key, it is + * @param holder keeps the currKey in scope by keeping a reference to it here. generally you'll want + * holder and ret to destruct about the same time. + * @return if this was retrieved from key + */ + bool getFieldsDotted( const string& name, BSONElementSet &ret, BSONObj& holder ); + + /** + * same as BSONObj::getFieldDotted + * if it can be retrieved from key, it is + * @return if this was retrieved from key + */ + BSONElement getFieldDotted( const string& name , BSONObj& holder , bool * fromKey = 0 ) ; + + /** extract items from object which match a pattern object. + * e.g., if pattern is { x : 1, y : 1 }, builds an object with + * x and y elements of this object, if they are present. + * returns elements with original field names + * NOTE: copied from BSONObj::extractFields + */ + BSONObj extractFields(const BSONObj &pattern , bool fillWithNull = false) ; + + bool currentIsDup() { return _c->getsetdup( _c->currLoc() ); } + + bool currentMatches() { + if ( ! _c->matcher() ) + return true; + return _c->matcher()->matchesCurrent( _c.get() ); + } + + void setChunkManager( ShardChunkManagerPtr manager ){ _chunkManager = manager; } + ShardChunkManagerPtr getChunkManager(){ return _chunkManager; } + + private: + void setLastLoc_inlock(DiskLoc); + + static ClientCursor* find_inlock(CursorId id, bool warn = true) { + CCById::iterator it = clientCursorsById.find(id); + if ( it == clientCursorsById.end() ) { + if ( warn ) + OCCASIONALLY out() << "ClientCursor::find(): cursor not found in map " << id << " (ok after a drop)\n"; + return 0; + } + return it->second; + } + public: + static ClientCursor* find(CursorId id, bool warn = true) { + recursive_scoped_lock lock(ccmutex); + ClientCursor *c = find_inlock(id, warn); + // if this asserts, your code was not thread safe - you either need to set no timeout + // for the cursor or keep a ClientCursor::Pointer in scope for it. + massert( 12521, "internal error: use of an unlocked ClientCursor", c == 0 || c->_pinValue ); + return c; + } + + static bool erase(CursorId id) { + recursive_scoped_lock lock(ccmutex); + ClientCursor *cc = find_inlock(id); + if ( cc ) { + assert( cc->_pinValue < 100 ); // you can't still have an active ClientCursor::Pointer + delete cc; + return true; + } + return false; + } + + /** + * @return number of cursors found + */ + static int erase( int n , long long * ids ); + + /* call when cursor's location changes so that we can update the + cursorsbylocation map. if you are locked and internally iterating, only + need to call when you are ready to "unlock". 
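       (Aside: a worked sketch for extractFields() above, assuming the current
       document is { a:1, b:2, c:3 }:

           BSONObj pattern = BSON( "a" << 1 << "c" << 1 << "d" << 1 );
           BSONObj out    = cc->extractFields( pattern );        // { a: 1, c: 3 }
           BSONObj padded = cc->extractFields( pattern, true );  // { a: 1, c: 3, d: null }
       )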
+ */ + void updateLocation(); + + void mayUpgradeStorage() { + /* if ( !ids_.get() ) + return; + stringstream ss; + ss << ns << "." << cursorid; + ids_->mayUpgradeStorage( ss.str() );*/ + } + + /** + * @param millis amount of idle passed time since last call + */ + bool shouldTimeout( unsigned millis ); + + void storeOpForSlave( DiskLoc last ); + void updateSlaveLocation( CurOp& curop ); + + unsigned idleTime() const { return _idleAgeMillis; } + + void setDoingDeletes( bool doingDeletes ) {_doingDeletes = doingDeletes; } + + void slaveReadTill( const OpTime& t ) { _slaveReadTill = t; } + + public: // static methods + + static void idleTimeReport(unsigned millis); + + static void appendStats( BSONObjBuilder& result ); + static unsigned numCursors() { return clientCursorsById.size(); } + static void informAboutToDeleteBucket(const DiskLoc& b); + static void aboutToDelete(const DiskLoc& dl); + static void find( const string& ns , set<CursorId>& all ); + + + private: // methods + + // cursors normally timeout after an inactivy period to prevent excess memory use + // setting this prevents timeout of the cursor in question. + void noTimeout() { _pinValue++; } + + CCByLoc& byLoc() { return _db->ccByLoc; } + + Record* _recordForYield( RecordNeeds need ); + + private: + + CursorId _cursorid; + + const string _ns; + Database * _db; + + const shared_ptr<Cursor> _c; + map<string,int> _indexedFields; // map from indexed field to offset in key object + int _pos; // # objects into the cursor so far + + const BSONObj _query; // used for logging diags only; optional in constructor + int _queryOptions; // see enum QueryOptions dbclient.h + + OpTime _slaveReadTill; + + DiskLoc _lastLoc; // use getter and setter not this (important) + unsigned _idleAgeMillis; // how long has the cursor been around, relative to server idle time + + /* 0 = normal + 1 = no timeout allowed + 100 = in use (pinned) -- see Pointer class + */ + unsigned _pinValue; + + bool _doingDeletes; // when true we are the delete and aboutToDelete shouldn't manipulate us + ElapsedTracker _yieldSometimesTracker; + + ShardChunkManagerPtr _chunkManager; + + public: + shared_ptr<ParsedQuery> pq; + shared_ptr<Projection> fields; // which fields query wants returned + Message originalMessage; // this is effectively an auto ptr for data the matcher points to + + + + private: // static members + + static CCById clientCursorsById; + static long long numberTimedOut; + static boost::recursive_mutex& ccmutex; // must use this for all statics above! + static CursorId allocCursorId_inlock(); + + }; + + class ClientCursorMonitor : public BackgroundJob { + public: + string name() const { return "ClientCursorMonitor"; } + void run(); + }; + +} // namespace mongo + +// ClientCursor should only be used with auto_ptr because it needs to be +// release()ed after a yield if stillOk() returns false and these pointer types +// do not support releasing. This will prevent them from being used accidentally +namespace boost{ + template<> class scoped_ptr<mongo::ClientCursor> {}; + template<> class shared_ptr<mongo::ClientCursor> {}; +} diff --git a/src/mongo/db/cloner.cpp b/src/mongo/db/cloner.cpp new file mode 100644 index 00000000000..e35ae95052d --- /dev/null +++ b/src/mongo/db/cloner.cpp @@ -0,0 +1,763 @@ +// cloner.cpp - copy a database (export/import basically) + +/** +* Copyright (C) 2008 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "cloner.h" +#include "pdfile.h" +#include "../client/dbclient.h" +#include "../bson/util/builder.h" +#include "jsobj.h" +#include "ops/query.h" +#include "commands.h" +#include "db.h" +#include "instance.h" +#include "repl.h" + +namespace mongo { + + BSONElement getErrField(const BSONObj& o); + + void ensureHaveIdIndex(const char *ns); + + bool replAuthenticate(DBClientBase *); + + /** Selectively release the mutex based on a parameter. */ + class dbtempreleaseif { + public: + dbtempreleaseif( bool release ) : _impl( release ? new dbtemprelease() : 0 ) {} + private: + shared_ptr< dbtemprelease > _impl; + }; + + void mayInterrupt( bool mayBeInterrupted ) { + if ( mayBeInterrupted ) { + killCurrentOp.checkForInterrupt( false ); + } + } + + class Cloner: boost::noncopyable { + auto_ptr< DBClientWithCommands > conn; + void copy(const char *from_ns, const char *to_ns, bool isindex, bool logForRepl, + bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query q = Query()); + struct Fun; + public: + Cloner() { } + + /* slaveOk - if true it is ok if the source of the data is !ismaster. + useReplAuth - use the credentials we normally use as a replication slave for the cloning + snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower. + for example repairDatabase need not use it. + */ + void setConnection( DBClientWithCommands *c ) { conn.reset( c ); } + + /** copy the entire database */ + bool go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode = 0); + + bool copyCollection( const string& ns , const BSONObj& query , string& errmsg , bool mayYield, bool mayBeInterrupted, bool copyIndexes = true, bool logForRepl = true ); + }; + + /* for index info object: + { "name" : "name_1" , "ns" : "foo.index3" , "key" : { "name" : 1.0 } } + we need to fix up the value in the "ns" parameter so that the name prefix is correct on a + copy to a new name. 
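       (Concretely, a hedged illustration -- "bar" stands for the destination
       database name that fixindex() reads from cc().database()->name:

           BSONObj in  = fromjson( "{ name:'name_1', ns:'foo.index3', key:{ name:1 }, v:0 }" );
           BSONObj out = fixindex( in );
           // out: { name: 'name_1', ns: 'bar.index3', key: { name: 1 } }
           //  - "ns" is rebuilt as the destination db plus everything from the first '.'
           //  - "v" is dropped so v:0 indexes are rebuilt as v:1 on the destination
       )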
+ */ + BSONObj fixindex(BSONObj o) { + BSONObjBuilder b; + BSONObjIterator i(o); + while ( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + + // for now, skip the "v" field so that v:0 indexes will be upgraded to v:1 + if ( string("v") == e.fieldName() ) { + continue; + } + + if ( string("ns") == e.fieldName() ) { + uassert( 10024 , "bad ns field for index during dbcopy", e.type() == String); + const char *p = strchr(e.valuestr(), '.'); + uassert( 10025 , "bad ns field for index during dbcopy [2]", p); + string newname = cc().database()->name + p; + b.append("ns", newname); + } + else + b.append(e); + } + BSONObj res= b.obj(); + + /* if( mod ) { + out() << "before: " << o.toString() << endl; + o.dump(); + out() << "after: " << res.toString() << endl; + res.dump(); + }*/ + + return res; + } + + struct Cloner::Fun { + Fun() : lastLog(0) { } + time_t lastLog; + void operator()( DBClientCursorBatchIterator &i ) { + mongolock l( true ); + if ( context ) { + context->relocked(); + } + + while( i.moreInCurrentBatch() ) { + if ( n % 128 == 127 /*yield some*/ ) { + time_t now = time(0); + if( now - lastLog >= 60 ) { + // report progress + if( lastLog ) + log() << "clone " << to_collection << ' ' << n << endl; + lastLog = now; + } + mayInterrupt( _mayBeInterrupted ); + dbtempreleaseif t( _mayYield ); + } + + BSONObj tmp = i.nextSafe(); + + /* assure object is valid. note this will slow us down a little. */ + if ( !tmp.valid() ) { + stringstream ss; + ss << "Cloner: skipping corrupt object from " << from_collection; + BSONElement e = tmp.firstElement(); + try { + e.validate(); + ss << " firstElement: " << e; + } + catch( ... ) { + ss << " firstElement corrupt"; + } + out() << ss.str() << endl; + continue; + } + + ++n; + + BSONObj js = tmp; + if ( isindex ) { + assert( strstr(from_collection, "system.indexes") ); + js = fixindex(tmp); + storedForLater->push_back( js.getOwned() ); + continue; + } + + try { + theDataFileMgr.insertWithObjMod(to_collection, js); + if ( logForRepl ) + logOp("i", to_collection, js); + + getDur().commitIfNeeded(); + } + catch( UserException& e ) { + log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n'; + } + + RARELY if ( time( 0 ) - saveLast > 60 ) { + log() << n << " objects cloned so far from collection " << from_collection << endl; + saveLast = time( 0 ); + } + } + } + int n; + bool isindex; + const char *from_collection; + const char *to_collection; + time_t saveLast; + list<BSONObj> *storedForLater; + bool logForRepl; + Client::Context *context; + bool _mayYield; + bool _mayBeInterrupted; + }; + + /* copy the specified collection + isindex - if true, this is system.indexes collection, in which we do some transformation when copying. + */ + void Cloner::copy(const char *from_collection, const char *to_collection, bool isindex, bool logForRepl, bool masterSameProcess, bool slaveOk, bool mayYield, bool mayBeInterrupted, Query query) { + list<BSONObj> storedForLater; + + Fun f; + f.n = 0; + f.isindex = isindex; + f.from_collection = from_collection; + f.to_collection = to_collection; + f.saveLast = time( 0 ); + f.storedForLater = &storedForLater; + f.logForRepl = logForRepl; + f._mayYield = mayYield; + f._mayBeInterrupted = mayBeInterrupted; + + int options = QueryOption_NoCursorTimeout | ( slaveOk ? 
QueryOption_SlaveOk : 0 ); + { + f.context = cc().getContext(); + mayInterrupt( mayBeInterrupted ); + dbtempreleaseif r( mayYield ); + DBClientConnection *remote = dynamic_cast< DBClientConnection* >( conn.get() ); + if ( remote ) { + remote->query( boost::function<void(DBClientCursorBatchIterator &)>( f ), from_collection, query, 0, options ); + } + else { + // there is no exhaust mode for direct client, so we have this hack + auto_ptr<DBClientCursor> c = conn->query( from_collection, query, 0, 0, 0, options ); + assert( c.get() ); + while( c->more() ) { + DBClientCursorBatchIterator i( *c ); + f( i ); + } + } + } + + if ( storedForLater.size() ) { + for ( list<BSONObj>::iterator i = storedForLater.begin(); i!=storedForLater.end(); i++ ) { + BSONObj js = *i; + try { + theDataFileMgr.insertWithObjMod(to_collection, js); + if ( logForRepl ) + logOp("i", to_collection, js); + + getDur().commitIfNeeded(); + } + catch( UserException& e ) { + log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n'; + } + } + } + } + + bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg) { + Cloner c; + + DBClientConnection *conn = new DBClientConnection(); + // cloner owns conn in auto_ptr + c.setConnection(conn); + uassert(15908, errmsg, conn->connect(host, errmsg) && replAuthenticate(conn)); + + return c.copyCollection(ns, BSONObj(), errmsg, true, false, /*copyIndexes*/ true, false); + } + + bool Cloner::copyCollection( const string& ns, const BSONObj& query, string& errmsg, + bool mayYield, bool mayBeInterrupted, bool copyIndexes, bool logForRepl ) { + + writelock lk(ns); // TODO: make this lower down + Client::Context ctx(ns); + + { + // config + string temp = ctx.db()->name + ".system.namespaces"; + BSONObj config = conn->findOne( temp , BSON( "name" << ns ) ); + if ( config["options"].isABSONObj() ) + if ( ! userCreateNS( ns.c_str() , config["options"].Obj() , errmsg, logForRepl , 0 ) ) + return false; + } + + { + // main data + copy( ns.c_str() , ns.c_str() , /*isindex*/false , logForRepl , false , true , mayYield, mayBeInterrupted, Query(query).snapshot() ); + } + + /* TODO : copyIndexes bool does not seem to be implemented! */ + if( !copyIndexes ) { + log() << "ERROR copy collection copyIndexes not implemented? 
" << ns << endl; + } + + { + // indexes + string temp = ctx.db()->name + ".system.indexes"; + copy( temp.c_str() , temp.c_str() , /*isindex*/true , logForRepl , false , true , mayYield, mayBeInterrupted, BSON( "ns" << ns ) ); + } + getDur().commitIfNeeded(); + return true; + } + + extern bool inDBRepair; + void ensureIdIndexForNewNs(const char *ns); + + bool Cloner::go(const char *masterHost, string& errmsg, const string& fromdb, bool logForRepl, bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) { + if ( errCode ) { + *errCode = 0; + } + massert( 10289 , "useReplAuth is not written to replication log", !useReplAuth || !logForRepl ); + + string todb = cc().database()->name; + stringstream a,b; + a << "localhost:" << cmdLine.port; + b << "127.0.0.1:" << cmdLine.port; + bool masterSameProcess = ( a.str() == masterHost || b.str() == masterHost ); + if ( masterSameProcess ) { + if ( fromdb == todb && cc().database()->path == dbpath ) { + // guard against an "infinite" loop + /* if you are replicating, the local.sources config may be wrong if you get this */ + errmsg = "can't clone from self (localhost)."; + return false; + } + } + /* todo: we can put these releases inside dbclient or a dbclient specialization. + or just wait until we get rid of global lock anyway. + */ + string ns = fromdb + ".system.namespaces"; + list<BSONObj> toClone; + { + mayInterrupt( mayBeInterrupted ); + dbtempreleaseif r( mayYield ); + + // just using exhaust for collection copying right now + auto_ptr<DBClientCursor> c; + { + if ( conn.get() ) { + // nothing to do + } + else if ( !masterSameProcess ) { + ConnectionString cs = ConnectionString::parse( masterHost, errmsg ); + auto_ptr<DBClientBase> con( cs.connect( errmsg )); + if ( !con.get() ) + return false; + if( !replAuthenticate(con.get()) ) + return false; + + conn = con; + } + else { + conn.reset( new DBDirectClient() ); + } + // todo: if snapshot (bool param to this func) is true, we need to snapshot this query? + // only would be relevant if a thousands of collections -- maybe even then it is hard + // to exceed a single cursor batch. + // for repl it is probably ok as we apply oplog section after the clone (i.e. repl + // doesnt not use snapshot=true). + c = conn->query( ns.c_str(), BSONObj(), 0, 0, 0, slaveOk ? QueryOption_SlaveOk : 0 ); + } + + if ( c.get() == 0 ) { + errmsg = "query failed " + ns; + return false; + } + + if ( c->more() ) { + BSONObj first = c->next(); + if( !getErrField(first).eoo() ) { + if ( errCode ) { + *errCode = first.getIntField("code"); + } + errmsg = "query failed " + ns; + return false; + } + c->putBack( first ); + } + + while ( c->more() ) { + BSONObj collection = c->next(); + + log(2) << "\t cloner got " << collection << endl; + + BSONElement e = collection.getField("name"); + if ( e.eoo() ) { + string s = "bad system.namespaces object " + collection.toString(); + massert( 10290 , s.c_str(), false); + } + assert( !e.eoo() ); + assert( e.type() == String ); + const char *from_name = e.valuestr(); + + if( strstr(from_name, ".system.") ) { + /* system.users and s.js is cloned -- but nothing else from system. + * system.indexes is handled specially at the end*/ + if( legalClientSystemNS( from_name , true ) == 0 ) { + log(2) << "\t\t not cloning because system collection" << endl; + continue; + } + } + if( ! 
NamespaceString::normal( from_name ) ) { + log(2) << "\t\t not cloning because has $ " << endl; + continue; + } + toClone.push_back( collection.getOwned() ); + } + } + + for ( list<BSONObj>::iterator i=toClone.begin(); i != toClone.end(); i++ ) { + { + mayInterrupt( mayBeInterrupted ); + dbtempreleaseif r( mayYield ); + } + BSONObj collection = *i; + log(2) << " really will clone: " << collection << endl; + const char * from_name = collection["name"].valuestr(); + BSONObj options = collection.getObjectField("options"); + + /* change name "<fromdb>.collection" -> <todb>.collection */ + const char *p = strchr(from_name, '.'); + assert(p); + string to_name = todb + p; + + bool wantIdIndex = false; + { + string err; + const char *toname = to_name.c_str(); + /* we defer building id index for performance - building it in batch is much faster */ + userCreateNS(toname, options, err, logForRepl, &wantIdIndex); + } + log(1) << "\t\t cloning " << from_name << " -> " << to_name << endl; + Query q; + if( snapshot ) + q.snapshot(); + copy(from_name, to_name.c_str(), false, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, q); + + if( wantIdIndex ) { + /* we need dropDups to be true as we didn't do a true snapshot and this is before applying oplog operations + that occur during the initial sync. inDBRepair makes dropDups be true. + */ + bool old = inDBRepair; + try { + inDBRepair = true; + ensureIdIndexForNewNs(to_name.c_str()); + inDBRepair = old; + } + catch(...) { + inDBRepair = old; + throw; + } + } + } + + // now build the indexes + + string system_indexes_from = fromdb + ".system.indexes"; + string system_indexes_to = todb + ".system.indexes"; + /* [dm]: is the ID index sometimes not called "_id_"? There is other code in the system that looks for a "_id" prefix + rather than this exact value. we should standardize. OR, remove names - which is in the bugdb. Anyway, this + is dubious here at the moment. + */ + // won't need a snapshot of the query of system.indexes as there can never be very many. + copy(system_indexes_from.c_str(), system_indexes_to.c_str(), true, logForRepl, masterSameProcess, slaveOk, mayYield, mayBeInterrupted, BSON( "name" << NE << "_id_" ) ); + + return true; + } + + bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, + bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, bool mayBeInterrupted, int *errCode) { + Cloner c; + return c.go(masterHost, errmsg, fromdb, logForReplication, slaveOk, useReplAuth, snapshot, mayYield, mayBeInterrupted, errCode); + } + + /* Usage: + mydb.$cmd.findOne( { clone: "fromhost" } ); + */ + class CmdClone : public Command { + public: + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream &help ) const { + help << "clone this database from an instance of the db on another host\n"; + help << "{ clone : \"host13\" }"; + } + CmdClone() : Command("clone") { } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string from = cmdObj.getStringField("clone"); + if ( from.empty() ) + return false; + /* replication note: we must logOp() not the command, but the cloned data -- if the slave + were to clone it would get a different point-in-time and not match. 
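       (Aside: the per-collection namespace rewrite that Cloner::go performs
       above is just a prefix swap at the first '.'. A standalone sketch:

           const char* from_name = "olddb.users";
           const char* p = strchr( from_name, '.' );  // -> ".users"
           string to_name = string( "newdb" ) + p;    // "newdb.users"
       )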
+ */ + return cloneFrom(from.c_str(), errmsg, dbname, + /*logForReplication=*/!fromRepl, /*slaveOk*/false, /*usereplauth*/false, /*snapshot*/true, /*mayYield*/true, /*mayBeInterrupted*/false); + } + } cmdclone; + + class CmdCloneCollection : public Command { + public: + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return NONE; } + CmdCloneCollection() : Command("cloneCollection") { } + virtual void help( stringstream &help ) const { + help << "{ cloneCollection: <namespace>, from: <host> [,query: <query_filter>] [,copyIndexes:<bool>] }" + "\nCopies a collection from one server to another. Do not use on a single server as the destination " + "is placed at the same db.collection (namespace) as the source.\n" + "Warning: the local copy of 'ns' is emptied before the copying begins. Any existing data will be lost there." + ; + } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string fromhost = cmdObj.getStringField("from"); + if ( fromhost.empty() ) { + errmsg = "missing 'from' parameter"; + return false; + } + { + HostAndPort h(fromhost); + if( h.isSelf() ) { + errmsg = "can't cloneCollection from self"; + return false; + } + } + string collection = cmdObj.getStringField("cloneCollection"); + if ( collection.empty() ) { + errmsg = "bad 'cloneCollection' value"; + return false; + } + BSONObj query = cmdObj.getObjectField("query"); + if ( query.isEmpty() ) + query = BSONObj(); + + BSONElement copyIndexesSpec = cmdObj.getField("copyindexes"); + bool copyIndexes = copyIndexesSpec.isBoolean() ? copyIndexesSpec.boolean() : true; + + log() << "cloneCollection. db:" << dbname << " collection:" << collection << " from: " << fromhost + << " query: " << query << " " << ( copyIndexes ? "" : ", not copying indexes" ) << endl; + + Cloner c; + auto_ptr<DBClientConnection> myconn; + myconn.reset( new DBClientConnection() ); + if ( ! 
myconn->connect( fromhost , errmsg ) ) + return false; + + c.setConnection( myconn.release() ); + + return c.copyCollection( collection , query, errmsg , true, false, copyIndexes ); + } + } cmdclonecollection; + + + thread_specific_ptr< DBClientConnection > authConn_; + /* Usage: + admindb.$cmd.findOne( { copydbgetnonce: 1, fromhost: <hostname> } ); + */ + class CmdCopyDbGetNonce : public Command { + public: + CmdCopyDbGetNonce() : Command("copydbgetnonce") { } + virtual bool adminOnly() const { + return true; + } + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream &help ) const { + help << "get a nonce for subsequent copy db request from secure server\n"; + help << "usage: {copydbgetnonce: 1, fromhost: <hostname>}"; + } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string fromhost = cmdObj.getStringField("fromhost"); + if ( fromhost.empty() ) { + /* copy from self */ + stringstream ss; + ss << "localhost:" << cmdLine.port; + fromhost = ss.str(); + } + authConn_.reset( new DBClientConnection() ); + BSONObj ret; + { + dbtemprelease t; + if ( !authConn_->connect( fromhost, errmsg ) ) + return false; + if( !authConn_->runCommand( "admin", BSON( "getnonce" << 1 ), ret ) ) { + errmsg = "couldn't get nonce " + ret.toString(); + return false; + } + } + result.appendElements( ret ); + return true; + } + } cmdcopydbgetnonce; + + /* Usage: + admindb.$cmd.findOne( { copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, username: <username>, nonce: <nonce>, key: <key>] } ); + */ + class CmdCopyDb : public Command { + public: + CmdCopyDb() : Command("copydb") { } + virtual bool adminOnly() const { + return true; + } + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream &help ) const { + help << "copy a database from another host to this host\n"; + help << "usage: {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>[, slaveOk: <bool>, username: <username>, nonce: <nonce>, key: <key>]}"; + } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool slaveOk = cmdObj["slaveOk"].trueValue(); + string fromhost = cmdObj.getStringField("fromhost"); + if ( fromhost.empty() ) { + /* copy from self */ + stringstream ss; + ss << "localhost:" << cmdLine.port; + fromhost = ss.str(); + } + string fromdb = cmdObj.getStringField("fromdb"); + string todb = cmdObj.getStringField("todb"); + if ( fromhost.empty() || todb.empty() || fromdb.empty() ) { + errmsg = "parms missing - {copydb: 1, fromhost: <hostname>, fromdb: <db>, todb: <db>}"; + return false; + } + Cloner c; + string username = cmdObj.getStringField( "username" ); + string nonce = cmdObj.getStringField( "nonce" ); + string key = cmdObj.getStringField( "key" ); + if ( !username.empty() && !nonce.empty() && !key.empty() ) { + uassert( 13008, "must call copydbgetnonce first", authConn_.get() ); + BSONObj ret; + { + dbtemprelease t; + if ( !authConn_->runCommand( fromdb, BSON( "authenticate" << 1 << "user" << username << "nonce" << nonce << "key" << key ), ret ) ) { + errmsg = "unable to login " + ret.toString(); + return false; + } + } + c.setConnection( authConn_.release() ); + } + Client::Context ctx(todb); + bool res = c.go(fromhost.c_str(), errmsg, fromdb, /*logForReplication=*/!fromRepl, slaveOk, /*replauth*/false, /*snapshot*/true, 
/*mayYield*/true, /*mayBeInterrupted*/ false); + return res; + } + } cmdcopydb; + + class CmdRenameCollection : public Command { + public: + CmdRenameCollection() : Command( "renameCollection" ) {} + virtual bool adminOnly() const { + return true; + } + virtual bool requiresAuth() { return false; } // do our own auth + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + virtual bool logTheOp() { + return true; // can't log steps when doing fast rename within a db, so always log the op rather than individual steps comprising it. + } + virtual void help( stringstream &help ) const { + help << " example: { renameCollection: foo.a, to: bar.b }"; + } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string source = cmdObj.getStringField( name.c_str() ); + string target = cmdObj.getStringField( "to" ); + uassert(15967,"invalid collection name: " + target, NamespaceString::validCollectionName(target.c_str())); + if ( source.empty() || target.empty() ) { + errmsg = "invalid command syntax"; + return false; + } + + bool capped = false; + long long size = 0; + { + Client::Context ctx( source ); // auths against source + NamespaceDetails *nsd = nsdetails( source.c_str() ); + uassert( 10026 , "source namespace does not exist", nsd ); + capped = nsd->capped; + if ( capped ) + for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) + size += i.ext()->length; + } + + Client::Context ctx( target ); //auths against target + + if ( nsdetails( target.c_str() ) ) { + uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() ); + BSONObjBuilder bb( result.subobjStart( "dropTarget" ) ); + dropCollection( target , errmsg , bb ); + bb.done(); + if ( errmsg.size() > 0 ) + return false; + } + + { + char from[256]; + nsToDatabase( source.c_str(), from ); + char to[256]; + nsToDatabase( target.c_str(), to ); + if ( strcmp( from, to ) == 0 ) { + renameNamespace( source.c_str(), target.c_str() ); + // make sure we drop counters etc + Top::global.collectionDropped( source ); + return true; + } + } + + BSONObjBuilder spec; + if ( capped ) { + spec.appendBool( "capped", true ); + spec.append( "size", double( size ) ); + } + if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) ) + return false; + + auto_ptr< DBClientCursor > c; + DBDirectClient bridge; + + { + c = bridge.query( source, BSONObj() ); + } + while( 1 ) { + { + if ( !c->more() ) + break; + } + BSONObj o = c->next(); + theDataFileMgr.insertWithObjMod( target.c_str(), o ); + } + + char cl[256]; + nsToDatabase( source.c_str(), cl ); + string sourceIndexes = string( cl ) + ".system.indexes"; + nsToDatabase( target.c_str(), cl ); + string targetIndexes = string( cl ) + ".system.indexes"; + { + c = bridge.query( sourceIndexes, QUERY( "ns" << source ) ); + } + while( 1 ) { + { + if ( !c->more() ) + break; + } + BSONObj o = c->next(); + BSONObjBuilder b; + BSONObjIterator i( o ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + if ( strcmp( e.fieldName(), "ns" ) == 0 ) { + b.append( "ns", target ); + } + else { + b.append( e ); + } + } + BSONObj n = b.done(); + theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n ); + } + + { + Client::Context ctx( source ); + dropCollection( source, errmsg, result ); + } + return true; + } + } cmdrenamecollection; + +} // namespace mongo diff --git a/src/mongo/db/cloner.h b/src/mongo/db/cloner.h new file mode 100644 index 
00000000000..130fea0fac1 --- /dev/null +++ b/src/mongo/db/cloner.h @@ -0,0 +1,39 @@ +// cloner.h - copy a database (export/import basically) + +/** + * Copyright (C) 2011 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "jsobj.h" + +namespace mongo { + + /** + * @param slaveOk - if true it is ok if the source of the data is !ismaster. + * @param useReplAuth - use the credentials we normally use as a replication slave for the cloning + * @param snapshot - use $snapshot mode for copying collections. note this should not be used when it isn't required, as it will be slower. + * for example repairDatabase need not use it. + * @param errCode - If provided, this will be set on error to the server's error code. Currently + * this will only be set if there is an error in the initial system.namespaces query. + */ + bool cloneFrom(const char *masterHost, string& errmsg, const string& fromdb, bool logForReplication, + bool slaveOk, bool useReplAuth, bool snapshot, bool mayYield, + bool mayBeInterrupted, int *errCode = 0); + + bool copyCollectionFromRemote(const string& host, const string& ns, string& errmsg); + +} // namespace mongo diff --git a/src/mongo/db/cmdline.cpp b/src/mongo/db/cmdline.cpp new file mode 100644 index 00000000000..a9b0d7097ca --- /dev/null +++ b/src/mongo/db/cmdline.cpp @@ -0,0 +1,519 @@ +// cmdline.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "cmdline.h" +#include "commands.h" +#include "../util/password.h" +#include "../util/processinfo.h" +#include "../util/net/listen.h" +#include "security_common.h" +#ifdef _WIN32 +#include <direct.h> +#else +#include <sys/types.h> +#include <sys/wait.h> +#endif +#include "globals.h" + +#define MAX_LINE_LENGTH 256 + +namespace po = boost::program_options; +namespace fs = boost::filesystem; + +namespace mongo { + + void setupSignals( bool inFork ); + string getHostNameCached(); + static BSONArray argvArray; + static BSONObj parsedOpts; + + void CmdLine::addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ) { + /* support for -vv -vvvv etc. 
*/ + for (string s = "vv"; s.length() <= 12; s.append("v")) { + hidden.add_options()(s.c_str(), "verbose"); + } + + general.add_options() + ("help,h", "show this usage information") + ("version", "show version information") + ("config,f", po::value<string>(), "configuration file specifying additional options") + ("verbose,v", "be more verbose (include multiple times for more verbosity e.g. -vvvvv)") + ("quiet", "quieter output") + ("port", po::value<int>(&cmdLine.port), "specify port number") + ("bind_ip", po::value<string>(&cmdLine.bind_ip), "comma separated list of ip addresses to listen on - all local ips by default") + ("maxConns",po::value<int>(), "max number of simultaneous connections") + ("objcheck", "inspect client data for validity on receipt") + ("logpath", po::value<string>() , "log file to send write to instead of stdout - has to be a file, not directory" ) + ("logappend" , "append to logpath instead of over-writing" ) + ("pidfilepath", po::value<string>(), "full path to pidfile (if not set, no pidfile is created)") + ("keyFile", po::value<string>(), "private key for cluster authentication (only for replica sets)") +#ifndef _WIN32 + ("nounixsocket", "disable listening on unix sockets") + ("unixSocketPrefix", po::value<string>(), "alternative directory for UNIX domain sockets (defaults to /tmp)") + ("fork" , "fork server process" ) + ("syslog" , "log to system's syslog facility instead of file or stdout" ) +#endif + ; + + hidden.add_options() + ("cloud", po::value<string>(), "custom dynamic host naming") +#ifdef MONGO_SSL + ("sslOnNormalPorts" , "use ssl on configured ports" ) + ("sslPEMKeyFile" , po::value<string>(&cmdLine.sslPEMKeyFile), "PEM file for ssl" ) + ("sslPEMKeyPassword" , new PasswordValue(&cmdLine.sslPEMKeyPassword) , "PEM file password" ) +#endif + ; + + } + + +#if defined(_WIN32) + void CmdLine::addWindowsOptions( boost::program_options::options_description& windows , + boost::program_options::options_description& hidden ) { + windows.add_options() + ("install", "install mongodb service") + ("remove", "remove mongodb service") + ("reinstall", "reinstall mongodb service (equivilant of mongod --remove followed by mongod --install)") + ("serviceName", po::value<string>(), "windows service name") + ("serviceDisplayName", po::value<string>(), "windows service display name") + ("serviceDescription", po::value<string>(), "windows service description") + ("serviceUser", po::value<string>(), "user name service executes as") + ("servicePassword", po::value<string>(), "password used to authenticate serviceUser") + ; + hidden.add_options()("service", "start mongodb service"); + } +#endif + + void CmdLine::parseConfigFile( istream &f, stringstream &ss ) { + string s; + char line[MAX_LINE_LENGTH]; + + while ( f ) { + f.getline(line, MAX_LINE_LENGTH); + s = line; + std::remove(s.begin(), s.end(), ' '); + std::remove(s.begin(), s.end(), '\t'); + boost::to_upper(s); + + if ( s.find( "FASTSYNC" ) != string::npos ) + cout << "warning \"fastsync\" should not be put in your configuration file" << endl; + + if ( s.c_str()[0] == '#' ) { + // skipping commented line + } else if ( s.find( "=FALSE" ) == string::npos ) { + ss << line << endl; + } else { + cout << "warning: remove or comment out this line by starting it with \'#\', skipping now : " << line << endl; + } + } + return; + } + +#ifndef _WIN32 + // support for exit value propagation with fork + void launchSignal( int sig ) { + if ( sig == SIGUSR2 ) { + pid_t cur = getpid(); + + if ( cur == cmdLine.parentProc || cur == 
cmdLine.leaderProc ) { + // signal indicates successful start allowing us to exit + _exit(0); + } + } + } + + void setupLaunchSignals() { + assert( signal(SIGUSR2 , launchSignal ) != SIG_ERR ); + } + + + void CmdLine::launchOk() { + if ( cmdLine.doFork ) { + // killing leader will propagate to parent + assert( kill( cmdLine.leaderProc, SIGUSR2 ) == 0 ); + } + } +#endif + + bool CmdLine::store( int argc , char ** argv , + boost::program_options::options_description& visible, + boost::program_options::options_description& hidden, + boost::program_options::positional_options_description& positional, + boost::program_options::variables_map ¶ms ) { + + + { + // setup binary name + cmdLine.binaryName = argv[0]; + size_t i = cmdLine.binaryName.rfind( '/' ); + if ( i != string::npos ) + cmdLine.binaryName = cmdLine.binaryName.substr( i + 1 ); + + // setup cwd + char buffer[1024]; +#ifdef _WIN32 + assert( _getcwd( buffer , 1000 ) ); +#else + assert( getcwd( buffer , 1000 ) ); +#endif + cmdLine.cwd = buffer; + } + + + /* don't allow guessing - creates ambiguities when some options are + * prefixes of others. allow long disguises and don't allow guessing + * to get away with our vvvvvvv trick. */ + int style = (((po::command_line_style::unix_style ^ + po::command_line_style::allow_guessing) | + po::command_line_style::allow_long_disguise) ^ + po::command_line_style::allow_sticky); + + + try { + + po::options_description all; + all.add( visible ); + all.add( hidden ); + + po::store( po::command_line_parser(argc, argv) + .options( all ) + .positional( positional ) + .style( style ) + .run(), + params ); + + if ( params.count("config") ) { + ifstream f( params["config"].as<string>().c_str() ); + if ( ! f.is_open() ) { + cout << "ERROR: could not read from config file" << endl << endl; + cout << visible << endl; + return false; + } + + stringstream ss; + CmdLine::parseConfigFile( f, ss ); + po::store( po::parse_config_file( ss , all ) , params ); + f.close(); + } + + po::notify(params); + } + catch (po::error &e) { + cout << "error command line: " << e.what() << endl; + cout << "use --help for help" << endl; + //cout << visible << endl; + return false; + } + + if (params.count("verbose")) { + logLevel = 1; + } + + for (string s = "vv"; s.length() <= 12; s.append("v")) { + if (params.count(s)) { + logLevel = s.length(); + } + } + + if (params.count("quiet")) { + cmdLine.quiet = true; + } + + if ( params.count( "maxConns" ) ) { + int newSize = params["maxConns"].as<int>(); + if ( newSize < 5 ) { + out() << "maxConns has to be at least 5" << endl; + dbexit( EXIT_BADOPTIONS ); + } + else if ( newSize >= 10000000 ) { + out() << "maxConns can't be greater than 10000000" << endl; + dbexit( EXIT_BADOPTIONS ); + } + connTicketHolder.resize( newSize ); + } + + if (params.count("objcheck")) { + cmdLine.objcheck = true; + } + + string logpath; + +#ifndef _WIN32 + if (params.count("unixSocketPrefix")) { + cmdLine.socket = params["unixSocketPrefix"].as<string>(); + if (!fs::is_directory(cmdLine.socket)) { + cout << cmdLine.socket << " must be a directory" << endl; + ::exit(-1); + } + } + + if (params.count("nounixsocket")) { + cmdLine.noUnixSocket = true; + } + + if (params.count("fork")) { + cmdLine.doFork = true; + if ( ! params.count( "logpath" ) && ! 
params.count( "syslog" ) ) { + cout << "--fork has to be used with --logpath or --syslog" << endl; + ::exit(-1); + } + + if ( params.count( "logpath" ) ) { + // test logpath + logpath = params["logpath"].as<string>(); + assert( logpath.size() ); + if ( logpath[0] != '/' ) { + logpath = cmdLine.cwd + "/" + logpath; + } + FILE * test = fopen( logpath.c_str() , "a" ); + if ( ! test ) { + cout << "can't open [" << logpath << "] for log file: " << errnoWithDescription() << endl; + ::exit(-1); + } + fclose( test ); + } + + cout.flush(); + cerr.flush(); + + cmdLine.parentProc = getpid(); + + // facilitate clean exit when child starts successfully + setupLaunchSignals(); + + pid_t c = fork(); + if ( c ) { + int pstat; + waitpid(c, &pstat, 0); + + if ( WIFEXITED(pstat) ) { + if ( ! WEXITSTATUS(pstat) ) { + cout << "child process started successfully, parent exiting" << endl; + } + + _exit( WEXITSTATUS(pstat) ); + } + + _exit(50); + } + + if ( chdir("/") < 0 ) { + cout << "Cant chdir() while forking server process: " << strerror(errno) << endl; + ::exit(-1); + } + setsid(); + + cmdLine.leaderProc = getpid(); + + pid_t c2 = fork(); + if ( c2 ) { + int pstat; + cout << "forked process: " << c2 << endl; + waitpid(c2, &pstat, 0); + + if ( WIFEXITED(pstat) ) { + _exit( WEXITSTATUS(pstat) ); + } + + _exit(51); + } + + // stdout handled in initLogging + //fclose(stdout); + //freopen("/dev/null", "w", stdout); + + fclose(stderr); + fclose(stdin); + + FILE* f = freopen("/dev/null", "w", stderr); + if ( f == NULL ) { + cout << "Cant reassign stderr while forking server process: " << strerror(errno) << endl; + ::exit(-1); + } + + f = freopen("/dev/null", "r", stdin); + if ( f == NULL ) { + cout << "Cant reassign stdin while forking server process: " << strerror(errno) << endl; + ::exit(-1); + } + + setupCoreSignals(); + setupSignals( true ); + } + + if (params.count("syslog")) { + StringBuilder sb(128); + sb << cmdLine.binaryName << "." 
<< cmdLine.port; + Logstream::useSyslog( sb.str().c_str() ); + } +#endif + if (params.count("logpath")) { + if ( params.count("syslog") ) { + cout << "Cant use both a logpath and syslog " << endl; + ::exit(-1); + } + + if ( logpath.size() == 0 ) + logpath = params["logpath"].as<string>(); + uassert( 10033 , "logpath has to be non-zero" , logpath.size() ); + initLogging( logpath , params.count( "logappend" ) ); + } + + if ( params.count("pidfilepath")) { + writePidFile( params["pidfilepath"].as<string>() ); + } + + if (params.count("keyFile")) { + const string f = params["keyFile"].as<string>(); + + if (!setUpSecurityKey(f)) { + // error message printed in setUpPrivateKey + dbexit(EXIT_BADOPTIONS); + } + + cmdLine.keyFile = true; + noauth = false; + } + else { + cmdLine.keyFile = false; + } + +#ifdef MONGO_SSL + if (params.count("sslOnNormalPorts") ) { + cmdLine.sslOnNormalPorts = true; + + if ( cmdLine.sslPEMKeyPassword.size() == 0 ) { + log() << "need sslPEMKeyPassword" << endl; + dbexit(EXIT_BADOPTIONS); + } + + if ( cmdLine.sslPEMKeyFile.size() == 0 ) { + log() << "need sslPEMKeyFile" << endl; + dbexit(EXIT_BADOPTIONS); + } + + cmdLine.sslServerManager = new SSLManager( false ); + cmdLine.sslServerManager->setupPEM( cmdLine.sslPEMKeyFile , cmdLine.sslPEMKeyPassword ); + } + + if ( cmdLine.sslPEMKeyFile.size() || cmdLine.sslPEMKeyPassword.size() ) { + log() << "need to enable sslOnNormalPorts" << endl; + dbexit(EXIT_BADOPTIONS); + } +#endif + + { + BSONObjBuilder b; + for (po::variables_map::const_iterator it(params.begin()), end(params.end()); it != end; it++){ + if (!it->second.defaulted()){ + const string& key = it->first; + const po::variable_value& value = it->second; + const type_info& type = value.value().type(); + + if (type == typeid(string)){ + if (value.as<string>().empty()) + b.appendBool(key, true); // boost po uses empty string for flags like --quiet + else + b.append(key, value.as<string>()); + } + else if (type == typeid(int)) + b.append(key, value.as<int>()); + else if (type == typeid(double)) + b.append(key, value.as<double>()); + else if (type == typeid(bool)) + b.appendBool(key, value.as<bool>()); + else if (type == typeid(long)) + b.appendNumber(key, (long long)value.as<long>()); + else if (type == typeid(unsigned)) + b.appendNumber(key, (long long)value.as<unsigned>()); + else if (type == typeid(unsigned long long)) + b.appendNumber(key, (long long)value.as<unsigned long long>()); + else if (type == typeid(vector<string>)) + b.append(key, value.as<vector<string> >()); + else + b.append(key, "UNKNOWN TYPE: " + demangleName(type)); + } + } + parsedOpts = b.obj(); + } + + { + BSONArrayBuilder b; + for (int i=0; i < argc; i++) + b << argv[i]; + argvArray = b.arr(); + } + + return true; + } + + void printCommandLineOpts() { + log() << "options: " << parsedOpts << endl; + } + + void ignoreSignal( int sig ) {} + + void setupCoreSignals() { +#if !defined(_WIN32) + assert( signal(SIGUSR1 , rotateLogs ) != SIG_ERR ); + assert( signal(SIGHUP , ignoreSignal ) != SIG_ERR ); +#endif + } + + class CmdGetCmdLineOpts : Command { + public: + CmdGetCmdLineOpts(): Command("getCmdLineOpts") {} + void help(stringstream& h) const { h << "get argv"; } + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return true; } + virtual bool slaveOk() const { return true; } + + virtual bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + result.append("argv", argvArray); + result.append("parsed", parsedOpts); + 
return true; + } + + } cmdGetCmdLineOpts; + + string prettyHostName() { + StringBuilder s(128); + s << getHostNameCached(); + if( cmdLine.port != CmdLine::DefaultDBPort ) + s << ':' << mongo::cmdLine.port; + return s.str(); + } + + casi< map<string,ParameterValidator*> * > pv_all (NULL); + + ParameterValidator::ParameterValidator( const string& name ) : _name( name ) { + if ( ! pv_all) + pv_all.ref() = new map<string,ParameterValidator*>(); + (*pv_all.ref())[_name] = this; + } + + ParameterValidator * ParameterValidator::get( const string& name ) { + map<string,ParameterValidator*>::const_iterator i = pv_all.get()->find( name ); + if ( i == pv_all.get()->end() ) + return NULL; + return i->second; + } + +} diff --git a/src/mongo/db/cmdline.h b/src/mongo/db/cmdline.h new file mode 100644 index 00000000000..5fe6ceb1005 --- /dev/null +++ b/src/mongo/db/cmdline.h @@ -0,0 +1,203 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../pch.h" +#include "jsobj.h" + +namespace mongo { + +#ifdef MONGO_SSL + class SSLManager; +#endif + + /* command line options + */ + /* concurrency: OK/READ */ + struct CmdLine { + + CmdLine(); + + string binaryName; // mongod or mongos + string cwd; // cwd of when process started + + // this is suboptimal as someone could rename a binary. todo... 
+ bool isMongos() const { return binaryName == "mongos"; } + + int port; // --port + enum { + DefaultDBPort = 27017, + ConfigServerPort = 27019, + ShardServerPort = 27018 + }; + bool isDefaultPort() const { return port == DefaultDBPort; } + + string bind_ip; // --bind_ip + bool rest; // --rest + bool jsonp; // --jsonp + + string _replSet; // --replSet[/<seedlist>] + string ourSetName() const { + string setname; + size_t sl = _replSet.find('/'); + if( sl == string::npos ) + return _replSet; + return _replSet.substr(0, sl); + } + bool usingReplSets() const { return !_replSet.empty(); } + + // for master/slave replication + string source; // --source + string only; // --only + + bool quiet; // --quiet + bool noTableScan; // --notablescan no table scans allowed + bool prealloc; // --noprealloc no preallocation of data files + bool preallocj; // --nopreallocj no preallocation of journal files + bool smallfiles; // --smallfiles allocate smaller data files + + bool configsvr; // --configsvr + + bool quota; // --quota + int quotaFiles; // --quotaFiles + bool cpu; // --cpu show cpu time periodically + + bool dur; // --dur durability (now --journal) + unsigned journalCommitInterval; // group/batch commit interval ms + + /** --durOptions 7 dump journal and terminate without doing anything further + --durOptions 4 recover and terminate without listening + */ + enum { // bits to be ORed + DurDumpJournal = 1, // dump diagnostics on the journal during recovery + DurScanOnly = 2, // don't do any real work, just scan and dump if dump specified + DurRecoverOnly = 4, // terminate after recovery step + DurParanoid = 8, // paranoid mode enables extra checks + DurAlwaysCommit = 16, // do a group commit every time the writelock is released + DurAlwaysRemap = 32, // remap the private view after every group commit (may lag to the next write lock acquisition, but will do all files then) + DurNoCheckSpace = 64 // don't check that there is enough room for journal files before startup (for diskfull tests) + }; + int durOptions; // --durOptions <n> for debugging + + bool objcheck; // --objcheck + + long long oplogSize; // --oplogSize + int defaultProfile; // --profile + int slowMS; // --time in ms that is "slow" + + int pretouch; // --pretouch for replication application (experimental) + bool moveParanoia; // for move chunk paranoia + double syncdelay; // seconds between fsyncs + + bool noUnixSocket; // --nounixsocket + bool doFork; // --fork + string socket; // UNIX domain socket directory + + bool keyFile; + +#ifndef _WIN32 + pid_t parentProc; // --fork pid of initial process + pid_t leaderProc; // --fork pid of leader process +#endif + +#ifdef MONGO_SSL + bool sslOnNormalPorts; // --sslOnNormalPorts + string sslPEMKeyFile; // --sslPEMKeyFile + string sslPEMKeyPassword; // --sslPEMKeyPassword + + SSLManager* sslServerManager; // currently leaks on close +#endif + + static void launchOk(); + + static void addGlobalOptions( boost::program_options::options_description& general , + boost::program_options::options_description& hidden ); + + static void addWindowsOptions( boost::program_options::options_description& windows , + boost::program_options::options_description& hidden ); + + + static void parseConfigFile( istream &f, stringstream &ss); + /** + * @return true if should run program, false if should exit + */ + static bool store( int argc , char ** argv , + boost::program_options::options_description& visible, + boost::program_options::options_description& hidden, + 
boost::program_options::positional_options_description& positional, + boost::program_options::variables_map &output ); + + time_t started; + }; + + // todo move to cmdline.cpp? + inline CmdLine::CmdLine() : + port(DefaultDBPort), rest(false), jsonp(false), quiet(false), noTableScan(false), prealloc(true), preallocj(true), smallfiles(sizeof(int*) == 4), + configsvr(false), + quota(false), quotaFiles(8), cpu(false), durOptions(0), objcheck(false), oplogSize(0), defaultProfile(0), slowMS(100), pretouch(0), moveParanoia( true ), + syncdelay(60), noUnixSocket(false), doFork(0), socket("/tmp") + { + started = time(0); + + journalCommitInterval = 0; // 0 means use default + dur = false; +#if defined(_DURABLEDEFAULTON) + dur = true; +#endif + if( sizeof(void*) == 8 ) + dur = true; +#if defined(_DURABLEDEFAULTOFF) + dur = false; +#endif + +#ifdef MONGO_SSL + sslOnNormalPorts = false; + sslServerManager = 0; +#endif + } + + extern CmdLine cmdLine; + + void setupLaunchSignals(); + void setupCoreSignals(); + + string prettyHostName(); + + void printCommandLineOpts(); + + /** + * used for setParameter command + * so you can write validation code that lives with code using it + * rather than all in the command place + * also lets you have mongos or mongod specific code + * without pulling it all sorts of things + */ + class ParameterValidator { + public: + ParameterValidator( const string& name ); + virtual ~ParameterValidator() {} + + virtual bool isValid( BSONElement e , string& errmsg ) const = 0; + + static ParameterValidator * get( const string& name ); + + private: + const string _name; + }; + +} + diff --git a/src/mongo/db/collection.h b/src/mongo/db/collection.h new file mode 100644 index 00000000000..998b2f0beac --- /dev/null +++ b/src/mongo/db/collection.h @@ -0,0 +1,15 @@ +// @file collection.h + +#pragma once + +#include "namespace.h" + +namespace mongo { + + class Collection { + public: + NamespaceDetails * const d; + NamespaceDetailsTransient * const nsd; + }; + +} diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp new file mode 100755 index 00000000000..cbe9ffc6861 --- /dev/null +++ b/src/mongo/db/commands.cpp @@ -0,0 +1,209 @@ +/* commands.cpp
+ db "commands" (sent via db.$cmd.findOne(...))
+ */
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "jsobj.h"
+#include "commands.h"
+#include "client.h"
+#include "replutil.h"
+
+namespace mongo {
+
+ map<string,Command*> * Command::_commandsByBestName;
+ map<string,Command*> * Command::_webCommands;
+ map<string,Command*> * Command::_commands;
+
+ string Command::parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const {
+ string s = cmdObj.firstElement().valuestr();
+ NamespaceString nss(s);
+ // these are for security, do not remove:
+ verify(15966, dbname == nss.db || dbname == "admin" );
+ verify(15962, !nss.db.empty() );
+ return s;
+ }
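+
+ // example (hypothetical values, for illustration): with dbname "test" and
+ // cmdObj { renameCollection : "test.foo", to : "test.bar" }, the first
+ // element is already fully qualified, so this returns "test.foo"; a first
+ // element naming a different db (when dbname isn't "admin") fails 15966.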
+
+ /*virtual*/ string Command::parseNs(const string& dbname, const BSONObj& cmdObj) const {
+ string coll = cmdObj.firstElement().valuestr();
+#if defined(CLC)
+ DEV if( mongoutils::str::startsWith(coll, dbname+'.') ) {
+ log() << "DEBUG parseNs Command's collection name looks like it includes the db name\n"
+ << dbname << '\n'
+ << coll << '\n'
+ << cmdObj.toString() << endl;
+ dassert(false);
+ }
+#endif
+ return dbname + '.' + coll;
+ }
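+
+ // example (hypothetical values): dbname "test" with
+ // cmdObj { distinct : "foo", key : "a" } yields "test.foo" -- the common
+ // case where a command's first element is a bare collection name.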
+
+ void Command::htmlHelp(stringstream& ss) const {
+ string helpStr;
+ {
+ stringstream h;
+ help(h);
+ helpStr = h.str();
+ }
+ ss << "\n<tr><td>";
+ bool web = _webCommands->count(name) != 0;
+ if( web ) ss << "<a href=\"/" << name << "?text=1\">";
+ ss << name;
+ if( web ) ss << "</a>";
+ ss << "</td>\n";
+ ss << "<td>";
+ int l = locktype();
+ //if( l == NONE ) ss << "N ";
+ if( l == READ ) ss << "R ";
+ else if( l == WRITE ) ss << "W ";
+ if( slaveOk() )
+ ss << "S ";
+ if( adminOnly() )
+ ss << "A";
+ ss << "</td>";
+ ss << "<td>";
+ if( helpStr != "no help defined" ) {
+ const char *p = helpStr.c_str();
+ while( *p ) {
+ if( *p == '<' ) {
+ ss << "&lt;";
+ p++; continue;
+ }
+ else if( *p == '{' )
+ ss << "<code>";
+ else if( *p == '}' ) {
+ ss << "}</code>";
+ p++;
+ continue;
+ }
+ if( strncmp(p, "http:", 5) == 0 ) {
+ ss << "<a href=\"";
+ const char *q = p;
+ while( *q && *q != ' ' && *q != '\n' )
+ ss << *q++;
+ ss << "\">";
+ q = p;
+ if( startsWith(q, "http://www.mongodb.org/display/") )
+ q += 31;
+ while( *q && *q != ' ' && *q != '\n' ) {
+ ss << (*q == '+' ? ' ' : *q);
+ q++;
+ if( *q == '#' )
+ while( *q && *q != ' ' && *q != '\n' ) q++;
+ }
+ ss << "</a>";
+ p = q;
+ continue;
+ }
+ if( *p == '\n' ) ss << "<br>";
+ else ss << *p;
+ p++;
+ }
+ }
+ ss << "</td>";
+ ss << "</tr>\n";
+ }
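+
+ // e.g. (hypothetical help text): "{ isMaster : 1 } see
+ // http://www.mongodb.org/display/DOCS/Replica+Sets" is rendered as
+ // <code>{ isMaster : 1 }</code> followed by a link whose visible text
+ // drops the wiki prefix and maps '+' to ' ' ("DOCS/Replica Sets").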
+
+ Command::Command(const char *_name, bool web, const char *oldName) : name(_name) {
+ // register ourself.
+ if ( _commands == 0 )
+ _commands = new map<string,Command*>;
+ if( _commandsByBestName == 0 )
+ _commandsByBestName = new map<string,Command*>;
+ Command*& c = (*_commands)[name];
+ if ( c )
+ log() << "warning: 2 commands with name: " << _name << endl;
+ c = this;
+ (*_commandsByBestName)[name] = this;
+
+ if( web ) {
+ if( _webCommands == 0 )
+ _webCommands = new map<string,Command*>;
+ (*_webCommands)[name] = this;
+ }
+
+ if( oldName )
+ (*_commands)[oldName] = this;
+ }
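+
+ // minimal sketch (hypothetical command name, not part of this commit) of
+ // the self-registration pattern: constructing a static instance is all it
+ // takes for the command to appear in _commands:
+ //
+ // class CmdNoop : public Command {
+ // public:
+ // CmdNoop() : Command("noop") {}
+ // virtual bool slaveOk() const { return true; }
+ // virtual LockType locktype() const { return NONE; }
+ // virtual bool run(const string&, BSONObj&, int, string&,
+ // BSONObjBuilder&, bool) { return true; }
+ // } cmdNoop;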
+
+ void Command::help( stringstream& help ) const {
+ help << "no help defined";
+ }
+
+ Command* Command::findCommand( const string& name ) {
+ map<string,Command*>::iterator i = _commands->find( name );
+ if ( i == _commands->end() )
+ return 0;
+ return i->second;
+ }
+
+
+ Command::LockType Command::locktype( const string& name ) {
+ Command * c = findCommand( name );
+ if ( ! c )
+ return WRITE;
+ return c->locktype();
+ }
+
+ void Command::logIfSlow( const Timer& timer, const string& msg ) {
+ int ms = timer.millis();
+ if ( ms > cmdLine.slowMS ) {
+ out() << msg << " took " << ms << " ms." << endl;
+ }
+ }
+
+}
+
+#include "../client/connpool.h"
+
+namespace mongo {
+
+ extern DBConnectionPool pool;
+
+ class PoolFlushCmd : public Command {
+ public:
+ PoolFlushCmd() : Command( "connPoolSync" , false , "connpoolsync" ) {}
+ virtual void help( stringstream &help ) const { help<<"internal"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.flush();
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolFlushCmd;
+
+ class PoolStats : public Command {
+ public:
+ PoolStats() : Command( "connPoolStats" ) {}
+ virtual void help( stringstream &help ) const { help<<"stats about connection pool"; }
+ virtual LockType locktype() const { return NONE; }
+ virtual bool run(const string&, mongo::BSONObj&, int, std::string&, mongo::BSONObjBuilder& result, bool) {
+ pool.appendInfo( result );
+ result.append( "numDBClientConnection" , DBClientConnection::getNumConnections() );
+ result.append( "numAScopedConnection" , AScopedConnection::getNumConnections() );
+ return true;
+ }
+ virtual bool slaveOk() const {
+ return true;
+ }
+
+ } poolStatsCmd;
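+
+ // shell usage (illustrative; neither command takes arguments):
+ // db.runCommand( { connPoolSync : 1 } )
+ // db.runCommand( { connPoolStats : 1 } )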
+
+} // namespace mongo
diff --git a/src/mongo/db/commands.h b/src/mongo/db/commands.h new file mode 100644 index 00000000000..85cdd38d7a4 --- /dev/null +++ b/src/mongo/db/commands.h @@ -0,0 +1,164 @@ +// commands.h + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "jsobj.h" +#include "../util/mongoutils/str.h" + +namespace mongo { + + class BSONObj; + class BSONObjBuilder; + class Client; + class Timer; + + /** mongodb "commands" (sent via db.$cmd.findOne(...)) + subclass to make a command. define a singleton object for it. + */ + class Command { + protected: + string parseNsFullyQualified(const string& dbname, const BSONObj& cmdObj) const; + public: + // only makes sense for commands where 1st parm is the collection. + virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const; + + enum LockType { READ = -1 , NONE = 0 , WRITE = 1 }; + + const string name; + + /* run the given command + implement this... + + fromRepl - command is being invoked as part of replication syncing. In this situation you + normally do not want to log the command to the local oplog. + + return value is true if succeeded. if false, set errmsg text. + */ + virtual bool run(const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) = 0; + + /* + note: logTheTop() MUST be false if READ + if NONE, can't use Client::Context setup + use with caution + */ + virtual LockType locktype() const = 0; + + /* Return true if only the admin ns has privileges to run this command. */ + virtual bool adminOnly() const { + return false; + } + + void htmlHelp(stringstream&) const; + + /* Like adminOnly, but even stricter: we must either be authenticated for admin db, + or, if running without auth, on the local interface. Used for things which + are so major that remote invocation may not make sense (e.g., shutdownServer). + + When localHostOnlyIfNoAuth() is true, adminOnly() must also be true. + */ + virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return false; } + + /* Return true if slaves are allowed to execute the command + (the command directly from a client -- if fromRepl, always allowed). + */ + virtual bool slaveOk() const = 0; + + /* Return true if the client force a command to be run on a slave by + turning on the 'slaveOk' option in the command query. + */ + virtual bool slaveOverrideOk() { + return false; + } + + /* Override and return true to if true,log the operation (logOp()) to the replication log. + (not done if fromRepl of course) + + Note if run() returns false, we do NOT log. + */ + virtual bool logTheOp() { return false; } + + virtual void help( stringstream& help ) const; + + /* Return true if authentication and security applies to the commands. Some commands + (e.g., getnonce, authenticate) can be done by anyone even unauthorized. 
+ */ + virtual bool requiresAuth() { return true; } + + /* Return true if a replica set secondary should go into "recovering" + (unreadable) state while running this command. + */ + virtual bool maintenanceMode() const { return false; } + + /* Return true if command should be permitted when a replica set secondary is in "recovering" + (unreadable) state. + */ + virtual bool maintenanceOk() const { return true; /* assumed true prior to commit */ } + + /** @param webUI expose the command in the web ui as localhost:28017/<name> + @param oldName an optional old, deprecated name for the command + */ + Command(const char *_name, bool webUI = false, const char *oldName = 0); + + virtual ~Command() {} + + protected: + BSONObj getQuery( const BSONObj& cmdObj ) { + if ( cmdObj["query"].type() == Object ) + return cmdObj["query"].embeddedObject(); + if ( cmdObj["q"].type() == Object ) + return cmdObj["q"].embeddedObject(); + return BSONObj(); + } + + static void logIfSlow( const Timer& cmdTimer, const string& msg); + + static map<string,Command*> * _commands; + static map<string,Command*> * _commandsByBestName; + static map<string,Command*> * _webCommands; + + public: + static const map<string,Command*>* commandsByBestName() { return _commandsByBestName; } + static const map<string,Command*>* webCommands() { return _webCommands; } + /** @return if command was found and executed */ + static bool runAgainstRegistered(const char *ns, BSONObj& jsobj, BSONObjBuilder& anObjBuilder, int queryOptions = 0); + static LockType locktype( const string& name ); + static Command * findCommand( const string& name ); + }; + + class CmdShutdown : public Command { + public: + virtual bool requiresAuth() { return true; } + virtual bool adminOnly() const { return true; } + virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { return true; } + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return true; + } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream& help ) const; + CmdShutdown() : Command("shutdown") {} + bool run(const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl); + private: + bool shutdownHelper(); + }; + + bool _runCommands(const char *ns, BSONObj& jsobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions); + +} // namespace mongo diff --git a/src/mongo/db/commands/aggregate.js b/src/mongo/db/commands/aggregate.js new file mode 100755 index 00000000000..7741e3121ff --- /dev/null +++ b/src/mongo/db/commands/aggregate.js @@ -0,0 +1,184 @@ +/* sample aggregate command queries */
+
+// make sure we're using the right db; this is the same as "use mydb;" in shell
+db = db.getSisterDB("mydb");
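+
+// the samples below assume an "article" collection whose documents carry
+// fields such as author, title, pageViews, other, comments, and a "tags"
+// array (shape inferred from the pipelines themselves)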
+
+// just passing through fields
+var p1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ tags : 1,
+ pageViews : 1
+ }}
+]});
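+
+// note: _id is carried through by $project unless excluded explicitly
+// (see the _id : 0 form in p8 below)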
+
+// unwinding an array
+var p2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }}
+]});
+
+// pulling values out of subdocuments
+var p3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ otherfoo : "other.foo",
+ otherbar : "other.bar"
+ }}
+]});
+
+// projection includes a computed value
+var p4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ daveWroteIt : { $eq:["$author", "dave"] }
+ }}
+]});
+
+// projection includes a virtual (fabricated) document
+var p5 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $project : {
+ author : 1,
+ subDocument : { foo : "pageViews", bar : "tag" }
+ }}
+]});
+
+// multi-step aggregate
+// nested expressions in computed fields
+var p6 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $project : {
+ author : 1,
+ tag : 1,
+ pageViews : 1,
+ daveWroteIt : { $eq:["$author", "dave"] },
+ weLikeIt : { $or:[ { $eq:["$author", "dave"] },
+ { $eq:["$tag", "good"] } ] }
+ }}
+]});
+
+// slightly more complex computed expression; $ifnull
+var p7 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ theSum : { $add:["$pageViews",
+ { $ifnull:["$other.foo",
+ "$other.bar"] } ] }
+ }}
+]});
+
+// dotted path inclusion; _id exclusion
+var p8 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ _id : 0,
+ author : 1,
+ tag : { $unwind : "tags" },
+ "comments.author" : 1
+ }}
+]});
+
+
+// simple matching
+var m1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $match : { author : "dave" } }
+]});
+
+// combining matching with a projection
+var m2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ title : 1,
+ author : 1,
+ pageViews : 1,
+ tag : { $unwind : "tags" },
+ comments : 1
+ }},
+ { $match : { tag : "nasty" } }
+]});
+
+
+// group by tag
+var g1 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" }
+ }}
+]});
+
+// $max, and averaging in a final projection
+var g2 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ mostViewsByTag : { $max : "$pageViews" },
+ }},
+ { $project : {
+ _id: false,
+ tag : "_id.tag",
+ mostViewsByTag : 1,
+ docsByTag : 1,
+ viewsByTag : 1,
+ avgByTag : { $divide:["$viewsByTag", "$docsByTag"] }
+ }}
+]});
+
+// $push as an accumulator; can pivot data
+var g3 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" }
+ }},
+ { $group : {
+ _id : { tag : 1 },
+ authors : { $push : "$author" }
+ }}
+]});
+
+// $avg as a $group accumulator (here the average is computed in the group stage itself)
+var g4 = db.runCommand(
+{ aggregate : "article", pipeline : [
+ { $project : {
+ author : 1,
+ tag : { $unwind : "tags" },
+ pageViews : 1
+ }},
+ { $group : {
+ _id: { tag : 1 },
+ docsByTag : { $sum : 1 },
+ viewsByTag : { $sum : "$pageViews" },
+ avgByTag : { $avg : "$pageViews" },
+ }}
+]});
diff --git a/src/mongo/db/commands/cloud.cpp b/src/mongo/db/commands/cloud.cpp new file mode 100644 index 00000000000..8f9d9d2e4b5 --- /dev/null +++ b/src/mongo/db/commands/cloud.cpp @@ -0,0 +1,90 @@ +#include "../commands.h" +#include <map> +#include "../../util/concurrency/value.h" +#include "../../util/mongoutils/str.h" +#include "../../util/net/hostandport.h" + +using namespace mongoutils; + +namespace mongo { + + mapsf<string,string> dynHostNames; + extern DiagStr _hostNameCached; + + string dynHostMyName() { + if( !str::startsWith(_hostNameCached, '#') ) + return ""; + return _hostNameCached; + } + + void dynHostResolve(string& name, int& port) { + assert( !name.empty() ); + assert( !str::contains(name, ':') ); + assert( str::startsWith(name, '#') ); + string s = dynHostNames.get(name); + if( s.empty() ) { + name.clear(); + return; + } + assert( !str::startsWith(s, '#') ); + HostAndPort hp(s); + if( hp.hasPort() ) { + port = hp.port(); + log() << "info: dynhost in:" << name << " out:" << hp.toString() << endl; + } + name = hp.host(); + } + + /** + { cloud:1, nodes: { + name : <ip>, ... + }, + me : <mylogicalname> + } + */ + class CmdCloud : public Command { + public: + virtual LockType locktype() const { return NONE; } + virtual bool logTheOp() { return false; } + virtual bool adminOnly() const { return true; } // very important + virtual bool localHostOnlyIfNoAuth(const BSONObj&) { return true; } + virtual bool slaveOk() const { return true; } + virtual void help( stringstream& help ) const { + help << "internal\n"; + help << "{cloud:1,nodes:...,me:<my_logical_name>}"; + } + CmdCloud() : Command("cloud") {} + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + assert(!fromRepl); + BSONObj nodes = cmdObj["nodes"].Obj(); + map<string,string> ipmap; + for( BSONObj::iterator i(nodes); i.more(); ) { + BSONElement e = i.next(); + assert( *e.fieldName() == '#' ); + ipmap[e.fieldName()] = e.String(); + } + + string me = cmdObj["me"].String(); + assert( !me.empty() && me[0] == '#' ); + + log(/*1*/) << "CmdCloud" << endl; + + if( me != _hostNameCached.get() ) { + log() << "CmdCloud new 'me' value:" << me << endl; + _hostNameCached = me; + } + + dynHostNames.swap(ipmap); + return true; + } + } cmdCloud; + + BSONObj fromjson(const string &str); + + void cloudCmdLineParamIs(string cmd) { + string errmsg; + BSONObjBuilder res; + BSONObj o = fromjson(cmd); + cmdCloud.run("", o, 0, errmsg, res, false); + } +} diff --git a/src/mongo/db/commands/distinct.cpp b/src/mongo/db/commands/distinct.cpp new file mode 100644 index 00000000000..1926e6abddb --- /dev/null +++ b/src/mongo/db/commands/distinct.cpp @@ -0,0 +1,157 @@ +// distinct.cpp + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +//#include "pch.h" +#include "../commands.h" +#include "../instance.h" +#include "../queryoptimizer.h" +#include "../clientcursor.h" +#include "../../util/timer.h" + +namespace mongo { + + class DistinctCommand : public Command { + public: + DistinctCommand() : Command("distinct") {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << "{ distinct : 'collection name' , key : 'a.b' , query : {} }"; + } + + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + Timer t; + string ns = dbname + '.' + cmdObj.firstElement().valuestr(); + + string key = cmdObj["key"].valuestrsafe(); + BSONObj keyPattern = BSON( key << 1 ); + + BSONObj query = getQuery( cmdObj ); + + int bufSize = BSONObjMaxUserSize - 4096; + BufBuilder bb( bufSize ); + char * start = bb.buf(); + + BSONArrayBuilder arr( bb ); + BSONElementSet values; + + long long nscanned = 0; // locations looked at + long long nscannedObjects = 0; // full objects looked at + long long n = 0; // matches + MatchDetails md; + + NamespaceDetails * d = nsdetails( ns.c_str() ); + + if ( ! d ) { + result.appendArray( "values" , BSONObj() ); + result.append( "stats" , BSON( "n" << 0 << "nscanned" << 0 << "nscannedObjects" << 0 ) ); + return true; + } + + shared_ptr<Cursor> cursor; + if ( ! query.isEmpty() ) { + cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() ); + } + else { + + // query is empty, so lets see if we can find an index + // with the key so we don't have to hit the raw data + NamespaceDetails::IndexIterator ii = d->ii(); + while ( ii.more() ) { + IndexDetails& idx = ii.next(); + + if ( d->isMultikey( ii.pos() - 1 ) ) + continue; + + if ( idx.inKeyPattern( key ) ) { + cursor = bestGuessCursor( ns.c_str() , BSONObj() , idx.keyPattern() ); + if( cursor.get() ) break; + } + + } + + if ( ! cursor.get() ) + cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query , BSONObj() ); + + } + + + assert( cursor ); + string cursorName = cursor->toString(); + + auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns)); + + while ( cursor->ok() ) { + nscanned++; + bool loadedObject = false; + + if ( cursor->currentMatches( &md ) && !cursor->getsetdup( cursor->currLoc() ) ) { + n++; + + BSONObj holder; + BSONElementSet temp; + loadedObject = ! 
cc->getFieldsDotted( key , temp, holder ); + + for ( BSONElementSet::iterator i=temp.begin(); i!=temp.end(); ++i ) { + BSONElement e = *i; + if ( values.count( e ) ) + continue; + + int now = bb.len(); + + uassert(10044, "distinct too big, 16mb cap", ( now + e.size() + 1024 ) < bufSize ); + + arr.append( e ); + BSONElement x( start + now ); + + values.insert( x ); + } + } + + if ( loadedObject || md._loadedObject ) + nscannedObjects++; + + cursor->advance(); + + if (!cc->yieldSometimes( ClientCursor::MaybeCovered )) { + cc.release(); + break; + } + + RARELY killCurrentOp.checkForInterrupt(); + } + + assert( start == bb.buf() ); + + result.appendArray( "values" , arr.done() ); + + { + BSONObjBuilder b; + b.appendNumber( "n" , n ); + b.appendNumber( "nscanned" , nscanned ); + b.appendNumber( "nscannedObjects" , nscannedObjects ); + b.appendNumber( "timems" , t.millis() ); + b.append( "cursor" , cursorName ); + result.append( "stats" , b.obj() ); + } + + return true; + } + + } distinctCmd; + +} diff --git a/src/mongo/db/commands/document_source_cursor.cpp b/src/mongo/db/commands/document_source_cursor.cpp new file mode 100755 index 00000000000..49bb9f19d9e --- /dev/null +++ b/src/mongo/db/commands/document_source_cursor.cpp @@ -0,0 +1,100 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceCursor::~DocumentSourceCursor() {
+ }
+
+ bool DocumentSourceCursor::eof() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCursor::advance() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ findNext();
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCursor::getCurrent() {
+ /* if we haven't gotten the first one yet, do so now */
+ if (!pCurrent.get())
+ findNext();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceCursor::findNext() {
+ /* standard cursor usage pattern */
+ while(pCursor->ok()) {
+ CoveredIndexMatcher *pCIM; // save intermediate result
+ if ((!(pCIM = pCursor->matcher()) ||
+ pCIM->matchesCurrent(pCursor.get())) &&
+ !pCursor->getsetdup(pCursor->currLoc())) {
+
+ /* grab the matching document */
+ BSONObj documentObj(pCursor->current());
+ pCurrent = Document::createFromBsonObj(&documentObj);
+ pCursor->advance();
+ return;
+ }
+
+ pCursor->advance();
+ }
+
+ /* if we got here, there aren't any more documents */
+ pCurrent.reset();
+ }
+
+ void DocumentSourceCursor::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCursor::sourceToBson(BSONObjBuilder *pBuilder) const {
+ /* this has no analog in the BSON world */
+ assert(false);
+ }
+
+ DocumentSourceCursor::DocumentSourceCursor(
+ const shared_ptr<Cursor> &pTheCursor):
+ pCursor(pTheCursor),
+ pCurrent() {
+ }
+
+ intrusive_ptr<DocumentSourceCursor> DocumentSourceCursor::create(
+ const shared_ptr<Cursor> &pCursor) {
+ assert(pCursor.get());
+ intrusive_ptr<DocumentSourceCursor> pSource(
+ new DocumentSourceCursor(pCursor));
+ return pSource;
+ }
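+
+ /* illustrative consumption loop (not part of this file); the source
+ follows the eof()/getCurrent()/advance() protocol:
+
+ intrusive_ptr<DocumentSourceCursor> pSource(
+ DocumentSourceCursor::create(pCursor));
+ while(!pSource->eof()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ // feed pDocument to the next pipeline stage here
+ pSource->advance();
+ }
+ */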
+}
diff --git a/src/mongo/db/commands/find_and_modify.cpp b/src/mongo/db/commands/find_and_modify.cpp new file mode 100644 index 00000000000..0cf766fcf87 --- /dev/null +++ b/src/mongo/db/commands/find_and_modify.cpp @@ -0,0 +1,153 @@ +// find_and_modify.cpp + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../commands.h" +#include "../instance.h" +#include "../clientcursor.h" + +namespace mongo { + + /* Find and Modify an object returning either the old (default) or new value*/ + class CmdFindAndModify : public Command { + public: + virtual void help( stringstream &help ) const { + help << + "{ findAndModify: \"collection\", query: {processed:false}, update: {$set: {processed:true}}, new: true}\n" + "{ findAndModify: \"collection\", query: {processed:false}, remove: true, sort: {priority:-1}}\n" + "Either update or remove is required, all other fields have default values.\n" + "Output is in the \"value\" field\n"; + } + + CmdFindAndModify() : Command("findAndModify", false, "findandmodify") { } + virtual bool logTheOp() { return false; } // the modifications will be logged directly + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + static DBDirectClient db; + + string ns = dbname + '.' + cmdObj.firstElement().valuestr(); + + BSONObj origQuery = cmdObj.getObjectField("query"); // defaults to {} + Query q (origQuery); + BSONElement sort = cmdObj["sort"]; + if (!sort.eoo()) + q.sort(sort.embeddedObjectUserCheck()); + + bool upsert = cmdObj["upsert"].trueValue(); + + BSONObj fieldsHolder (cmdObj.getObjectField("fields")); + const BSONObj* fields = (fieldsHolder.isEmpty() ? 
NULL : &fieldsHolder); + + Projection projection; + if (fields) { + projection.init(fieldsHolder); + if (!projection.includeID()) + fields = NULL; // do projection in post-processing + } + + BSONObj out = db.findOne(ns, q, fields); + if (out.isEmpty()) { + if (!upsert) { + result.appendNull("value"); + return true; + } + + BSONElement update = cmdObj["update"]; + uassert(13329, "upsert mode requires update field", !update.eoo()); + uassert(13330, "upsert mode requires query field", !origQuery.isEmpty()); + db.update(ns, origQuery, update.embeddedObjectUserCheck(), true); + + BSONObj gle = db.getLastErrorDetailed(); + result.append("lastErrorObject", gle); + if (gle["err"].type() == String) { + errmsg = gle["err"].String(); + return false; + } + + if (cmdObj["new"].trueValue()) { + BSONElement _id = gle["upserted"]; + if (_id.eoo()) + _id = origQuery["_id"]; + + out = db.findOne(ns, QUERY("_id" << _id), fields); + } + + } + else { + + if (cmdObj["remove"].trueValue()) { + uassert(12515, "can't remove and update", cmdObj["update"].eoo()); + db.remove(ns, QUERY("_id" << out["_id"]), 1); + + BSONObj gle = db.getLastErrorDetailed(); + result.append("lastErrorObject", gle); + if (gle["err"].type() == String) { + errmsg = gle["err"].String(); + return false; + } + + } + else { // update + + BSONElement queryId = origQuery["_id"]; + if (queryId.eoo() || getGtLtOp(queryId) != BSONObj::Equality) { + // need to include original query for $ positional operator + + BSONObjBuilder b; + b.append(out["_id"]); + BSONObjIterator it(origQuery); + while (it.more()) { + BSONElement e = it.next(); + if (strcmp(e.fieldName(), "_id")) + b.append(e); + } + q = Query(b.obj()); + } + + if (q.isComplex()) // update doesn't work with complex queries + q = Query(q.getFilter().getOwned()); + + BSONElement update = cmdObj["update"]; + uassert(12516, "must specify remove or update", !update.eoo()); + db.update(ns, q, update.embeddedObjectUserCheck()); + + BSONObj gle = db.getLastErrorDetailed(); + result.append("lastErrorObject", gle); + if (gle["err"].type() == String) { + errmsg = gle["err"].String(); + return false; + } + + if (cmdObj["new"].trueValue()) + out = db.findOne(ns, QUERY("_id" << out["_id"]), fields); + } + } + + if (!fieldsHolder.isEmpty() && !fields){ + // we need to run projection but haven't yet + out = projection.transform(out); + } + + result.append("value", out); + + return true; + } + } cmdFindAndModify; + + +} diff --git a/src/mongo/db/commands/group.cpp b/src/mongo/db/commands/group.cpp new file mode 100644 index 00000000000..69fee587a47 --- /dev/null +++ b/src/mongo/db/commands/group.cpp @@ -0,0 +1,224 @@ +// group.cpp + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "../commands.h" +#include "../instance.h" +#include "../queryoptimizer.h" +#include "../../scripting/engine.h" +#include "../clientcursor.h" + +namespace mongo { + + class GroupCommand : public Command { + public: + GroupCommand() : Command("group") {} + virtual LockType locktype() const { return READ; } + virtual bool slaveOk() const { return false; } + virtual bool slaveOverrideOk() { return true; } + virtual void help( stringstream &help ) const { + help << "http://www.mongodb.org/display/DOCS/Aggregation"; + } + + BSONObj getKey( const BSONObj& obj , const BSONObj& keyPattern , ScriptingFunction func , double avgSize , Scope * s ) { + if ( func ) { + BSONObjBuilder b( obj.objsize() + 32 ); + b.append( "0" , obj ); + const BSONObj& key = b.obj(); + int res = s->invoke( func , &key, 0 ); + uassert( 10041 , (string)"invoke failed in $keyf: " + s->getError() , res == 0 ); + int type = s->type("return"); + uassert( 10042 , "return of $key has to be an object" , type == Object ); + return s->getObject( "return" ); + } + return obj.extractFields( keyPattern , true ).getOwned(); + } + + bool group( string realdbname , const string& ns , const BSONObj& query , + BSONObj keyPattern , string keyFunctionCode , string reduceCode , const char * reduceScope , + BSONObj initial , string finalize , + string& errmsg , BSONObjBuilder& result ) { + + + auto_ptr<Scope> s = globalScriptEngine->getPooledScope( realdbname ); + s->localConnect( realdbname.c_str() ); + + if ( reduceScope ) + s->init( reduceScope ); + + s->setObject( "$initial" , initial , true ); + + s->exec( "$reduce = " + reduceCode , "reduce setup" , false , true , true , 100 ); + s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 ); + ScriptingFunction f = s->createFunction( + "function(){ " + " if ( $arr[n] == null ){ " + " next = {}; " + " Object.extend( next , $key ); " + " Object.extend( next , $initial , true ); " + " $arr[n] = next; " + " next = null; " + " } " + " $reduce( obj , $arr[n] ); " + "}" ); + + ScriptingFunction keyFunction = 0; + if ( keyFunctionCode.size() ) { + keyFunction = s->createFunction( keyFunctionCode.c_str() ); + } + + + double keysize = keyPattern.objsize() * 3; + double keynum = 1; + + map<BSONObj,int,BSONObjCmp> map; + list<BSONObj> blah; + + shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor(ns.c_str() , query); + ClientCursor::CleanupPointer ccPointer; + ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) ); + + while ( cursor->ok() ) { + + if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered ) || + !cursor->ok() ) { + break; + } + + if ( !cursor->currentMatches() || cursor->getsetdup( cursor->currLoc() ) ) { + cursor->advance(); + continue; + } + + if ( !ccPointer->yieldSometimes( ClientCursor::WillNeed ) || + !cursor->ok() ) { + break; + } + + BSONObj obj = cursor->current(); + cursor->advance(); + + BSONObj key = getKey( obj , keyPattern , keyFunction , keysize / keynum , s.get() ); + keysize += key.objsize(); + keynum++; + + int& n = map[key]; + if ( n == 0 ) { + n = map.size(); + s->setObject( "$key" , key , true ); + + uassert( 10043 , "group() can't handle more than 20000 unique keys" , n <= 20000 ); + } + + s->setObject( "obj" , obj , true ); + s->setNumber( "n" , n - 1 ); + if ( s->invoke( f , 0, 0 , 0 , true ) ) { + throw UserException( 9010 , (string)"reduce invoke failed: " + s->getError() ); + } + } + ccPointer.reset(); + + if (!finalize.empty()) { + s->exec( "$finalize = " + finalize , "finalize 
define" , false , true , true , 100 ); + ScriptingFunction g = s->createFunction( + "function(){ " + " for(var i=0; i < $arr.length; i++){ " + " var ret = $finalize($arr[i]); " + " if (ret !== undefined) " + " $arr[i] = ret; " + " } " + "}" ); + s->invoke( g , 0, 0 , 0 , true ); + } + + result.appendArray( "retval" , s->getObject( "$arr" ) ); + result.append( "count" , keynum - 1 ); + result.append( "keys" , (int)(map.size()) ); + s->exec( "$arr = [];" , "reduce setup 2" , false , true , true , 100 ); + s->gc(); + + return true; + } + + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + + if ( !globalScriptEngine ) { + errmsg = "server-side JavaScript execution is disabled"; + return false; + } + + /* db.$cmd.findOne( { group : <p> } ) */ + const BSONObj& p = jsobj.firstElement().embeddedObjectUserCheck(); + + BSONObj q; + if ( p["cond"].type() == Object ) + q = p["cond"].embeddedObject(); + else if ( p["condition"].type() == Object ) + q = p["condition"].embeddedObject(); + else + q = getQuery( p ); + + if ( p["ns"].type() != String ) { + errmsg = "ns has to be set"; + return false; + } + + string ns = dbname + "." + p["ns"].String(); + + BSONObj key; + string keyf; + if ( p["key"].type() == Object ) { + key = p["key"].embeddedObjectUserCheck(); + if ( ! p["$keyf"].eoo() ) { + errmsg = "can't have key and $keyf"; + return false; + } + } + else if ( p["$keyf"].type() ) { + keyf = p["$keyf"]._asCode(); + } + else { + // no key specified, will use entire object as key + } + + BSONElement reduce = p["$reduce"]; + if ( reduce.eoo() ) { + errmsg = "$reduce has to be set"; + return false; + } + + BSONElement initial = p["initial"]; + if ( initial.type() != Object ) { + errmsg = "initial has to be an object"; + return false; + } + + + string finalize; + if (p["finalize"].type()) + finalize = p["finalize"]._asCode(); + + return group( dbname , ns , q , + key , keyf , reduce._asCode() , reduce.type() != CodeWScope ? 0 : reduce.codeWScopeScopeData() , + initial.embeddedObject() , finalize , + errmsg , result ); + } + + } cmdGroup; + + +} // namespace mongo diff --git a/src/mongo/db/commands/isself.cpp b/src/mongo/db/commands/isself.cpp new file mode 100644 index 00000000000..ebf6d5bceec --- /dev/null +++ b/src/mongo/db/commands/isself.cpp @@ -0,0 +1,246 @@ +// isself.cpp + +#include "pch.h" +#include "../../util/net/listen.h" +#include "../commands.h" +#include "../../client/dbclient.h" +#include "../security.h" + +#include <boost/algorithm/string.hpp> + +#ifndef _WIN32 +# ifndef __sunos__ +# include <ifaddrs.h> +# endif +# include <sys/resource.h> +# include <sys/stat.h> + +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> +#include <errno.h> +#include <netdb.h> +#ifdef __openbsd__ +# include <sys/uio.h> +#endif + +#endif + + +namespace mongo { + +#if !defined(_WIN32) && !defined(__sunos__) + + vector<string> getMyAddrs() { + vector<string> out; + ifaddrs * addrs; + + if ( ! 
cmdLine.bind_ip.empty() ) { + boost::split( out, cmdLine.bind_ip, boost::is_any_of( ", " ) ); + return out; + } + + int status = getifaddrs(&addrs); + massert(13469, "getifaddrs failure: " + errnoWithDescription(errno), status == 0); + + // based on example code from linux getifaddrs manpage + for (ifaddrs * addr = addrs; addr != NULL; addr = addr->ifa_next) { + if ( addr->ifa_addr == NULL ) continue; + int family = addr->ifa_addr->sa_family; + char host[NI_MAXHOST]; + + if (family == AF_INET || family == AF_INET6) { + status = getnameinfo(addr->ifa_addr, + (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)), + host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + if ( status != 0 ) { + freeifaddrs( addrs ); + addrs = NULL; + msgasserted( 13470, string("getnameinfo() failed: ") + gai_strerror(status) ); + } + + out.push_back(host); + } + + } + + freeifaddrs( addrs ); + addrs = NULL; + + if (logLevel >= 1) { + log(1) << "getMyAddrs():"; + for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) { + log(1) << " [" << *it << ']'; + } + log(1) << endl; + } + + return out; + } + + vector<string> getAllIPs(StringData iporhost) { + addrinfo* addrs = NULL; + addrinfo hints; + memset(&hints, 0, sizeof(addrinfo)); + hints.ai_socktype = SOCK_STREAM; + hints.ai_family = (IPv6Enabled() ? AF_UNSPEC : AF_INET); + + static string portNum = BSONObjBuilder::numStr(cmdLine.port); + + vector<string> out; + + int ret = getaddrinfo(iporhost.data(), portNum.c_str(), &hints, &addrs); + if ( ret ) { + warning() << "getaddrinfo(\"" << iporhost.data() << "\") failed: " << gai_strerror(ret) << endl; + return out; + } + + for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) { + int family = addr->ai_family; + char host[NI_MAXHOST]; + + if (family == AF_INET || family == AF_INET6) { + int status = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + + massert(13472, string("getnameinfo() failed: ") + gai_strerror(status), status == 0); + + out.push_back(host); + } + + } + + freeaddrinfo(addrs); + + if (logLevel >= 1) { + log(1) << "getallIPs(\"" << iporhost << "\"):"; + for (vector<string>::const_iterator it=out.begin(), end=out.end(); it!=end; ++it) { + log(1) << " [" << *it << ']'; + } + log(1) << endl; + } + + return out; + } +#endif + + + class IsSelfCommand : public Command { + public: + IsSelfCommand() : Command("_isSelf") , _cacheLock( "IsSelfCommand::_cacheLock" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "{ _isSelf : 1 } INTERNAL ONLY"; + } + + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + init(); + result.append( "id" , _id ); + return true; + } + + void init() { + scoped_lock lk( _cacheLock ); + if ( ! _id.isSet() ) + _id.init(); + } + + OID _id; + + mongo::mutex _cacheLock; + map<string,bool> _cache; + } isSelfCommand; + + bool HostAndPort::isSelf() const { + + if( dyn() ) { + LOG(2) << "isSelf " << _dynName << ' ' << dynHostMyName() << endl; + return dynHostMyName() == _dynName; + } + + int _p = port(); + int p = _p == -1 ? 
CmdLine::DefaultDBPort : _p; + + if( p != cmdLine.port ) { + // shortcut - ports have to match at the very least + return false; + } + + string host = str::stream() << this->host() << ":" << p; + + { + // check cache for this host + // debatably something _could_ change, but I'm not sure right now (erh 10/14/2010) + scoped_lock lk( isSelfCommand._cacheLock ); + map<string,bool>::const_iterator i = isSelfCommand._cache.find( host ); + if ( i != isSelfCommand._cache.end() ) + return i->second; + } + +#if !defined(_WIN32) && !defined(__sunos__) + // on linux and os x we can do a quick check for an ip match + + const vector<string> myaddrs = getMyAddrs(); + const vector<string> addrs = getAllIPs(_host); + + for (vector<string>::const_iterator i=myaddrs.begin(), iend=myaddrs.end(); i!=iend; ++i) { + for (vector<string>::const_iterator j=addrs.begin(), jend=addrs.end(); j!=jend; ++j) { + string a = *i; + string b = *j; + + if ( a == b || + ( str::startsWith( a , "127." ) && str::startsWith( b , "127." ) ) // 127. is all loopback + ) { + + // add to cache + scoped_lock lk( isSelfCommand._cacheLock ); + isSelfCommand._cache[host] = true; + return true; + } + } + } + +#endif + + if ( ! Listener::getTimeTracker() ) { + // this ensures we are actually running a server + // this may return true later, so may want to retry + return false; + } + + try { + isSelfCommand.init(); + DBClientConnection conn; + string errmsg; + if ( ! conn.connect( host , errmsg ) ) { + // should this go in the cache? + return false; + } + + if (!noauth && cmdLine.keyFile && + !conn.auth("local", internalSecurity.user, internalSecurity.pwd, errmsg, false)) { + return false; + } + + BSONObj out; + bool ok = conn.simpleCommand( "admin" , &out , "_isSelf" ); + bool me = ok && out["id"].type() == jstOID && isSelfCommand._id == out["id"].OID(); + + // add to cache + scoped_lock lk( isSelfCommand._cacheLock ); + isSelfCommand._cache[host] = me; + + return me; + } + catch ( std::exception& e ) { + warning() << "could't check isSelf (" << host << ") " << e.what() << endl; + } + + return false; + } + +} diff --git a/src/mongo/db/commands/mr.cpp b/src/mongo/db/commands/mr.cpp new file mode 100644 index 00000000000..add76c39c47 --- /dev/null +++ b/src/mongo/db/commands/mr.cpp @@ -0,0 +1,1317 @@ +// mr.cpp + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "pch.h" +#include "../db.h" +#include "../instance.h" +#include "../commands.h" +#include "../../scripting/engine.h" +#include "../../client/dbclient.h" +#include "../../client/connpool.h" +#include "../../client/parallel.h" +#include "../queryoptimizer.h" +#include "../matcher.h" +#include "../clientcursor.h" +#include "../replutil.h" +#include "../../s/d_chunk_manager.h" +#include "../../s/d_logic.h" +#include "../../s/grid.h" + +#include "mr.h" + +namespace mongo { + + namespace mr { + + AtomicUInt Config::JOB_NUMBER; + + JSFunction::JSFunction( string type , const BSONElement& e ) { + _type = type; + _code = e._asCode(); + + if ( e.type() == CodeWScope ) + _wantedScope = e.codeWScopeObject(); + } + + void JSFunction::init( State * state ) { + _scope = state->scope(); + assert( _scope ); + _scope->init( &_wantedScope ); + + _func = _scope->createFunction( _code.c_str() ); + uassert( 13598 , str::stream() << "couldn't compile code for: " << _type , _func ); + + // install in JS scope so that it can be called in JS mode + _scope->setFunction(_type.c_str(), _code.c_str()); + } + + void JSMapper::init( State * state ) { + _func.init( state ); + _params = state->config().mapParams; + } + + /** + * Applies the map function to an object, which should internally call emit() + */ + void JSMapper::map( const BSONObj& o ) { + Scope * s = _func.scope(); + assert( s ); + if ( s->invoke( _func.func() , &_params, &o , 0 , true, false, true ) ) + throw UserException( 9014, str::stream() << "map invoke failed: " + s->getError() ); + } + + /** + * Applies the finalize function to a tuple obj (key, val) + * Returns tuple obj {_id: key, value: newval} + */ + BSONObj JSFinalizer::finalize( const BSONObj& o ) { + Scope * s = _func.scope(); + + Scope::NoDBAccess no = s->disableDBAccess( "can't access db inside finalize" ); + s->invokeSafe( _func.func() , &o, 0 ); + + // don't want to use o.objsize() to size b + // since there are many cases where the point of finalize + // is converting many fields to 1 + BSONObjBuilder b; + b.append( o.firstElement() ); + s->append( b , "value" , "return" ); + return b.obj(); + } + + void JSReducer::init( State * state ) { + _func.init( state ); + } + + /** + * Reduces a list of tuple objects (key, value) to a single tuple {"0": key, "1": value} + */ + BSONObj JSReducer::reduce( const BSONList& tuples ) { + if (tuples.size() <= 1) + return tuples[0]; + BSONObj key; + int endSizeEstimate = 16; + _reduce( tuples , key , endSizeEstimate ); + + BSONObjBuilder b(endSizeEstimate); + b.appendAs( key.firstElement() , "0" ); + _func.scope()->append( b , "1" , "return" ); + return b.obj(); + } + + /** + * Reduces a list of tuple object (key, value) to a single tuple {_id: key, value: val} + * Also applies a finalizer method if present. 
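+     * For example (illustrative, not from the original source): reducing the
+     * tuples ("a", 1) and ("a", 2) with a summing reduce function yields
+     * {_id: "a", value: 3}, which is then run through the finalizer if one
+     * is present.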
+ */ + BSONObj JSReducer::finalReduce( const BSONList& tuples , Finalizer * finalizer ) { + + BSONObj res; + BSONObj key; + + if (tuples.size() == 1) { + // 1 obj, just use it + key = tuples[0]; + BSONObjBuilder b(key.objsize()); + BSONObjIterator it(key); + b.appendAs( it.next() , "_id" ); + b.appendAs( it.next() , "value" ); + res = b.obj(); + } + else { + // need to reduce + int endSizeEstimate = 16; + _reduce( tuples , key , endSizeEstimate ); + BSONObjBuilder b(endSizeEstimate); + b.appendAs( key.firstElement() , "_id" ); + _func.scope()->append( b , "value" , "return" ); + res = b.obj(); + } + + if ( finalizer ) { + res = finalizer->finalize( res ); + } + + return res; + } + + /** + * actually applies a reduce, to a list of tuples (key, value). + * After the call, tuples will hold a single tuple {"0": key, "1": value} + */ + void JSReducer::_reduce( const BSONList& tuples , BSONObj& key , int& endSizeEstimate ) { + uassert( 10074 , "need values" , tuples.size() ); + + int sizeEstimate = ( tuples.size() * tuples.begin()->getField( "value" ).size() ) + 128; + + // need to build the reduce args: ( key, [values] ) + BSONObjBuilder reduceArgs( sizeEstimate ); + boost::scoped_ptr<BSONArrayBuilder> valueBuilder; + int sizeSoFar = 0; + unsigned n = 0; + for ( ; n<tuples.size(); n++ ) { + BSONObjIterator j(tuples[n]); + BSONElement keyE = j.next(); + if ( n == 0 ) { + reduceArgs.append( keyE ); + key = keyE.wrap(); + sizeSoFar = 5 + keyE.size(); + valueBuilder.reset(new BSONArrayBuilder( reduceArgs.subarrayStart( "tuples" ) )); + } + + BSONElement ee = j.next(); + + uassert( 13070 , "value too large to reduce" , ee.size() < ( BSONObjMaxUserSize / 2 ) ); + + if ( sizeSoFar + ee.size() > BSONObjMaxUserSize ) { + assert( n > 1 ); // if not, inf. loop + break; + } + + valueBuilder->append( ee ); + sizeSoFar += ee.size(); + } + assert(valueBuilder); + valueBuilder->done(); + BSONObj args = reduceArgs.obj(); + + Scope * s = _func.scope(); + + s->invokeSafe( _func.func() , &args, 0, 0, false, true, true ); + ++numReduces; + + if ( s->type( "return" ) == Array ) { + uasserted( 10075 , "reduce -> multiple not supported yet"); + return; + } + + endSizeEstimate = key.objsize() + ( args.objsize() / tuples.size() ); + + if ( n == tuples.size() ) + return; + + // the input list was too large, add the rest of elmts to new tuples and reduce again + // note: would be better to use loop instead of recursion to avoid stack overflow + BSONList x; + for ( ; n < tuples.size(); n++ ) { + x.push_back( tuples[n] ); + } + BSONObjBuilder temp( endSizeEstimate ); + temp.append( key.firstElement() ); + s->append( temp , "1" , "return" ); + x.push_back( temp.obj() ); + _reduce( x , key , endSizeEstimate ); + } + + Config::Config( const string& _dbname , const BSONObj& cmdObj ) { + + dbname = _dbname; + ns = dbname + "." 
+ cmdObj.firstElement().valuestr(); + + verbose = cmdObj["verbose"].trueValue(); + jsMode = cmdObj["jsMode"].trueValue(); + splitInfo = 0; + if (cmdObj.hasField("splitInfo")) + splitInfo = cmdObj["splitInfo"].Int(); + + jsMaxKeys = 500000; + reduceTriggerRatio = 10.0; + maxInMemSize = 500 * 1024; + + uassert( 13602 , "outType is no longer a valid option" , cmdObj["outType"].eoo() ); + + if ( cmdObj["out"].type() == String ) { + finalShort = cmdObj["out"].String(); + outType = REPLACE; + } + else if ( cmdObj["out"].type() == Object ) { + BSONObj o = cmdObj["out"].embeddedObject(); + + BSONElement e = o.firstElement(); + string t = e.fieldName(); + + if ( t == "normal" || t == "replace" ) { + outType = REPLACE; + finalShort = e.String(); + } + else if ( t == "merge" ) { + outType = MERGE; + finalShort = e.String(); + } + else if ( t == "reduce" ) { + outType = REDUCE; + finalShort = e.String(); + } + else if ( t == "inline" ) { + outType = INMEMORY; + } + else { + uasserted( 13522 , str::stream() << "unknown out specifier [" << t << "]" ); + } + + if (o.hasElement("db")) { + outDB = o["db"].String(); + } + + if (o.hasElement("nonAtomic")) { + outNonAtomic = o["nonAtomic"].Bool(); + if (outNonAtomic) + uassert( 15895 , "nonAtomic option cannot be used with this output type", (outType == REDUCE || outType == MERGE) ); + } + } + else { + uasserted( 13606 , "'out' has to be a string or an object" ); + } + + if ( outType != INMEMORY ) { // setup names + tempLong = str::stream() << (outDB.empty() ? dbname : outDB) << ".tmp.mr." << cmdObj.firstElement().String() << "_" << JOB_NUMBER++; + + incLong = tempLong + "_inc"; + + finalLong = str::stream() << (outDB.empty() ? dbname : outDB) << "." << finalShort; + } + + { + // scope and code + + if ( cmdObj["scope"].type() == Object ) + scopeSetup = cmdObj["scope"].embeddedObjectUserCheck(); + + mapper.reset( new JSMapper( cmdObj["map"] ) ); + reducer.reset( new JSReducer( cmdObj["reduce"] ) ); + if ( cmdObj["finalize"].type() && cmdObj["finalize"].trueValue() ) + finalizer.reset( new JSFinalizer( cmdObj["finalize"] ) ); + + if ( cmdObj["mapparams"].type() == Array ) { + mapParams = cmdObj["mapparams"].embeddedObjectUserCheck(); + } + + } + + { + // query options + BSONElement q = cmdObj["query"]; + if ( q.type() == Object ) + filter = q.embeddedObjectUserCheck(); + else + uassert( 13608 , "query has to be blank or an Object" , ! q.trueValue() ); + + + BSONElement s = cmdObj["sort"]; + if ( s.type() == Object ) + sort = s.embeddedObjectUserCheck(); + else + uassert( 13609 , "sort has to be blank or an Object" , ! s.trueValue() ); + + if ( cmdObj["limit"].isNumber() ) + limit = cmdObj["limit"].numberLong(); + else + limit = 0; + } + } + + /** + * Create temporary collection, set up indexes + */ + void State::prepTempCollection() { + if ( ! _onDisk ) + return; + + if (_config.incLong != _config.tempLong) { + // create the inc collection and make sure we have index on "0" key + _db.dropCollection( _config.incLong ); + { + writelock l( _config.incLong ); + Client::Context ctx( _config.incLong ); + string err; + if ( ! 
userCreateNS( _config.incLong.c_str() , BSON( "autoIndexId" << 0 ) , err , false ) ) { + uasserted( 13631 , str::stream() << "userCreateNS failed for mr incLong ns: " << _config.incLong << " err: " << err ); + } + } + + BSONObj sortKey = BSON( "0" << 1 ); + _db.ensureIndex( _config.incLong , sortKey ); + } + + // create temp collection + _db.dropCollection( _config.tempLong ); + { + writelock lock( _config.tempLong.c_str() ); + Client::Context ctx( _config.tempLong.c_str() ); + string errmsg; + if ( ! userCreateNS( _config.tempLong.c_str() , BSONObj() , errmsg , true ) ) { + uasserted( 13630 , str::stream() << "userCreateNS failed for mr tempLong ns: " << _config.tempLong << " err: " << errmsg ); + } + } + + { + // copy indexes + auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.finalLong ); + while ( idx->more() ) { + BSONObj i = idx->next(); + + BSONObjBuilder b( i.objsize() + 16 ); + b.append( "ns" , _config.tempLong ); + BSONObjIterator j( i ); + while ( j.more() ) { + BSONElement e = j.next(); + if ( str::equals( e.fieldName() , "_id" ) || + str::equals( e.fieldName() , "ns" ) ) + continue; + + b.append( e ); + } + + BSONObj indexToInsert = b.obj(); + insert( Namespace( _config.tempLong.c_str() ).getSisterNS( "system.indexes" ).c_str() , indexToInsert ); + } + + } + + } + + /** + * For inline mode, appends results to output object. + * Makes sure (key, value) tuple is formatted as {_id: key, value: val} + */ + void State::appendResults( BSONObjBuilder& final ) { + if ( _onDisk ) { + if (!_config.outDB.empty()) { + BSONObjBuilder loc; + if ( !_config.outDB.empty()) + loc.append( "db" , _config.outDB ); + if ( !_config.finalShort.empty() ) + loc.append( "collection" , _config.finalShort ); + final.append("result", loc.obj()); + } + else { + if ( !_config.finalShort.empty() ) + final.append( "result" , _config.finalShort ); + } + + if ( _config.splitInfo > 0 ) { + // add split points, used for shard + BSONObj res; + BSONObj idKey = BSON( "_id" << 1 ); + if ( ! _db.runCommand( "admin" , BSON( "splitVector" << _config.finalLong << "keyPattern" << idKey << "maxChunkSizeBytes" << _config.splitInfo ) , res ) ) { + uasserted( 15921 , str::stream() << "splitVector failed: " << res ); + } + if ( res.hasField( "splitKeys" ) ) + final.append( res.getField( "splitKeys" ) ); + } + return; + } + + if (_jsMode) { + ScriptingFunction getResult = _scope->createFunction("var map = _mrMap; var result = []; for (key in map) { result.push({_id: key, value: map[key]}) } return result;"); + _scope->invoke(getResult, 0, 0, 0, false); + BSONObj obj = _scope->getObject("return"); + final.append("results", BSONArray(obj)); + return; + } + + uassert( 13604 , "too much data for in memory map/reduce" , _size < BSONObjMaxUserSize ); + + BSONArrayBuilder b( (int)(_size * 1.2) ); // _size is data size, doesn't count overhead and keys + + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { + BSONObj key = i->first; + BSONList& all = i->second; + + assert( all.size() == 1 ); + + BSONObjIterator vi( all[0] ); + vi.next(); + + BSONObjBuilder temp( b.subobjStart() ); + temp.appendAs( key.firstElement() , "_id" ); + temp.appendAs( vi.next() , "value" ); + temp.done(); + } + + BSONArray res = b.arr(); + final.append( "results" , res ); + } + + /** + * Does post processing on output collection. + * This may involve replacing, merging or reducing. 
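+     * Summarizing the code below: REPLACE renames the temp collection over
+     * the target (dropping the old data), MERGE upserts each temp document
+     * into the target, and REDUCE re-reduces each (new, old) pair whose _id
+     * already exists in the target before upserting the result.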
+ */ + long long State::postProcessCollection(CurOp* op, ProgressMeterHolder& pm) { + if ( _onDisk == false || _config.outType == Config::INMEMORY ) + return _temp->size(); + + if (_config.outNonAtomic) + return postProcessCollectionNonAtomic(op, pm); + writelock lock; + return postProcessCollectionNonAtomic(op, pm); + } + + long long State::postProcessCollectionNonAtomic(CurOp* op, ProgressMeterHolder& pm) { + + if ( _config.finalLong == _config.tempLong ) + return _db.count( _config.finalLong ); + + if ( _config.outType == Config::REPLACE || _db.count( _config.finalLong ) == 0 ) { + writelock lock; + // replace: just rename from temp to final collection name, dropping previous collection + _db.dropCollection( _config.finalLong ); + BSONObj info; + if ( ! _db.runCommand( "admin" , BSON( "renameCollection" << _config.tempLong << "to" << _config.finalLong ) , info ) ) { + uasserted( 10076 , str::stream() << "rename failed: " << info ); + } + + _db.dropCollection( _config.tempLong ); + } + else if ( _config.outType == Config::MERGE ) { + // merge: upsert new docs into old collection + op->setMessage( "m/r: merge post processing" , _db.count( _config.tempLong, BSONObj() ) ); + auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() ); + while ( cursor->more() ) { + writelock lock; + BSONObj o = cursor->next(); + Helpers::upsert( _config.finalLong , o ); + getDur().commitIfNeeded(); + pm.hit(); + } + _db.dropCollection( _config.tempLong ); + pm.finished(); + } + else if ( _config.outType == Config::REDUCE ) { + // reduce: apply reduce op on new result and existing one + BSONList values; + + op->setMessage( "m/r: reduce post processing" , _db.count( _config.tempLong, BSONObj() ) ); + auto_ptr<DBClientCursor> cursor = _db.query( _config.tempLong , BSONObj() ); + while ( cursor->more() ) { + writelock lock; + BSONObj temp = cursor->next(); + BSONObj old; + + bool found; + { + Client::Context tx( _config.finalLong ); + found = Helpers::findOne( _config.finalLong.c_str() , temp["_id"].wrap() , old , true ); + } + + if ( found ) { + // need to reduce + values.clear(); + values.push_back( temp ); + values.push_back( old ); + Helpers::upsert( _config.finalLong , _config.reducer->finalReduce( values , _config.finalizer.get() ) ); + } + else { + Helpers::upsert( _config.finalLong , temp ); + } + getDur().commitIfNeeded(); + pm.hit(); + } + _db.dropCollection( _config.tempLong ); + pm.finished(); + } + + return _db.count( _config.finalLong ); + } + + /** + * Insert doc in collection + */ + void State::insert( const string& ns , const BSONObj& o ) { + assert( _onDisk ); + + writelock l( ns ); + Client::Context ctx( ns ); + + theDataFileMgr.insertAndLog( ns.c_str() , o , false ); + } + + /** + * Insert doc into the inc collection, taking proper lock + */ + void State::insertToInc( BSONObj& o ) { + writelock l(_config.incLong); + Client::Context ctx(_config.incLong); + _insertToInc(o); + } + + /** + * Insert doc into the inc collection + */ + void State::_insertToInc( BSONObj& o ) { + assert( _onDisk ); + theDataFileMgr.insertWithObjMod( _config.incLong.c_str() , o , true ); + getDur().commitIfNeeded(); + } + + State::State( const Config& c ) : _config( c ), _size(0), _dupCount(0), _numEmits(0) { + _temp.reset( new InMemory() ); + _onDisk = _config.outType != Config::INMEMORY; + } + + bool State::sourceExists() { + return _db.exists( _config.ns ); + } + + long long State::incomingDocuments() { + return _db.count( _config.ns , _config.filter , QueryOption_SlaveOk , (unsigned) _config.limit 
); + } + + State::~State() { + if ( _onDisk ) { + try { + _db.dropCollection( _config.tempLong ); + _db.dropCollection( _config.incLong ); + } + catch ( std::exception& e ) { + error() << "couldn't cleanup after map reduce: " << e.what() << endl; + } + } + + if (_scope) { + // cleanup js objects + ScriptingFunction cleanup = _scope->createFunction("delete _emitCt; delete _keyCt; delete _mrMap;"); + _scope->invoke(cleanup, 0, 0, 0, true); + } + } + + /** + * Initialize the mapreduce operation, creating the inc collection + */ + void State::init() { + // setup js + _scope.reset(globalScriptEngine->getPooledScope( _config.dbname ).release() ); + _scope->localConnect( _config.dbname.c_str() ); + + if ( ! _config.scopeSetup.isEmpty() ) + _scope->init( &_config.scopeSetup ); + + _config.mapper->init( this ); + _config.reducer->init( this ); + if ( _config.finalizer ) + _config.finalizer->init( this ); + _scope->setBoolean("_doFinal", _config.finalizer); + + // by default start in JS mode, will be faster for small jobs + _jsMode = _config.jsMode; +// _jsMode = true; + switchMode(_jsMode); + + // global JS map/reduce hashmap + // we use a standard JS object which means keys are only simple types + // we could also add a real hashmap from a library, still we need to add object comparison methods +// _scope->setObject("_mrMap", BSONObj(), false); + ScriptingFunction init = _scope->createFunction("_emitCt = 0; _keyCt = 0; _dupCt = 0; _redCt = 0; if (typeof(_mrMap) === 'undefined') { _mrMap = {}; }"); + _scope->invoke(init, 0, 0, 0, true); + + // js function to run reduce on all keys +// redfunc = _scope->createFunction("for (var key in hashmap) { print('Key is ' + key); list = hashmap[key]; ret = reduce(key, list); print('Value is ' + ret); };"); + _reduceAll = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length != 1) { ret = _reduce(key, list); map[key] = [ret]; ++_redCt; } } _dupCt = 0;"); + _reduceAndEmit = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; } emit(key, ret); }; delete _mrMap;"); + _reduceAndFinalize = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { if (!_doFinal) {continue;} ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } map[key] = ret; }"); + _reduceAndFinalizeAndInsert = _scope->createFunction("var map = _mrMap; var list, ret; for (var key in map) { list = map[key]; if (list.length == 1) { ret = list[0]; } else { ret = _reduce(key, list); ++_redCt; }; if (_doFinal){ ret = _finalize(key, ret); } _nativeToTemp({_id: key, value: ret}); }"); + + } + + void State::switchMode(bool jsMode) { + _jsMode = jsMode; + if (jsMode) { + // emit function that stays in JS + _scope->setFunction("emit", "function(key, value) { if (typeof(key) === 'object') { _bailFromJS(key, value); return; }; ++_emitCt; var map = _mrMap; var list = map[key]; if (!list) { ++_keyCt; list = []; map[key] = list; } else { ++_dupCt; } list.push(value); }"); + _scope->injectNative("_bailFromJS", _bailFromJS, this); + } + else { + // emit now populates C++ map + _scope->injectNative( "emit" , fast_emit, this ); + } + } + + void State::bailFromJS() { + log(1) << "M/R: Switching from JS mode to mixed mode" << endl; + + // reduce and reemit into c++ + switchMode(false); + _scope->invoke(_reduceAndEmit, 0, 
0, 0, true); + // need to get the real number emitted so far + _numEmits = _scope->getNumberInt("_emitCt"); + _config.reducer->numReduces = _scope->getNumberInt("_redCt"); + } + + /** + * Applies last reduce and finalize on a list of tuples (key, val) + * Inserts single result {_id: key, value: val} into temp collection + */ + void State::finalReduce( BSONList& values ) { + if ( !_onDisk || values.size() == 0 ) + return; + + BSONObj res = _config.reducer->finalReduce( values , _config.finalizer.get() ); + insert( _config.tempLong , res ); + } + + BSONObj _nativeToTemp( const BSONObj& args, void* data ) { + State* state = (State*) data; + BSONObjIterator it(args); + state->insert(state->_config.tempLong, it.next().Obj()); + return BSONObj(); + } + +// BSONObj _nativeToInc( const BSONObj& args, void* data ) { +// State* state = (State*) data; +// BSONObjIterator it(args); +// const BSONObj& obj = it.next().Obj(); +// state->_insertToInc(const_cast<BSONObj&>(obj)); +// return BSONObj(); +// } + + /** + * Applies last reduce and finalize. + * After calling this method, the temp collection will be completed. + * If inline, the results will be in the in memory map + */ + void State::finalReduce( CurOp * op , ProgressMeterHolder& pm ) { + + if (_jsMode) { + // apply the reduce within JS + if (_onDisk) { + _scope->injectNative("_nativeToTemp", _nativeToTemp, this); + _scope->invoke(_reduceAndFinalizeAndInsert, 0, 0, 0, true); + return; + } + else { + _scope->invoke(_reduceAndFinalize, 0, 0, 0, true); + return; + } + } + + if ( ! _onDisk ) { + // all data has already been reduced, just finalize + if ( _config.finalizer ) { + long size = 0; + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { + BSONObj key = i->first; + BSONList& all = i->second; + + assert( all.size() == 1 ); + + BSONObj res = _config.finalizer->finalize( all[0] ); + + all.clear(); + all.push_back( res ); + size += res.objsize(); + } + _size = size; + } + return; + } + + // use index on "0" to pull sorted data + assert( _temp->size() == 0 ); + BSONObj sortKey = BSON( "0" << 1 ); + { + bool foundIndex = false; + + auto_ptr<DBClientCursor> idx = _db.getIndexes( _config.incLong ); + while ( idx.get() && idx->more() ) { + BSONObj x = idx->next(); + if ( sortKey.woCompare( x["key"].embeddedObject() ) == 0 ) { + foundIndex = true; + break; + } + } + + assert( foundIndex ); + } + + readlock rl( _config.incLong.c_str() ); + Client::Context ctx( _config.incLong ); + + BSONObj prev; + BSONList all; + + assert( pm == op->setMessage( "m/r: (3/3) final reduce to collection" , _db.count( _config.incLong, BSONObj(), QueryOption_SlaveOk ) ) ); + + shared_ptr<Cursor> temp = bestGuessCursor( _config.incLong.c_str() , BSONObj() , sortKey ); + auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , _config.incLong.c_str() ) ); + + // iterate over all sorted objects + while ( cursor->ok() ) { + BSONObj o = cursor->current().getOwned(); + cursor->advance(); + + pm.hit(); + + if ( o.woSortOrder( prev , sortKey ) == 0 ) { + // object is same as previous, add to array + all.push_back( o ); + if ( pm->hits() % 1000 == 0 ) { + if ( ! cursor->yield() ) { + cursor.release(); + break; + } + killCurrentOp.checkForInterrupt(); + } + continue; + } + + ClientCursor::YieldLock yield (cursor.get()); + + try { + // reduce a finalize array + finalReduce( all ); + } + catch (...) { + yield.relock(); + cursor.release(); + throw; + } + + all.clear(); + prev = o; + all.push_back( o ); + + if ( ! 
yield.stillOk() ) { + cursor.release(); + break; + } + + killCurrentOp.checkForInterrupt(); + } + + // we need to release here since we temp release below + cursor.release(); + + { + dbtempreleasecond tl; + if ( ! tl.unlocked() ) + log( LL_WARNING ) << "map/reduce can't temp release" << endl; + // reduce and finalize last array + finalReduce( all ); + } + + pm.finished(); + } + + /** + * Attempts to reduce objects in the memory map. + * A new memory map will be created to hold the results. + * If applicable, objects with unique key may be dumped to inc collection. + * Input and output objects are both {"0": key, "1": val} + */ + void State::reduceInMemory() { + + if (_jsMode) { + // in js mode the reduce is applied when writing to collection + return; + } + + auto_ptr<InMemory> n( new InMemory() ); // for new data + long nSize = 0; + _dupCount = 0; + + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); ++i ) { + BSONObj key = i->first; + BSONList& all = i->second; + + if ( all.size() == 1 ) { + // only 1 value for this key + if ( _onDisk ) { + // this key has low cardinality, so just write to collection + writelock l(_config.incLong); + Client::Context ctx(_config.incLong.c_str()); + _insertToInc( *(all.begin()) ); + } + else { + // add to new map + _add( n.get() , all[0] , nSize ); + } + } + else if ( all.size() > 1 ) { + // several values, reduce and add to map + BSONObj res = _config.reducer->reduce( all ); + _add( n.get() , res , nSize ); + } + } + + // swap maps + _temp.reset( n.release() ); + _size = nSize; + } + + /** + * Dumps the entire in memory map to the inc collection. + */ + void State::dumpToInc() { + if ( ! _onDisk ) + return; + + writelock l(_config.incLong); + Client::Context ctx(_config.incLong); + + for ( InMemory::iterator i=_temp->begin(); i!=_temp->end(); i++ ) { + BSONList& all = i->second; + if ( all.size() < 1 ) + continue; + + for ( BSONList::iterator j=all.begin(); j!=all.end(); j++ ) + _insertToInc( *j ); + } + _temp->clear(); + _size = 0; + + } + + /** + * Adds object to in memory map + */ + void State::emit( const BSONObj& a ) { + _numEmits++; + _add( _temp.get() , a , _size ); + } + + void State::_add( InMemory* im, const BSONObj& a , long& size ) { + BSONList& all = (*im)[a]; + all.push_back( a ); + size += a.objsize() + 16; + if (all.size() > 1) + ++_dupCount; + } + + /** + * this method checks the size of in memory map and potentially flushes to disk + */ + void State::checkSize() { + if (_jsMode) { + // try to reduce if it is beneficial + int dupCt = _scope->getNumberInt("_dupCt"); + int keyCt = _scope->getNumberInt("_keyCt"); + + if (keyCt > _config.jsMaxKeys) { + // too many keys for JS, switch to mixed + _bailFromJS(BSONObj(), this); + // then fall through to check map size + } + else if (dupCt > (keyCt * _config.reduceTriggerRatio)) { + // reduce now to lower mem usage + Timer t; + _scope->invoke(_reduceAll, 0, 0, 0, true); + log(1) << " MR - did reduceAll: keys=" << keyCt << " dups=" << dupCt << " newKeys=" << _scope->getNumberInt("_keyCt") << " time=" << t.millis() << "ms" << endl; + return; + } + } + + if (_jsMode) + return; + + if (_size > _config.maxInMemSize || _dupCount > (_temp->size() * _config.reduceTriggerRatio)) { + // attempt to reduce in memory map, if memory is too high or we have many duplicates + long oldSize = _size; + Timer t; + reduceInMemory(); + log(1) << " MR - did reduceInMemory: size=" << oldSize << " dups=" << _dupCount << " newSize=" << _size << " time=" << t.millis() << "ms" << endl; + + // if size is still high, 
or values are not reducing well, dump + if ( _onDisk && (_size > _config.maxInMemSize || _size > oldSize / 2) ) { + dumpToInc(); + log(1) << " MR - dumping to db" << endl; + } + } + } + + /** + * emit that will be called by js function + */ + BSONObj fast_emit( const BSONObj& args, void* data ) { + uassert( 10077 , "fast_emit takes 2 args" , args.nFields() == 2 ); + uassert( 13069 , "an emit can't be more than half max bson size" , args.objsize() < ( BSONObjMaxUserSize / 2 ) ); + + State* state = (State*) data; + if ( args.firstElement().type() == Undefined ) { + BSONObjBuilder b( args.objsize() ); + b.appendNull( "" ); + BSONObjIterator i( args ); + i.next(); + b.append( i.next() ); + state->emit( b.obj() ); + } + else { + state->emit( args ); + } + return BSONObj(); + } + + /** + * function is called when we realize we cant use js mode for m/r on the 1st key + */ + BSONObj _bailFromJS( const BSONObj& args, void* data ) { + State* state = (State*) data; + state->bailFromJS(); + + // emit this particular key if there is one + if (!args.isEmpty()) { + fast_emit(args, data); + } + return BSONObj(); + } + + /** + * This class represents a map/reduce command executed on a single server + */ + class MapReduceCommand : public Command { + public: + MapReduceCommand() : Command("mapReduce", false, "mapreduce") {} + + /* why !replset ? + bad things happen with --slave (i think because of this) + */ + virtual bool slaveOk() const { return !replSet; } + + virtual bool slaveOverrideOk() { return true; } + + virtual void help( stringstream &help ) const { + help << "Run a map/reduce operation on the server.\n"; + help << "Note this is used for aggregation, not querying, in MongoDB.\n"; + help << "http://www.mongodb.org/display/DOCS/MapReduce"; + } + + virtual LockType locktype() const { return NONE; } + + bool run(const string& dbname , BSONObj& cmd, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + Timer t; + Client& client = cc(); + CurOp * op = client.curop(); + + Config config( dbname , cmd ); + + log(1) << "mr ns: " << config.ns << endl; + + bool shouldHaveData = false; + + long long num = 0; + long long inReduce = 0; + + BSONObjBuilder countsBuilder; + BSONObjBuilder timingBuilder; + State state( config ); + if ( ! 
state.sourceExists() ) { + errmsg = "ns doesn't exist"; + return false; + } + + if (replSet && state.isOnDisk()) { + // this means that it will be doing a write operation, make sure we are on Master + // ideally this check should be in slaveOk(), but at that point config is not known + if (!isMaster(dbname.c_str())) { + errmsg = "not master"; + return false; + } + } + + if (state.isOnDisk() && !client.getAuthenticationInfo()->isAuthorized(dbname)) { + errmsg = "read-only user cannot output mapReduce to collection, use inline instead"; + return false; + } + + try { + state.init(); + state.prepTempCollection(); + ProgressMeterHolder pm( op->setMessage( "m/r: (1/3) emit phase" , state.incomingDocuments() ) ); + + wassert( config.limit < 0x4000000 ); // see case on next line to 32 bit unsigned + long long mapTime = 0; + { + readlock lock( config.ns ); + Client::Context ctx( config.ns ); + + ShardChunkManagerPtr chunkManager; + if ( shardingState.needShardChunkManager( config.ns ) ) { + chunkManager = shardingState.getShardChunkManager( config.ns ); + } + + // obtain cursor on data to apply mr to, sorted + shared_ptr<Cursor> temp = NamespaceDetailsTransient::getCursor( config.ns.c_str(), config.filter, config.sort ); + uassert( 15876, str::stream() << "could not create cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, temp.get() ); + auto_ptr<ClientCursor> cursor( new ClientCursor( QueryOption_NoCursorTimeout , temp , config.ns.c_str() ) ); + uassert( 15877, str::stream() << "could not create client cursor over " << config.ns << " for query : " << config.filter << " sort : " << config.sort, cursor.get() ); + + Timer mt; + // go through each doc + while ( cursor->ok() ) { + if ( ! cursor->currentMatches() ) { + cursor->advance(); + continue; + } + + // make sure we dont process duplicates in case data gets moved around during map + // TODO This won't actually help when data gets moved, it's to handle multikeys. + if ( cursor->currentIsDup() ) { + cursor->advance(); + continue; + } + + BSONObj o = cursor->current(); + cursor->advance(); + + // check to see if this is a new object we don't own yet + // because of a chunk migration + if ( chunkManager && ! chunkManager->belongsToMe( o ) ) + continue; + + // do map + if ( config.verbose ) mt.reset(); + config.mapper->map( o ); + if ( config.verbose ) mapTime += mt.micros(); + + num++; + if ( num % 1000 == 0 ) { + // try to yield lock regularly + ClientCursor::YieldLock yield (cursor.get()); + Timer t; + // check if map needs to be dumped to disk + state.checkSize(); + inReduce += t.micros(); + + if ( ! 
yield.stillOk() ) { + cursor.release(); + break; + } + + killCurrentOp.checkForInterrupt(); + } + pm.hit(); + + if ( config.limit && num >= config.limit ) + break; + } + } + pm.finished(); + + killCurrentOp.checkForInterrupt(); + // update counters + countsBuilder.appendNumber( "input" , num ); + countsBuilder.appendNumber( "emit" , state.numEmits() ); + if ( state.numEmits() ) + shouldHaveData = true; + + timingBuilder.append( "mapTime" , mapTime / 1000 ); + timingBuilder.append( "emitLoop" , t.millis() ); + + op->setMessage( "m/r: (2/3) final reduce in memory" ); + Timer t; + // do reduce in memory + // this will be the last reduce needed for inline mode + state.reduceInMemory(); + // if not inline: dump the in memory map to inc collection, all data is on disk + state.dumpToInc(); + // final reduce + state.finalReduce( op , pm ); + inReduce += t.micros(); + countsBuilder.appendNumber( "reduce" , state.numReduces() ); + timingBuilder.append( "reduceTime" , inReduce / 1000 ); + timingBuilder.append( "mode" , state.jsMode() ? "js" : "mixed" ); + + long long finalCount = state.postProcessCollection(op, pm); + state.appendResults( result ); + + timingBuilder.append( "total" , t.millis() ); + result.append( "timeMillis" , t.millis() ); + countsBuilder.appendNumber( "output" , finalCount ); + if ( config.verbose ) result.append( "timing" , timingBuilder.obj() ); + result.append( "counts" , countsBuilder.obj() ); + + if ( finalCount == 0 && shouldHaveData ) { + result.append( "cmd" , cmd ); + errmsg = "there were emits but no data!"; + return false; + } + + } + catch( SendStaleConfigException& e ){ + log() << "mr detected stale config, should retry" << causedBy(e) << endl; + throw e; + } + // TODO: The error handling code for queries is v. fragile, + // *requires* rethrow AssertionExceptions - should probably fix. + catch ( AssertionException& e ){ + log() << "mr failed, removing collection" << causedBy(e) << endl; + throw e; + } + catch ( std::exception& e ){ + log() << "mr failed, removing collection" << causedBy(e) << endl; + throw e; + } + catch ( ... ) { + log() << "mr failed for unknown reason, removing collection" << endl; + throw; + } + + return true; + } + + } mapReduceCommand; + + /** + * This class represents a map/reduce command executed on the output server of a sharded env + */ + class MapReduceFinishCommand : public Command { + public: + MapReduceFinishCommand() : Command( "mapreduce.shardedfinish" ) {} + virtual bool slaveOk() const { return !replSet; } + virtual bool slaveOverrideOk() { return true; } + + virtual LockType locktype() const { return NONE; } + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + ShardedConnectionInfo::addHook(); + // legacy name + string shardedOutputCollection = cmdObj["shardedOutputCollection"].valuestrsafe(); + string inputNS = cmdObj["inputNS"].valuestrsafe(); + if (inputNS.empty()) + inputNS = dbname + "." 
+ shardedOutputCollection; + + Client& client = cc(); + CurOp * op = client.curop(); + + Config config( dbname , cmdObj.firstElement().embeddedObjectUserCheck() ); + State state(config); + state.init(); + + // no need for incremental collection because records are already sorted + config.incLong = config.tempLong; + + BSONObj shardCounts = cmdObj["shardCounts"].embeddedObjectUserCheck(); + BSONObj counts = cmdObj["counts"].embeddedObjectUserCheck(); + + ProgressMeterHolder pm( op->setMessage( "m/r: merge sort and reduce" ) ); + set<ServerAndQuery> servers; + vector< auto_ptr<DBClientCursor> > shardCursors; + + { + // parse per shard results + BSONObjIterator i( shardCounts ); + while ( i.more() ) { + BSONElement e = i.next(); + string shard = e.fieldName(); +// BSONObj res = e.embeddedObjectUserCheck(); + servers.insert( shard ); + } + } + + state.prepTempCollection(); + + BSONList values; + if (!config.outDB.empty()) { + BSONObjBuilder loc; + if ( !config.outDB.empty()) + loc.append( "db" , config.outDB ); + if ( !config.finalShort.empty() ) + loc.append( "collection" , config.finalShort ); + result.append("result", loc.obj()); + } + else { + if ( !config.finalShort.empty() ) + result.append( "result" , config.finalShort ); + } + + // fetch result from other shards 1 chunk at a time + // it would be better to do just one big $or query, but then the sorting would not be efficient + string shardName = shardingState.getShardName(); + DBConfigPtr confOut = grid.getDBConfig( dbname , false ); + vector<ChunkPtr> chunks; + if ( confOut->isSharded(config.finalLong) ) { + ChunkManagerPtr cm = confOut->getChunkManager( config.finalLong ); + const ChunkMap& chunkMap = cm->getChunkMap(); + for ( ChunkMap::const_iterator it = chunkMap.begin(); it != chunkMap.end(); ++it ) { + ChunkPtr chunk = it->second; + if (chunk->getShard().getName() == shardName) chunks.push_back(chunk); + } + } + + long long inputCount = 0; + unsigned int index = 0; + BSONObj query; + BSONArrayBuilder chunkSizes; + while (true) { + ChunkPtr chunk; + if (chunks.size() > 0) { + chunk = chunks[index]; + BSONObjBuilder b; + b.appendAs(chunk->getMin().firstElement(), "$gte"); + b.appendAs(chunk->getMax().firstElement(), "$lt"); + query = BSON("_id" << b.obj()); +// chunkSizes.append(min); + } + + // reduce from each shard for a chunk + BSONObj sortKey = BSON( "_id" << 1 ); + ParallelSortClusteredCursor cursor( servers , inputNS , Query( query ).sort( sortKey ) ); + cursor.init(); + int chunkSize = 0; + + while ( cursor.more() || !values.empty() ) { + BSONObj t; + if (cursor.more()) { + t = cursor.next().getOwned(); + ++inputCount; + + if ( values.size() == 0 ) { + values.push_back( t ); + continue; + } + + if ( t.woSortOrder( *(values.begin()) , sortKey ) == 0 ) { + values.push_back( t ); + continue; + } + } + + BSONObj res = config.reducer->finalReduce( values , config.finalizer.get()); + chunkSize += res.objsize(); + if (state.isOnDisk()) + state.insertToInc(res); + else + state.emit(res); + values.clear(); + if (!t.isEmpty()) + values.push_back( t ); + } + + if (chunk) { + chunkSizes.append(chunk->getMin()); + chunkSizes.append(chunkSize); + } + if (++index >= chunks.size()) + break; + } + + result.append( "chunkSizes" , chunkSizes.arr() ); + + long long outputCount = state.postProcessCollection(op, pm); + state.appendResults( result ); + + BSONObjBuilder countsB(32); + countsB.append("input", inputCount); + countsB.append("reduce", state.numReduces()); + countsB.append("output", outputCount); + result.append( "counts" , 
countsB.obj() ); + + return 1; + } + } mapReduceFinishCommand; + + } + +} + diff --git a/src/mongo/db/commands/mr.h b/src/mongo/db/commands/mr.h new file mode 100644 index 00000000000..592769d82da --- /dev/null +++ b/src/mongo/db/commands/mr.h @@ -0,0 +1,319 @@ +// mr.h + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "pch.h" + +namespace mongo { + + namespace mr { + + typedef vector<BSONObj> BSONList; + + class State; + + // ------------ function interfaces ----------- + + class Mapper : boost::noncopyable { + public: + virtual ~Mapper() {} + virtual void init( State * state ) = 0; + + virtual void map( const BSONObj& o ) = 0; + }; + + class Finalizer : boost::noncopyable { + public: + virtual ~Finalizer() {} + virtual void init( State * state ) = 0; + + /** + * this takes a tuple and returns a tuple + */ + virtual BSONObj finalize( const BSONObj& tuple ) = 0; + }; + + class Reducer : boost::noncopyable { + public: + Reducer() : numReduces(0) {} + virtual ~Reducer() {} + virtual void init( State * state ) = 0; + + virtual BSONObj reduce( const BSONList& tuples ) = 0; + /** this means its a final reduce, even if there is no finalizer */ + virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ) = 0; + + long long numReduces; + }; + + // ------------ js function implementations ----------- + + /** + * used as a holder for Scope and ScriptingFunction + * visitor like pattern as Scope is gotten from first access + */ + class JSFunction : boost::noncopyable { + public: + /** + * @param type (map|reduce|finalize) + */ + JSFunction( string type , const BSONElement& e ); + virtual ~JSFunction() {} + + virtual void init( State * state ); + + Scope * scope() const { return _scope; } + ScriptingFunction func() const { return _func; } + + private: + string _type; + string _code; // actual javascript code + BSONObj _wantedScope; // this is for CodeWScope + + Scope * _scope; // this is not owned by us, and might be shared + ScriptingFunction _func; + }; + + class JSMapper : public Mapper { + public: + JSMapper( const BSONElement & code ) : _func( "_map" , code ) {} + virtual void map( const BSONObj& o ); + virtual void init( State * state ); + + private: + JSFunction _func; + BSONObj _params; + }; + + class JSReducer : public Reducer { + public: + JSReducer( const BSONElement& code ) : _func( "_reduce" , code ) {} + virtual void init( State * state ); + + virtual BSONObj reduce( const BSONList& tuples ); + virtual BSONObj finalReduce( const BSONList& tuples , Finalizer * finalizer ); + + private: + + /** + * result in "return" + * @param key OUT + * @param endSizeEstimate OUT + */ + void _reduce( const BSONList& values , BSONObj& key , int& endSizeEstimate ); + + JSFunction _func; + }; + + class JSFinalizer : public Finalizer { + public: + JSFinalizer( const BSONElement& code ) : _func( "_finalize" , code ) {} + virtual BSONObj finalize( const BSONObj& o ); + virtual void init( State * 
state ) { _func.init( state ); } + private: + JSFunction _func; + + }; + + // ----------------- + + + class TupleKeyCmp { + public: + TupleKeyCmp() {} + bool operator()( const BSONObj &l, const BSONObj &r ) const { + return l.firstElement().woCompare( r.firstElement() ) < 0; + } + }; + + typedef map< BSONObj,BSONList,TupleKeyCmp > InMemory; // from key to list of tuples + + /** + * holds map/reduce config information + */ + class Config { + public: + Config( const string& _dbname , const BSONObj& cmdObj ); + + string dbname; + string ns; + + // options + bool verbose; + bool jsMode; + int splitInfo; + + // query options + + BSONObj filter; + BSONObj sort; + long long limit; + + // functions + + scoped_ptr<Mapper> mapper; + scoped_ptr<Reducer> reducer; + scoped_ptr<Finalizer> finalizer; + + BSONObj mapParams; + BSONObj scopeSetup; + + // output tables + string incLong; + string tempLong; + + string finalShort; + string finalLong; + + string outDB; + + // max number of keys allowed in JS map before switching mode + long jsMaxKeys; + // ratio of duplicates vs unique keys before reduce is triggered in js mode + float reduceTriggerRatio; + // maximum size of map before it gets dumped to disk + long maxInMemSize; + + enum { REPLACE , // atomically replace the collection + MERGE , // merge keys, override dups + REDUCE , // merge keys, reduce dups + INMEMORY // only store in memory, limited in size + } outType; + + // if true, no lock during output operation + bool outNonAtomic; + + static AtomicUInt JOB_NUMBER; + }; // end MRsetup + + /** + * stores information about intermediate map reduce state + * controls flow of data from map->reduce->finalize->output + */ + class State { + public: + State( const Config& c ); + ~State(); + + void init(); + + // ---- prep ----- + bool sourceExists(); + + long long incomingDocuments(); + + // ---- map stage ---- + + /** + * stages on in in-memory storage + */ + void emit( const BSONObj& a ); + + /** + * if size is big, run a reduce + * if its still big, dump to temp collection + */ + void checkSize(); + + /** + * run reduce on _temp + */ + void reduceInMemory(); + + /** + * transfers in memory storage to temp collection + */ + void dumpToInc(); + void insertToInc( BSONObj& o ); + void _insertToInc( BSONObj& o ); + + // ------ reduce stage ----------- + + void prepTempCollection(); + + void finalReduce( BSONList& values ); + + void finalReduce( CurOp * op , ProgressMeterHolder& pm ); + + // ------- cleanup/data positioning ---------- + + /** + @return number objects in collection + */ + long long postProcessCollection( CurOp* op , ProgressMeterHolder& pm ); + long long postProcessCollectionNonAtomic( CurOp* op , ProgressMeterHolder& pm ); + + /** + * if INMEMORY will append + * may also append stats or anything else it likes + */ + void appendResults( BSONObjBuilder& b ); + + // -------- util ------------ + + /** + * inserts with correct replication semantics + */ + void insert( const string& ns , const BSONObj& o ); + + // ------ simple accessors ----- + + /** State maintains ownership, do no use past State lifetime */ + Scope* scope() { return _scope.get(); } + + const Config& config() { return _config; } + + const bool isOnDisk() { return _onDisk; } + + long long numEmits() const { if (_jsMode) return _scope->getNumberLongLong("_emitCt"); return _numEmits; } + long long numReduces() const { if (_jsMode) return _scope->getNumberLongLong("_redCt"); return _config.reducer->numReduces; } + + bool jsMode() {return _jsMode;} + void switchMode(bool jsMode); + void 
bailFromJS(); + + const Config& _config; + DBDirectClient _db; + + protected: + + void _add( InMemory* im , const BSONObj& a , long& size ); + + scoped_ptr<Scope> _scope; + bool _onDisk; // if the end result of this map reduce is disk or not + + scoped_ptr<InMemory> _temp; + long _size; // bytes in _temp + long _dupCount; // number of duplicate key entries + + long long _numEmits; + + bool _jsMode; + ScriptingFunction _reduceAll; + ScriptingFunction _reduceAndEmit; + ScriptingFunction _reduceAndFinalize; + ScriptingFunction _reduceAndFinalizeAndInsert; + }; + + BSONObj fast_emit( const BSONObj& args, void* data ); + BSONObj _bailFromJS( const BSONObj& args, void* data ); + + } // end mr namespace +} + + diff --git a/src/mongo/db/commands/pipeline.cpp b/src/mongo/db/commands/pipeline.cpp new file mode 100755 index 00000000000..4ad5e342aed --- /dev/null +++ b/src/mongo/db/commands/pipeline.cpp @@ -0,0 +1,405 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/commands/pipeline.h"
+
+#include "db/cursor.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pdfile.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+
+ const char Pipeline::commandName[] = "aggregate";
+ const char Pipeline::pipelineName[] = "pipeline";
+ const char Pipeline::fromRouterName[] = "fromRouter";
+ const char Pipeline::splitMongodPipelineName[] = "splitMongodPipeline";
+
+ Pipeline::~Pipeline() {
+ }
+
+ Pipeline::Pipeline(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ collectionName(),
+ sourceVector(),
+ splitMongodPipeline(DEBUG_BUILD == 1), /* test: always split for DEV */
+ pCtx(pTheCtx) {
+ }
+
+
+
+ /* this structure is used to make a lookup table of operators */
+ struct StageDesc {
+ const char *pName;
+ intrusive_ptr<DocumentSource> (*pFactory)(
+ BSONElement *, const intrusive_ptr<ExpressionContext> &);
+ };
+
+ /* this table must be in alphabetical order by name for bsearch() */
+ static const StageDesc stageDesc[] = {
+#ifdef NEVER /* disabled for now in favor of $match */
+ {DocumentSourceFilter::filterName,
+ DocumentSourceFilter::createFromBson},
+#endif
+ {DocumentSourceGroup::groupName,
+ DocumentSourceGroup::createFromBson},
+ {DocumentSourceLimit::limitName,
+ DocumentSourceLimit::createFromBson},
+ {DocumentSourceMatch::matchName,
+ DocumentSourceMatch::createFromBson},
+#ifdef LATER /* https://jira.mongodb.org/browse/SERVER-3253 */
+ {DocumentSourceOut::outName,
+ DocumentSourceOut::createFromBson},
+#endif
+ {DocumentSourceProject::projectName,
+ DocumentSourceProject::createFromBson},
+ {DocumentSourceSkip::skipName,
+ DocumentSourceSkip::createFromBson},
+ {DocumentSourceSort::sortName,
+ DocumentSourceSort::createFromBson},
+ {DocumentSourceUnwind::unwindName,
+ DocumentSourceUnwind::createFromBson},
+ };
+ static const size_t nStageDesc = sizeof(stageDesc) / sizeof(StageDesc);
+
+ static int stageDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const StageDesc *)pL)->pName,
+ ((const StageDesc *)pR)->pName);
+ }
+
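+ /*
+ A sketch of the lookup (example values hypothetical): given a pipeline
+ element such as
+
+ { $match: { author: "dave" } }
+
+ the field name "$match" is found in stageDesc[] via bsearch(), and the
+ corresponding factory (here DocumentSourceMatch::createFromBson) is
+ invoked to instantiate the stage.
+ */
+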
+ boost::shared_ptr<Pipeline> Pipeline::parseCommand(
+ string &errmsg, BSONObj &cmdObj,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ boost::shared_ptr<Pipeline> pPipeline(new Pipeline(pCtx));
+ vector<BSONElement> pipeline;
+
+ /* gather the specification for the aggregation */
+ for(BSONObj::iterator cmdIterator = cmdObj.begin();
+ cmdIterator.more(); ) {
+ BSONElement cmdElement(cmdIterator.next());
+ const char *pFieldName = cmdElement.fieldName();
+
+ /* look for the aggregation command; its value names the collection */
+ if (!strcmp(pFieldName, commandName)) {
+ pPipeline->collectionName = cmdElement.String();
+ continue;
+ }
+
+ /* check for the pipeline specification */
+ if (!strcmp(pFieldName, pipelineName)) {
+ pipeline = cmdElement.Array();
+ continue;
+ }
+
+ /* if the request came from the router, we're in a shard */
+ if (!strcmp(pFieldName, fromRouterName)) {
+ pCtx->setInShard(cmdElement.Bool());
+ continue;
+ }
+
+ /* check for debug options */
+ if (!strcmp(pFieldName, splitMongodPipelineName)) {
+ pPipeline->splitMongodPipeline = true;
+ continue;
+ }
+
+ /* we didn't recognize a field in the command */
+ ostringstream sb;
+ sb <<
+ "Pipeline::parseCommand(): unrecognized field \"" <<
+ cmdElement.fieldName() << "\"";
+ errmsg = sb.str();
+ return boost::shared_ptr<Pipeline>();
+ }
+
+ /*
+ If we get here, we've harvested the fields we expect for a pipeline.
+
+ Set up the specified document source pipeline.
+ */
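+
+ /*
+ For example (collection and stage values hypothetical), a command like
+
+ { aggregate: "articles",
+ pipeline: [ { $match: { author: "dave" } }, { $limit: 5 } ] }
+
+ reaches this point with collectionName == "articles" and the two stage
+ specifications queued in 'pipeline'.
+ */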
+ SourceVector *pSourceVector = &pPipeline->sourceVector; // shorthand
+
+ /* iterate over the steps in the pipeline */
+ const size_t nSteps = pipeline.size();
+ for(size_t iStep = 0; iStep < nSteps; ++iStep) {
+ /* pull out the pipeline element as an object */
+ BSONElement pipeElement(pipeline[iStep]);
+ uassert(15942, str::stream() << "pipeline element " <<
+ iStep << " is not an object",
+ pipeElement.type() == Object);
+ BSONObj bsonObj(pipeElement.Obj());
+
+ intrusive_ptr<DocumentSource> pSource;
+
+ /* use the object to add a DocumentSource to the processing chain */
+ BSONObjIterator bsonIterator(bsonObj);
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ const char *pFieldName = bsonElement.fieldName();
+
+ /* select the appropriate operation and instantiate */
+ StageDesc key;
+ key.pName = pFieldName;
+ const StageDesc *pDesc = (const StageDesc *)
+ bsearch(&key, stageDesc, nStageDesc, sizeof(StageDesc),
+ stageDescCmp);
+ if (pDesc)
+ pSource = (*pDesc->pFactory)(&bsonElement, pCtx);
+ else {
+ ostringstream sb;
+ sb <<
+ "Pipeline::run(): unrecognized pipeline op \"" <<
+ pFieldName;
+ errmsg = sb.str();
+ return shared_ptr<Pipeline>();
+ }
+ }
+
+ pSourceVector->push_back(pSource);
+ }
+
+ /* if there aren't any pipeline stages, there's nothing more to do */
+ if (!pSourceVector->size())
+ return pPipeline;
+
+ /*
+ Move filters up where possible.
+
+ CW TODO -- move filter past projections where possible, and noting
+ corresponding field renaming.
+ */
+
+ /*
+ Wherever there is a match immediately following a sort, swap them.
+ This means we sort fewer items. Neither stage changes the documents
+ in the stream, so this transformation shouldn't affect the result.
+
+ We do this first, because then when we coalesce operators below,
+ any adjacent matches will be combined.
+ */
+ for(size_t srcn = pSourceVector->size(), srci = 1;
+ srci < srcn; ++srci) {
+ intrusive_ptr<DocumentSource> &pSource = pSourceVector->at(srci);
+ if (dynamic_cast<DocumentSourceMatch *>(pSource.get())) {
+ intrusive_ptr<DocumentSource> &pPrevious =
+ pSourceVector->at(srci - 1);
+ if (dynamic_cast<DocumentSourceSort *>(pPrevious.get())) {
+ /* swap this item with the previous */
+ intrusive_ptr<DocumentSource> pTemp(pPrevious);
+ pPrevious = pSource;
+ pSource = pTemp;
+ }
+ }
+ }
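+
+ /*
+ Illustrative only (stage contents hypothetical): a pipeline that arrived
+ as [ { $sort: { a: 1 } }, { $match: { a: { $gt: 0 } } } ] leaves this
+ pass as [ { $match: ... }, { $sort: ... } ], so the sort only has to
+ order documents that survive the match.
+ */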
+
+ /*
+ Coalesce adjacent filters where possible. Two adjacent filters
+ are equivalent to one filter whose predicate is the conjunction of
+ the two original filters' predicates. For now, capture this by
+ giving any DocumentSource the option to absorb its successor; this
+ will also allow adjacent projections to coalesce when possible.
+
+ Run through the DocumentSources, and give each one the opportunity
+ to coalesce with its successor. If successful, remove the
+ successor.
+
+ Move all document sources to a temporary list.
+ */
+ SourceVector tempVector(*pSourceVector);
+ pSourceVector->clear();
+
+ /* move the first one to the final list */
+ pSourceVector->push_back(tempVector[0]);
+
+ /* run through the sources, coalescing them or keeping them */
+ for(size_t tempn = tempVector.size(), tempi = 1;
+ tempi < tempn; ++tempi) {
+ /*
+ If we can't coalesce the source with the last, then move it
+ to the final list, and make it the new last. (If we succeeded,
+ then we're still on the same last, and there's no need to move
+ or do anything with the source -- the destruction of tempVector
+ will take care of the rest.)
+ */
+ intrusive_ptr<DocumentSource> &pLastSource = pSourceVector->back();
+ intrusive_ptr<DocumentSource> &pTemp = tempVector.at(tempi);
+ if (!pLastSource->coalesce(pTemp))
+ pSourceVector->push_back(pTemp);
+ }
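+
+ /*
+ Sketch (predicates hypothetical): adjacent stages
+ [ { $match: P }, { $match: Q } ] can be absorbed into a single stage
+ equivalent to matching the conjunction of P and Q, so only one $match
+ remains in the final list.
+ */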
+
+ /* optimize the elements in the pipeline */
+ for(SourceVector::iterator iter(pSourceVector->begin()),
+ listEnd(pSourceVector->end()); iter != listEnd; ++iter)
+ (*iter)->optimize();
+
+ return pPipeline;
+ }
+
+ shared_ptr<Pipeline> Pipeline::splitForSharded() {
+ /* create and initialize the shard spec we'll return */
+ shared_ptr<Pipeline> pShardPipeline(new Pipeline(pCtx));
+ pShardPipeline->collectionName = collectionName;
+
+ /* put the source list aside */
+ SourceVector tempVector(sourceVector);
+ sourceVector.clear();
+
+ /*
+ Run through the pipeline, looking for points to split it into
+ shard pipelines, and the rest.
+ */
+ while(!tempVector.empty()) {
+ intrusive_ptr<DocumentSource> &pSource = tempVector.front();
+
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSource.get());
+ if (pSort) {
+ /*
+ There's no point in sorting until the result is combined.
+ Therefore, sorts should be done in mongos, and not in
+ the shard at all. Add all the remaining operators to
+ the mongos list and quit.
+
+ TODO: unless the sort key is the shard key.
+ TODO: we could also do a merge sort in mongos in the
+ future, and split here.
+ */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+#endif
+
+ /* hang on to this in advance, in case it is a group */
+ DocumentSourceGroup *pGroup =
+ dynamic_cast<DocumentSourceGroup *>(pSource.get());
+
+ /* move the source from the tempVector to the shard sourceVector */
+ pShardPipeline->sourceVector.push_back(pSource);
+ tempVector.erase(tempVector.begin());
+
+ /*
+ If we found a group, that's a split point.
+ */
+ if (pGroup) {
+ /* start this pipeline with the group merger */
+ sourceVector.push_back(pGroup->createMerger());
+
+ /* and then add everything that remains and quit */
+ for(size_t tempn = tempVector.size(), tempi = 0;
+ tempi < tempn; ++tempi)
+ sourceVector.push_back(tempVector[tempi]);
+ break;
+ }
+ }
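+
+ /*
+ Worked example (stages hypothetical): splitting
+ [ { $match: ... }, { $group: ... }, { $sort: ... } ]
+ leaves [ $match, $group ] in the returned shard pipeline, while this
+ Pipeline becomes [ <group merger>, $sort ], which combines the shards'
+ partial groupings in mongos.
+ */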
+
+ return pShardPipeline;
+ }
+
+ void Pipeline::getCursorMods(BSONObjBuilder *pQueryBuilder,
+ BSONObjBuilder *pSortBuilder) {
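+ /*
+ Example (query values hypothetical): for a pipeline beginning
+ [ { $match: { x: 1 } }, { $sort: { y: 1 } }, ... ], the $match is
+ folded into *pQueryBuilder so the collection scan can run with the
+ query { x: 1 }, and the stage is removed from sourceVector; the $sort
+ would be handled the same way once SERVER-3832 is enabled.
+ */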
+ /* look for an initial $match */
+ if (!sourceVector.size())
+ return;
+ const intrusive_ptr<DocumentSource> &pMC = sourceVector.front();
+ const DocumentSourceMatch *pMatch =
+ dynamic_cast<DocumentSourceMatch *>(pMC.get());
+
+ if (pMatch) {
+ /* build the query */
+ pMatch->toMatcherBson(pQueryBuilder);
+
+ /* remove the match from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+
+ /* look for an initial $sort */
+ if (!sourceVector.size())
+ return;
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ const intrusive_ptr<DocumentSource> &pSC = sourceVector.front();
+ const DocumentSourceSort *pSort =
+ dynamic_cast<DocumentSourceSort *>(pSC.get());
+
+ if (pSort) {
+ /* build the sort key */
+ pSort->sortKeyToBson(pSortBuilder, false);
+
+ /* remove the sort from the pipeline */
+ sourceVector.erase(sourceVector.begin());
+ }
+#endif
+ }
+
+ void Pipeline::toBson(BSONObjBuilder *pBuilder) const {
+ /* create an array out of the pipeline operations */
+ BSONArrayBuilder arrayBuilder;
+ for(SourceVector::const_iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pSource(*iter);
+ pSource->addToBsonArray(&arrayBuilder);
+ }
+
+ /* add the top-level items to the command */
+ pBuilder->append(commandName, getCollectionName());
+ pBuilder->append(pipelineName, arrayBuilder.arr());
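+
+ /*
+ At this point (values hypothetical) the builder holds something like
+ { aggregate: "articles", pipeline: [ { $group: ... }, ... ] };
+ the optional flags below are appended only when they are set.
+ */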
+
+ bool btemp;
+ if ((btemp = getSplitMongodPipeline())) {
+ pBuilder->append(splitMongodPipelineName, btemp);
+ }
+ if ((btemp = pCtx->getInRouter())) {
+ pBuilder->append(fromRouterName, btemp);
+ }
+ }
+
+ bool Pipeline::run(BSONObjBuilder &result, string &errmsg,
+ intrusive_ptr<DocumentSource> pSource) {
+ /* chain together the sources we found */
+ for(SourceVector::iterator iter(sourceVector.begin()),
+ listEnd(sourceVector.end()); iter != listEnd; ++iter) {
+ intrusive_ptr<DocumentSource> pTemp(*iter);
+ pTemp->setSource(pSource);
+ pSource = pTemp;
+ }
+ /* pSource is left pointing at the last source in the chain */
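+
+ /*
+ Sketch of the chain (stages hypothetical): with sourceVector
+ [ $match, $group ] and an initial cursor source C, the loop wires
+ C <- $match <- $group; calling eof()/advance() on pSource ($group)
+ pulls documents through the entire chain.
+ */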
+
+ /*
+ Iterate through the resulting documents, and add them to the result.
+ */
+ BSONArrayBuilder resultArray; // where we'll stash the results
+ for(bool hasDocument = !pSource->eof(); hasDocument;
+ hasDocument = pSource->advance()) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* add the document to the result set */
+ BSONObjBuilder documentBuilder;
+ pDocument->toBson(&documentBuilder);
+ resultArray.append(documentBuilder.done());
+ }
+
+ result.appendArray("result", resultArray.arr());
+
+ return true;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/commands/pipeline.h b/src/mongo/db/commands/pipeline.h new file mode 100755 index 00000000000..ef9cc6afe51 --- /dev/null +++ b/src/mongo/db/commands/pipeline.h @@ -0,0 +1,183 @@ +/** + * Copyright 2011 (c) 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "pch.h" + +#include "db/jsobj.h" +#include "util/timer.h" +#include "db/commands.h" + +namespace mongo { + class BSONObj; + class BSONObjBuilder; + class DocumentSource; + class DocumentSourceProject; + class Expression; + class ExpressionContext; + class ExpressionNary; + struct OpDesc; // local private struct + + /** mongodb "commands" (sent via db.$cmd.findOne(...)) + subclass to make a command. define a singleton object for it. + */ + class Pipeline : + boost::noncopyable { + public: + virtual ~Pipeline(); + + /* + Create a pipeline from the command. + + @param errmsg where to write errors, if there are any + @param cmdObj the command object sent from the client + @returns the pipeline, if created, otherwise a NULL reference + */ + static boost::shared_ptr<Pipeline> parseCommand( + string &errmsg, BSONObj &cmdObj, + const intrusive_ptr<ExpressionContext> &pCtx); + + /* + Get the collection name from the command. + + @returns the collection name + */ + string getCollectionName() const; + + /* + Split the current Pipeline into a Pipeline for each shard, and + a Pipeline that combines the results within mongos. + + This permanently alters this pipeline for the merging operation. + + @returns the Spec for the pipeline command that should be sent + to the shards + */ + boost::shared_ptr<Pipeline> splitForSharded(); + + /* + Get Cursor creation modifiers. + + If we have a $match or a $sort at the beginning of the pipeline, + these can be extracted and used to modify the cursor we'll use for + the initial collection scan. + + If there is a Matcher query at the beginning of the pipeline, + get it, by adding its terms to the object under construction. If + not, this adds nothing to the object under construction. + + If there is a sort at the beginning of the pipeline, get it, by + adding its terms to the object under construction. If not, this adds + nothing. + + Optimization steps in parseCommand make sure that for any pairs + of adjacent matches and sorts, the match comes first. This ensures + that we sort a minimum of items, and doesn't change the result. + When getCursorMods() examines the pipeline, it looks for an initial + $match. If present, that is put into pQueryBuilder. If there is + a query, then the next stage is checked for a $sort, which will go + into pSortBuilder. If there is no initial $match, then a check is + made for an initial $sort, which will then still be put into + pSortBuilder. + + As a side-effect, retrieving the Cursor modifications removes them + from the pipeline. 
+ + @param pQueryBuilder an initialized object builder + @param pSortBuilder an initialized object builder + */ + void getCursorMods(BSONObjBuilder *pQueryBuilder, + BSONObjBuilder *pSortBuilder); + + /* + Write the Pipeline as a BSONObj command. This should be the + inverse of parseCommand(). + + This is only intended to be used by the shard command obtained + from splitForSharded(). Some pipeline operations in the merge + process do not have equivalent command forms, and using this on + the mongos Pipeline will cause assertions. + + @param the builder to write the command to + */ + void toBson(BSONObjBuilder *pBuilder) const; + + /* + Run the Pipeline on the given source. + + @param result builder to write the result to + @param errmsg place to put error messages, if any + @param pSource the document source to use at the head of the chain + @returns true on success, false if an error occurs + */ + bool run(BSONObjBuilder &result, string &errmsg, + intrusive_ptr<DocumentSource> pSource); + + /* + Debugging: should the processing pipeline be split within + mongod, simulating the real mongos/mongod split? This is determined + by setting the splitMongodPipeline field in an "aggregate" + command. + + The split itself is handled by the caller, which is currently + pipeline_command.cpp. + + @returns true if the pipeline is to be split + */ + bool getSplitMongodPipeline() const; + + /* + The aggregation command name. + */ + static const char commandName[]; + + private: + static const char pipelineName[]; + static const char fromRouterName[]; + static const char splitMongodPipelineName[]; + + Pipeline(const intrusive_ptr<ExpressionContext> &pCtx); + + string collectionName; + typedef vector<intrusive_ptr<DocumentSource> > SourceVector; + SourceVector sourceVector; + + bool splitMongodPipeline; + intrusive_ptr<ExpressionContext> pCtx; + }; + +} // namespace mongo + + +/* ======================= INLINED IMPLEMENTATIONS ========================== */ + +namespace mongo { + + inline string Pipeline::getCollectionName() const { + return collectionName; + } + + inline bool Pipeline::getSplitMongodPipeline() const { + if (!DEBUG_BUILD) + return false; + + return splitMongodPipeline; + } + +} // namespace mongo + + diff --git a/src/mongo/db/commands/pipeline_command.cpp b/src/mongo/db/commands/pipeline_command.cpp new file mode 100755 index 00000000000..9863e14556c --- /dev/null +++ b/src/mongo/db/commands/pipeline_command.cpp @@ -0,0 +1,187 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/commands/pipeline.h"
+#include "db/cursor.h"
+#include "db/pdfile.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/document_source.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/queryoptimizer.h"
+
+namespace mongo {
+
+ /** mongodb "commands" (sent via db.$cmd.findOne(...))
+ subclass to make a command. define a singleton object for it.
+ */
+ class PipelineCommand :
+ public Command {
+ public:
+ // virtuals from Command
+ virtual ~PipelineCommand();
+ virtual bool run(const string &db, BSONObj &cmdObj, int options,
+ string &errmsg, BSONObjBuilder &result, bool fromRepl);
+ virtual LockType locktype() const;
+ virtual bool slaveOk() const;
+ virtual void help(stringstream &help) const;
+
+ PipelineCommand();
+ };
+
+ // self-registering singleton static instance
+ static PipelineCommand pipelineCommand;
+
+ PipelineCommand::PipelineCommand():
+ Command(Pipeline::commandName) {
+ }
+
+ Command::LockType PipelineCommand::locktype() const {
+ return READ;
+ }
+
+ bool PipelineCommand::slaveOk() const {
+ return true;
+ }
+
+ void PipelineCommand::help(stringstream &help) const {
+ help << "{ pipeline : [ { <data-pipe-op>: {...}}, ... ] }";
+ }
+
+ PipelineCommand::~PipelineCommand() {
+ }
+
+ bool PipelineCommand::run(const string &db, BSONObj &cmdObj,
+ int options, string &errmsg,
+ BSONObjBuilder &result, bool fromRepl) {
+
+ intrusive_ptr<ExpressionContext> pCtx(ExpressionContext::create());
+
+ /* try to parse the command; if this fails, then we didn't run */
+ boost::shared_ptr<Pipeline> pPipeline(
+ Pipeline::parseCommand(errmsg, cmdObj, pCtx));
+ if (!pPipeline.get())
+ return false;
+
+ /* get a query to use, if any */
+ BSONObjBuilder queryBuilder;
+ BSONObjBuilder sortBuilder;
+ pPipeline->getCursorMods(&queryBuilder, &sortBuilder);
+ BSONObj query(queryBuilder.done());
+ BSONObj sort(sortBuilder.done());
+
+ /* for debugging purposes, show what the query and sort are */
+ DEV {
+ (log() << "\n---- query BSON\n" <<
+ query.jsonString(Strict, 1) << "\n----\n").flush();
+ (log() << "\n---- sort BSON\n" <<
+ sort.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* create a cursor for that query */
+ string fullName(db + "." + pPipeline->getCollectionName());
+ shared_ptr<Cursor> pCursor(
+ NamespaceDetailsTransient::getCursor(
+ fullName.c_str(), query
+#ifdef MONGODB_SERVER3832 /* see https://jira.mongodb.org/browse/SERVER-3832 */
+ , sort
+#endif
+ ));
+
+ /* wrap the cursor with a DocumentSource */
+ intrusive_ptr<DocumentSource> pSource(
+ DocumentSourceCursor::create(pCursor));
+
+ /* this is the normal non-debug path */
+ if (!pPipeline->getSplitMongodPipeline())
+ return pPipeline->run(result, errmsg, pSource);
+
+ /* set up as if we're in the router */
+ pCtx->setInRouter(true);
+
+ /*
+ Here, we'll split the pipeline in the same way we would for sharding,
+ for testing purposes.
+
+ Run the shard pipeline first, then feed the results into the
+ remainder of the existing pipeline.
+
+ Start by splitting the pipeline.
+ */
+ shared_ptr<Pipeline> pShardSplit(
+ pPipeline->splitForSharded());
+
+ /*
+ Write the split pipeline as we would in order to transmit it to
+ the shard servers.
+ */
+ BSONObjBuilder shardBuilder;
+ pShardSplit->toBson(&shardBuilder);
+ BSONObj shardBson(shardBuilder.done());
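+
+ /*
+ Illustration (shape approximate): shardBson now resembles
+ { aggregate: "<collection>", pipeline: [ <shard half of the stages> ] },
+ i.e. the command that would be transmitted to each shard server.
+ */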
+
+ DEV (log() << "\n---- shardBson\n" <<
+ shardBson.jsonString(Strict, 1) << "\n----\n").flush();
+
+ /* for debugging purposes, show what the pipeline now looks like */
+ DEV {
+ BSONObjBuilder pipelineBuilder;
+ pPipeline->toBson(&pipelineBuilder);
+ BSONObj pipelineBson(pipelineBuilder.done());
+ (log() << "\n---- pipelineBson\n" <<
+ pipelineBson.jsonString(Strict, 1) << "\n----\n").flush();
+ }
+
+ /* on the shard servers, create the local pipeline */
+ intrusive_ptr<ExpressionContext> pShardCtx(ExpressionContext::create());
+ shared_ptr<Pipeline> pShardPipeline(
+ Pipeline::parseCommand(errmsg, shardBson, pShardCtx));
+ if (!pShardPipeline.get()) {
+ return false;
+ }
+
+ /* run the shard pipeline */
+ BSONObjBuilder shardResultBuilder;
+ string shardErrmsg;
+ pShardPipeline->run(shardResultBuilder, shardErrmsg, pSource);
+ BSONObj shardResult(shardResultBuilder.done());
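+
+ /*
+ Pipeline::run() writes its output under the "result" key, so
+ shardResult has the shape { result: [ <intermediate docs> ] }; the
+ loop below extracts that array and feeds it into the merge pipeline
+ as a DocumentSourceBsonArray.
+ */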
+
+ /* pick out the shard result, and prepare to read it */
+ intrusive_ptr<DocumentSourceBsonArray> pShardSource;
+ BSONObjIterator shardIter(shardResult);
+ while(shardIter.more()) {
+ BSONElement shardElement(shardIter.next());
+ const char *pFieldName = shardElement.fieldName();
+
+ if (strcmp(pFieldName, "result") == 0) {
+ pShardSource = DocumentSourceBsonArray::create(&shardElement);
+
+ /*
+ Connect the output of the shard pipeline with the mongos
+ pipeline that will merge the results.
+ */
+ return pPipeline->run(result, errmsg, pShardSource);
+ }
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return false;
+ }
+
+} // namespace mongo
diff --git a/src/mongo/db/common.cpp b/src/mongo/db/common.cpp new file mode 100644 index 00000000000..cd073f8b059 --- /dev/null +++ b/src/mongo/db/common.cpp @@ -0,0 +1,73 @@ +/** @file common.cpp + Common code for server binaries (mongos, mongod, test). + Nothing used by driver should be here. + */ + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +//#include "pch.h" +//#include "concurrency.h" +#include "jsobjmanipulator.h" + +/** + * this just has globals + */ +namespace mongo { + + /** called by mongos, mongod, test. do not call from clients and such. + invoked before about everything except global var construction. + */ + void doPreServerStartupInits() { +#if defined(RLIMIT_NPROC) && defined(RLIMIT_NOFILE) + //Check that # of files rlmit > 1000 , and # of processes > # of files/2 + const unsigned int minNumFiles = 1000; + const double filesToProcsRatio = 2.0; + struct rlimit rlnproc; + struct rlimit rlnofile; + + if(!getrlimit(RLIMIT_NPROC,&rlnproc) && !getrlimit(RLIMIT_NOFILE,&rlnofile)){ + if(rlnofile.rlim_cur < minNumFiles){ + log() << "Warning: soft rlimits too low. Number of files is " << rlnofile.rlim_cur << ", should be at least " << minNumFiles << endl; + } + if(rlnproc.rlim_cur < rlnofile.rlim_cur/filesToProcsRatio){ + log() << "Warning: soft rlimits too low. " << rlnproc.rlim_cur << " processes, " << rlnofile.rlim_cur << " files. Number of processes should be at least "<< 1/filesToProcsRatio << " times number of files." << endl; + } + } + else{ + log() << "Warning: getrlimit failed" << endl; + } +#endif + } + + NOINLINE_DECL OpTime OpTime::skewed() { + bool toLog = false; + ONCE toLog = true; + RARELY toLog = true; + last.i++; + if ( last.i & 0x80000000 ) + toLog = true; + if ( toLog ) { + log() << "clock skew detected prev: " << last.secs << " now: " << (unsigned) time(0) << endl; + } + if ( last.i & 0x80000000 ) { + log() << "error large clock skew detected, shutting down" << endl; + throw ClockSkewException(); + } + return last; + } + +} diff --git a/src/mongo/db/compact.cpp b/src/mongo/db/compact.cpp new file mode 100644 index 00000000000..32931b6c5fd --- /dev/null +++ b/src/mongo/db/compact.cpp @@ -0,0 +1,376 @@ +/** @file compact.cpp + compaction of deleted space in pdfiles (datafiles) +*/ + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful,b +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "pdfile.h" +#include "concurrency.h" +#include "commands.h" +#include "curop-inl.h" +#include "background.h" +#include "extsort.h" +#include "compact.h" +#include "../util/concurrency/task.h" +#include "../util/timer.h" + +namespace mongo { + + char faux; + + void addRecordToRecListInExtent(Record *r, DiskLoc loc); + DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god); + void freeExtents(DiskLoc firstExt, DiskLoc lastExt); + + /* this should be done in alloc record not here, but doing here for now. + really dumb; it's a start. + */ + unsigned quantizeMask(unsigned x) { + if( x > 4096 * 20 ) + return ~4095; + if( x >= 512 ) + return ~63; + return ~0; + } + + /** @return number of skipped (invalid) documents */ + unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n, + const scoped_array<IndexSpec> &indexSpecs, + scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, + double pf, int pb) + { + log() << "compact extent #" << n << endl; + unsigned oldObjSize = 0; // we'll report what the old padding was + unsigned oldObjSizeWithPadding = 0; + + Extent *e = ext.ext(); + e->assertOk(); + assert( e->validates() ); + unsigned skipped = 0; + + { + // the next/prev pointers within the extent might not be in order so we first page the whole thing in + // sequentially + log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; + Timer t; + MAdvise adv(e, e->length, MAdvise::Sequential); + const char *p = (const char *) e; + for( int i = 0; i < e->length; i += 4096 ) { + faux += p[i]; + } + int ms = t.millis(); + if( ms > 1000 ) + log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl; + } + + { + log() << "compact copying records" << endl; + unsigned totalSize = 0; + int nrecs = 0; + DiskLoc L = e->firstRecord; + if( !L.isNull() ) { + while( 1 ) { + Record *recOld = L.rec(); + L = recOld->nextInExtent(L); + nrecs++; + BSONObj objOld(recOld); + + if( !validate || objOld.valid() ) { + unsigned sz = objOld.objsize(); + + oldObjSize += sz; + oldObjSizeWithPadding += recOld->netLength(); + + unsigned lenWHdr = sz + Record::HeaderSize; + unsigned lenWPadding = lenWHdr; + { + lenWPadding = static_cast<unsigned>(pf*lenWPadding); + lenWPadding += pb; + lenWPadding = lenWPadding & quantizeMask(lenWPadding); + if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { + lenWPadding = lenWHdr; + } + } + totalSize += lenWPadding; + DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false); + uassert(14024, "compact error out of space during compaction", !loc.isNull()); + Record *recNew = loc.rec(); + recNew = (Record *) getDur().writingPtr(recNew, lenWHdr); + addRecordToRecListInExtent(recNew, loc); + memcpy(recNew->data, objOld.objdata(), sz); + + { + // extract keys for all indexes we will be rebuilding + for( int x = 0; x < nidx; x++ ) { + phase1[x].addKeys(indexSpecs[x], objOld, loc); + } + } + } + else { + if( ++skipped <= 10 ) + log() << "compact skipping invalid object" << endl; + } + + if( L.isNull() ) { + // we just did the very last record from the old extent. 
it's still pointed to + // by the old extent ext, but that will be fixed below after this loop + break; + } + + // remove the old records (orphan them) periodically so our commit block doesn't get too large + bool stopping = false; + RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; + if( stopping || getDur().aCommitIsNeeded() ) { + e->firstRecord.writing() = L; + Record *r = L.rec(); + getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs; + getDur().commitIfNeeded(); + killCurrentOp.checkForInterrupt(false); + } + } + } // if !L.isNull() + + assert( d->firstExtent == ext ); + assert( d->lastExtent != ext ); + DiskLoc newFirst = e->xnext; + d->firstExtent.writing() = newFirst; + newFirst.ext()->xprev.writing().Null(); + getDur().writing(e)->markEmpty(); + freeExtents(ext,ext); + getDur().commitIfNeeded(); + + { + double op = 1.0; + if( oldObjSize ) + op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; + log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB" + << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100 + << endl; + } + } + + return skipped; + } + + extern SortPhaseOne *precalced; + + bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) { + //int les = d->lastExtentSize; + + // this is a big job, so might as well make things tidy before we start just to be nice. + getDur().commitNow(); + + list<DiskLoc> extents; + for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext ) + extents.push_back(L); + log() << "compact " << extents.size() << " extents" << endl; + + ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) ); + + // same data, but might perform a little different after compact? + NamespaceDetailsTransient::get(ns).clearQueryCache(); + + int nidx = d->nIndexes; + scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] ); + scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] ); + { + NamespaceDetails::IndexIterator ii = d->ii(); + int x = 0; + while( ii.more() ) { + BSONObjBuilder b; + IndexDetails& idx = ii.next(); + BSONObj::iterator i(idx.info.obj()); + while( i.more() ) { + BSONElement e = i.next(); + if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) { + b.append(e); + } + } + BSONObj o = b.obj().getOwned(); + phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) ); + phase1[x].sorter->hintNumObjects( d->stats.nrecords ); + indexSpecs[x++].reset(o); + } + } + + log() << "compact orphan deleted lists" << endl; + for( int i = 0; i < Buckets; i++ ) { + d->deletedList[i].writing().Null(); + } + + + + // Start over from scratch with our extent sizing and growth + d->lastExtentSize=0; + + // before dropping indexes, at least make sure we can allocate one extent! 
+ uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull()); + + // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here + log() << "compact dropping indexes" << endl; + BSONObjBuilder b; + if( !dropIndexes(d, ns, "*", errmsg, b, true) ) { + errmsg = "compact drop indexes failed"; + log() << errmsg << endl; + return false; + } + + getDur().commitNow(); + + long long skipped = 0; + int n = 0; + for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { + skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb); + pm.hit(); + } + + if( skipped ) { + result.append("invalidObjects", skipped); + } + + assert( d->firstExtent.ext()->xprev.isNull() ); + + // indexes will do their own progress meter? + pm.finished(); + + // build indexes + NamespaceString s(ns); + string si = s.db + ".system.indexes"; + for( int i = 0; i < nidx; i++ ) { + killCurrentOp.checkForInterrupt(false); + BSONObj info = indexSpecs[i].info; + log() << "compact create index " << info["key"].Obj().toString() << endl; + try { + precalced = &phase1[i]; + theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize()); + } + catch(...) { + precalced = 0; + throw; + } + precalced = 0; + } + + return true; + } + + bool compact(const string& ns, string &errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) { + massert( 14028, "bad ns", NamespaceString::normal(ns.c_str()) ); + massert( 14027, "can't compact a system namespace", !str::contains(ns, ".system.") ); // items in system.indexes cannot be moved there are pointers to those disklocs in NamespaceDetails + + bool ok; + { + writelock lk; + BackgroundOperation::assertNoBgOpInProgForNs(ns.c_str()); + Client::Context ctx(ns); + NamespaceDetails *d = nsdetails(ns.c_str()); + massert( 13660, str::stream() << "namespace " << ns << " does not exist", d ); + massert( 13661, "cannot compact capped collection", !d->capped ); + log() << "compact " << ns << " begin" << endl; + if( pf != 0 || pb != 0 ) { + log() << "paddingFactor:" << pf << " paddingBytes:" << pb << endl; + } + try { + ok = _compact(ns.c_str(), d, errmsg, validate, result, pf, pb); + } + catch(...) { + log() << "compact " << ns << " end (with error)" << endl; + throw; + } + log() << "compact " << ns << " end" << endl; + } + return ok; + } + + bool isCurrentlyAReplSetPrimary(); + + class CompactCmd : public Command { + public: + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return false; } + virtual bool slaveOk() const { return true; } + virtual bool maintenanceMode() const { return true; } + virtual bool logTheOp() { return false; } + virtual void help( stringstream& help ) const { + help << "compact collection\n" + "warning: this operation blocks the server and is slow. you can cancel with cancelOp()\n" + "{ compact : <collection_name>, [force:true], [validate:true] }\n" + " force - allows to run on a replica set primary\n" + " validate - check records are noncorrupt before adding to newly compacting extents. 
slower but safer (default is true in this version)\n"; + } + virtual bool requiresAuth() { return true; } + CompactCmd() : Command("compact") { } + + virtual bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string coll = cmdObj.firstElement().valuestr(); + if( coll.empty() || db.empty() ) { + errmsg = "no collection name specified"; + return false; + } + + if( isCurrentlyAReplSetPrimary() && !cmdObj["force"].trueValue() ) { + errmsg = "will not run compact on an active replica set primary as this is a slow blocking operation. use force:true to force"; + return false; + } + + string ns = db + '.' + coll; + if ( ! NamespaceString::normal(ns.c_str()) ) { + errmsg = "bad namespace name"; + return false; + } + + // parameter validation to avoid triggering assertions in compact() + if ( str::contains(ns, ".system.") ) { + errmsg = "can't compact a system namespace"; + return false; + } + + { + writelock lk; + Client::Context ctx(ns); + NamespaceDetails *d = nsdetails(ns.c_str()); + if( ! d ) { + errmsg = "namespace does not exist"; + return false; + } + + if ( d->capped ) { + errmsg = "cannot compact a capped collection"; + return false; + } + } + + double pf = 1.0; + int pb = 0; + if( cmdObj.hasElement("paddingFactor") ) { + pf = cmdObj["paddingFactor"].Number(); + assert( pf >= 1.0 && pf <= 4.0 ); + } + if( cmdObj.hasElement("paddingBytes") ) { + pb = (int) cmdObj["paddingBytes"].Number(); + assert( pb >= 0 && pb <= 1024 * 1024 ); + } + + bool validate = !cmdObj.hasElement("validate") || cmdObj["validate"].trueValue(); // default is true at the moment + bool ok = compact(ns, errmsg, validate, result, pf, pb); + return ok; + } + }; + static CompactCmd compactCmd; + +} diff --git a/src/mongo/db/compact.h b/src/mongo/db/compact.h new file mode 100644 index 00000000000..7bf49c8e1b8 --- /dev/null +++ b/src/mongo/db/compact.h @@ -0,0 +1,50 @@ +// compact.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +namespace mongo { + + /** for bottom up fastbuildindex (where we presort keys) */ + struct SortPhaseOne { + SortPhaseOne() { + n = 0; + nkeys = 0; + multi = false; + } + shared_ptr<BSONObjExternalSorter> sorter; + unsigned long long n; // # of records + unsigned long long nkeys; + bool multi; // multikey index + + void addKeys(const IndexSpec& spec, const BSONObj& o, DiskLoc loc) { + BSONObjSet keys; + spec.getKeys(o, keys); + int k = 0; + for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { + if( ++k == 2 ) { + multi = true; + } + sorter->add(*i, loc); + nkeys++; + } + n++; + } + }; + +} diff --git a/src/mongo/db/concurrency.h b/src/mongo/db/concurrency.h new file mode 100644 index 00000000000..33bc0caac77 --- /dev/null +++ b/src/mongo/db/concurrency.h @@ -0,0 +1,21 @@ +// @file concurrency.h + +/* + * Copyright (C) 2010 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "mongomutex.h" diff --git a/src/mongo/db/curop-inl.h b/src/mongo/db/curop-inl.h new file mode 100644 index 00000000000..7dd678b185d --- /dev/null +++ b/src/mongo/db/curop-inl.h @@ -0,0 +1 @@ +#include "curop.h" diff --git a/src/mongo/db/curop.cpp b/src/mongo/db/curop.cpp new file mode 100644 index 00000000000..3cc452b46cc --- /dev/null +++ b/src/mongo/db/curop.cpp @@ -0,0 +1,173 @@ +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "curop.h" +#include "database.h" + +namespace mongo { + + // todo : move more here + + CurOp::CurOp( Client * client , CurOp * wrapped ) : + _client(client), + _wrapped(wrapped) + { + if ( _wrapped ) + _client->_curOp = this; + _start = _checkpoint = 0; + _active = false; + _reset(); + _op = 0; + // These addresses should never be written to again. The zeroes are + // placed here as a precaution because currentOp may be accessed + // without the db mutex. + memset(_ns, 0, sizeof(_ns)); + } + + void CurOp::_reset() { + _command = false; + _lockType = 0; + _dbprofile = 0; + _end = 0; + _waitingForLock = false; + _message = ""; + _progressMeter.finished(); + _killed = false; + _numYields = 0; + } + + void CurOp::reset() { + _reset(); + _start = _checkpoint = 0; + _opNum = _nextOpNum++; + _ns[0] = 0; + _debug.reset(); + _query.reset(); + _active = true; // this should be last for ui clarity + } + + void CurOp::reset( const HostAndPort& remote, int op ) { + reset(); + if( _remote != remote ) { + // todo : _remote is not thread safe yet is used as such! + _remote = remote; + } + _op = op; + } + + ProgressMeter& CurOp::setMessage( const char * msg , unsigned long long progressMeterTotal , int secondsBetween ) { + if ( progressMeterTotal ) { + if ( _progressMeter.isActive() ) { + cout << "about to assert, old _message: " << _message << " new message:" << msg << endl; + assert( ! _progressMeter.isActive() ); + } + _progressMeter.reset( progressMeterTotal , secondsBetween ); + } + else { + _progressMeter.finished(); + } + _message = msg; + return _progressMeter; + } + + + BSONObj CurOp::info() { + if( ! 
cc().getAuthenticationInfo()->isAuthorized("admin") ) { + BSONObjBuilder b; + b.append("err", "unauthorized"); + return b.obj(); + } + return infoNoauth(); + } + + CurOp::~CurOp() { + if ( _wrapped ) { + scoped_lock bl(Client::clientsMutex); + _client->_curOp = _wrapped; + } + _client = 0; + } + + void CurOp::enter( Client::Context * context ) { + ensureStarted(); + setNS( context->ns() ); + _dbprofile = context->_db ? context->_db->profile : 0; + } + + void CurOp::leave( Client::Context * context ) { + unsigned long long now = curTimeMicros64(); + Top::global.record( _ns , _op , _lockType , now - _checkpoint , _command ); + _checkpoint = now; + } + + BSONObj CurOp::infoNoauth() { + BSONObjBuilder b; + b.append("opid", _opNum); + bool a = _active && _start; + b.append("active", a); + if ( _lockType ) + b.append("lockType" , _lockType > 0 ? "write" : "read" ); + b.append("waitingForLock" , _waitingForLock ); + + if( a ) { + b.append("secs_running", elapsedSeconds() ); + } + + b.append( "op" , opToString( _op ) ); + + b.append("ns", _ns); + + _query.append( b , "query" ); + + if( !_remote.empty() ) { + b.append("client", _remote.toString()); + } + + if ( _client ) { + b.append( "desc" , _client->desc() ); + if ( _client->_threadId.size() ) + b.append( "threadId" , _client->_threadId ); + if ( _client->_connectionId ) + b.appendNumber( "connectionId" , _client->_connectionId ); + } + + if ( ! _message.empty() ) { + if ( _progressMeter.isActive() ) { + StringBuilder buf(128); + buf << _message.toString() << " " << _progressMeter.toString(); + b.append( "msg" , buf.str() ); + BSONObjBuilder sub( b.subobjStart( "progress" ) ); + sub.appendNumber( "done" , (long long)_progressMeter.done() ); + sub.appendNumber( "total" , (long long)_progressMeter.total() ); + sub.done(); + } + else { + b.append( "msg" , _message.toString() ); + } + } + + if( killed() ) + b.append("killed", true); + + b.append( "numYields" , _numYields ); + + return b.obj(); + } + + AtomicUInt CurOp::_nextOpNum; + +} diff --git a/src/mongo/db/curop.h b/src/mongo/db/curop.h new file mode 100644 index 00000000000..192404d8796 --- /dev/null +++ b/src/mongo/db/curop.h @@ -0,0 +1,313 @@ +// @file curop.h + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + + +#pragma once + +#include "namespace-inl.h" +#include "client.h" +#include "../bson/util/atomic_int.h" +#include "../util/concurrency/spin_lock.h" +#include "../util/time_support.h" +#include "../util/net/hostandport.h" + +namespace mongo { + + class CurOp; + + /* lifespan is different than CurOp because of recursives with DBDirectClient */ + class OpDebug { + public: + OpDebug() : ns(""){ reset(); } + + void reset(); + + string toString() const; + void append( const CurOp& curop, BSONObjBuilder& b ) const; + + // ------------------- + + StringBuilder extra; // weird things we need to fix later + + // basic options + int op; + bool iscommand; + Namespace ns; + BSONObj query; + BSONObj updateobj; + + // detailed options + long long cursorid; + int ntoreturn; + int ntoskip; + bool exhaust; + + // debugging/profile info + int nscanned; + bool idhack; // indicates short circuited code path on an update to make the update faster + bool scanAndOrder; // scanandorder query plan aspect was used + bool moved; // update resulted in a move (moves are expensive) + bool fastmod; + bool fastmodinsert; // upsert of an $operation. builds a default object + bool upsert; // true if the update actually did an insert + int keyUpdates; + + // error handling + ExceptionInfo exceptionInfo; + + // response info + int executionTime; + int nreturned; + int responseLength; + }; + + /** + * stores a copy of a bson obj in a fixed size buffer + * if its too big for the buffer, says "too big" + * useful for keeping a copy around indefinitely without wasting a lot of space or doing malloc + */ + class CachedBSONObj { + public: + enum { TOO_BIG_SENTINEL = 1 } ; + static BSONObj _tooBig; // { $msg : "query not recording (too large)" } + + CachedBSONObj() { + _size = (int*)_buf; + reset(); + } + + void reset( int sz = 0 ) { + _lock.lock(); + _reset( sz ); + _lock.unlock(); + } + + void set( const BSONObj& o ) { + scoped_spinlock lk(_lock); + int sz = o.objsize(); + if ( sz > (int) sizeof(_buf) ) { + _reset(TOO_BIG_SENTINEL); + } + else { + memcpy(_buf, o.objdata(), sz ); + } + } + + int size() const { return *_size; } + bool have() const { return size() > 0; } + + BSONObj get() const { + scoped_spinlock lk(_lock); + return _get(); + } + + void append( BSONObjBuilder& b , const StringData& name ) const { + scoped_spinlock lk(_lock); + BSONObj temp = _get(); + b.append( name , temp ); + } + + private: + /** you have to be locked when you call this */ + BSONObj _get() const { + int sz = size(); + if ( sz == 0 ) + return BSONObj(); + if ( sz == TOO_BIG_SENTINEL ) + return _tooBig; + return BSONObj( _buf ).copy(); + } + + /** you have to be locked when you call this */ + void _reset( int sz ) { _size[0] = sz; } + + mutable SpinLock _lock; + int * _size; + char _buf[512]; + }; + + /* Current operation (for the current Client). + an embedded member of Client class, and typically used from within the mutex there. 
+ */ + class CurOp : boost::noncopyable { + public: + CurOp( Client * client , CurOp * wrapped = 0 ); + ~CurOp(); + + bool haveQuery() const { return _query.have(); } + BSONObj query() { return _query.get(); } + void appendQuery( BSONObjBuilder& b , const StringData& name ) const { _query.append( b , name ); } + + void ensureStarted() { + if ( _start == 0 ) + _start = _checkpoint = curTimeMicros64(); + } + bool isStarted() const { return _start > 0; } + void enter( Client::Context * context ); + void leave( Client::Context * context ); + void reset(); + void reset( const HostAndPort& remote, int op ); + void markCommand() { _command = true; } + + void waitingForLock( int type ) { + _waitingForLock = true; + if ( type > 0 ) + _lockType = 1; + else + _lockType = -1; + } + void gotLock() { _waitingForLock = false; } + OpDebug& debug() { return _debug; } + int profileLevel() const { return _dbprofile; } + const char * getNS() const { return _ns; } + + bool shouldDBProfile( int ms ) const { + if ( _dbprofile <= 0 ) + return false; + + return _dbprofile >= 2 || ms >= cmdLine.slowMS; + } + + AtomicUInt opNum() const { return _opNum; } + + /** if this op is running */ + bool active() const { return _active; } + + int getLockType() const { return _lockType; } + bool isWaitingForLock() const { return _waitingForLock; } + int getOp() const { return _op; } + unsigned long long startTime() { // micros + ensureStarted(); + return _start; + } + void done() { + _active = false; + _end = curTimeMicros64(); + } + unsigned long long totalTimeMicros() { + massert( 12601 , "CurOp not marked done yet" , ! _active ); + return _end - startTime(); + } + int totalTimeMillis() { return (int) (totalTimeMicros() / 1000); } + int elapsedMillis() { + unsigned long long total = curTimeMicros64() - startTime(); + return (int) (total / 1000); + } + int elapsedSeconds() { return elapsedMillis() / 1000; } + void setQuery(const BSONObj& query) { _query.set( query ); } + Client * getClient() const { return _client; } + BSONObj info(); + BSONObj infoNoauth(); + string getRemoteString( bool includePort = true ) { return _remote.toString(includePort); } + ProgressMeter& setMessage( const char * msg , unsigned long long progressMeterTotal = 0 , int secondsBetween = 3 ); + string getMessage() const { return _message.toString(); } + ProgressMeter& getProgressMeter() { return _progressMeter; } + CurOp *parent() const { return _wrapped; } + void kill() { _killed = true; } + bool killed() const { return _killed; } + void yielded() { _numYields++; } + void setNS(const char *ns) { + strncpy(_ns, ns, Namespace::MaxNsLen); + _ns[Namespace::MaxNsLen] = 0; + } + + private: + friend class Client; + void _reset(); + + static AtomicUInt _nextOpNum; + Client * _client; + CurOp * _wrapped; + unsigned long long _start; + unsigned long long _checkpoint; + unsigned long long _end; + bool _active; + int _op; + bool _command; + int _lockType; // see concurrency.h for values + bool _waitingForLock; + int _dbprofile; // 0=off, 1=slow, 2=all + AtomicUInt _opNum; // todo: simple being "unsigned" may make more sense here + char _ns[Namespace::MaxNsLen+2]; + HostAndPort _remote; // CAREFUL here with thread safety + CachedBSONObj _query; // CachedBSONObj is thread safe + OpDebug _debug; + ThreadSafeString _message; + ProgressMeter _progressMeter; + volatile bool _killed; + int _numYields; + }; + + /* _globalKill: we are shutting down + otherwise kill attribute set on specified CurOp + this class does not handle races between interruptJs and the 
checkForInterrupt functions - those must be + handled by the client of this class + */ + extern class KillCurrentOp { + public: + void killAll(); + void kill(AtomicUInt i); + + /** @return true if global interrupt and should terminate the operation */ + bool globalInterruptCheck() const { return _globalKill; } + + void checkForInterrupt( bool heedMutex = true ) { + Client& c = cc(); + if ( heedMutex && d.dbMutex.isWriteLocked() ) + return; + if( _globalKill ) + uasserted(11600,"interrupted at shutdown"); + if( c.curop()->killed() ) + uasserted(11601,"interrupted"); + if( c.sometimes(1024) ) { + AbstractMessagingPort *p = cc().port(); + if( p ) + p->assertStillConnected(); + } + } + + /** @return "" if not interrupted. otherwise, you should stop. */ + const char *checkForInterruptNoAssert( /*bool heedMutex = true*/ ) { + Client& c = cc(); + // always called withi false so commented out: + /*if ( heedMutex && d.dbMutex.isWriteLocked() ) + return "";*/ + if( _globalKill ) + return "interrupted at shutdown"; + if( c.curop()->killed() ) + return "interrupted"; + if( c.sometimes(1024) ) { + try { + AbstractMessagingPort *p = cc().port(); + if( p ) + p->assertStillConnected(); + } + catch(...) { + log() << "no longer connected to client"; + return "no longer connected to client"; + } + } + return ""; + } + + private: + void interruptJs( AtomicUInt *op ); + volatile bool _globalKill; + } killCurrentOp; + +} diff --git a/src/mongo/db/cursor.cpp b/src/mongo/db/cursor.cpp new file mode 100644 index 00000000000..ac7afc1532b --- /dev/null +++ b/src/mongo/db/cursor.cpp @@ -0,0 +1,166 @@ +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "pch.h" +#include "pdfile.h" +#include "curop-inl.h" + +namespace mongo { + + bool BasicCursor::advance() { + killCurrentOp.checkForInterrupt(); + if ( eof() ) { + if ( tailable_ && !last.isNull() ) { + curr = s->next( last ); + } + else { + return false; + } + } + else { + last = curr; + curr = s->next( curr ); + } + incNscanned(); + return ok(); + } + + /* these will be used outside of mutexes - really functors - thus the const */ + class Forward : public AdvanceStrategy { + virtual DiskLoc next( const DiskLoc &prev ) const { + return prev.rec()->getNext( prev ); + } + } _forward; + + class Reverse : public AdvanceStrategy { + virtual DiskLoc next( const DiskLoc &prev ) const { + return prev.rec()->getPrev( prev ); + } + } _reverse; + + const AdvanceStrategy *forward() { + return &_forward; + } + const AdvanceStrategy *reverse() { + return &_reverse; + } + + DiskLoc nextLoop( NamespaceDetails *nsd, const DiskLoc &prev ) { + assert( nsd->capLooped() ); + DiskLoc next = forward()->next( prev ); + if ( !next.isNull() ) + return next; + return nsd->firstRecord(); + } + + DiskLoc prevLoop( NamespaceDetails *nsd, const DiskLoc &curr ) { + assert( nsd->capLooped() ); + DiskLoc prev = reverse()->next( curr ); + if ( !prev.isNull() ) + return prev; + return nsd->lastRecord(); + } + + ForwardCappedCursor::ForwardCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) : + nsd( _nsd ) { + if ( !nsd ) + return; + DiskLoc start = startLoc; + if ( start.isNull() ) { + if ( !nsd->capLooped() ) + start = nsd->firstRecord(); + else { + start = nsd->capExtent.ext()->firstRecord; + if ( !start.isNull() && start == nsd->capFirstNewRecord ) { + start = nsd->capExtent.ext()->lastRecord; + start = nextLoop( nsd, start ); + } + } + } + curr = start; + s = this; + incNscanned(); + } + + DiskLoc ForwardCappedCursor::next( const DiskLoc &prev ) const { + assert( nsd ); + if ( !nsd->capLooped() ) + return forward()->next( prev ); + + DiskLoc i = prev; + // Last record + if ( i == nsd->capExtent.ext()->lastRecord ) + return DiskLoc(); + i = nextLoop( nsd, i ); + // If we become capFirstNewRecord from same extent, advance to next extent. + if ( i == nsd->capFirstNewRecord && + i != nsd->capExtent.ext()->firstRecord ) + i = nextLoop( nsd, nsd->capExtent.ext()->lastRecord ); + // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord + if ( i == nsd->capExtent.ext()->firstRecord ) + i = nsd->capFirstNewRecord; + return i; + } + + ReverseCappedCursor::ReverseCappedCursor( NamespaceDetails *_nsd, const DiskLoc &startLoc ) : + nsd( _nsd ) { + if ( !nsd ) + return; + DiskLoc start = startLoc; + if ( start.isNull() ) { + if ( !nsd->capLooped() ) { + start = nsd->lastRecord(); + } + else { + start = nsd->capExtent.ext()->lastRecord; + } + } + curr = start; + s = this; + incNscanned(); + } + + DiskLoc ReverseCappedCursor::next( const DiskLoc &prev ) const { + assert( nsd ); + if ( !nsd->capLooped() ) + return reverse()->next( prev ); + + DiskLoc i = prev; + // Last record + if ( nsd->capFirstNewRecord == nsd->capExtent.ext()->firstRecord ) { + if ( i == nextLoop( nsd, nsd->capExtent.ext()->lastRecord ) ) { + return DiskLoc(); + } + } + else { + if ( i == nsd->capExtent.ext()->firstRecord ) { + return DiskLoc(); + } + } + // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev. 
+ if ( i == nsd->capFirstNewRecord ) + i = prevLoop( nsd, nsd->capExtent.ext()->firstRecord ); + else + i = prevLoop( nsd, i ); + // If we just became last in cap extent, advance past capFirstNewRecord + // (We know capExtent.ext()->firstRecord != capFirstNewRecord, since would + // have returned DiskLoc() earlier otherwise.) + if ( i == nsd->capExtent.ext()->lastRecord ) + i = reverse()->next( nsd->capFirstNewRecord ); + + return i; + } +} // namespace mongo diff --git a/src/mongo/db/cursor.h b/src/mongo/db/cursor.h new file mode 100644 index 00000000000..8e9e922733d --- /dev/null +++ b/src/mongo/db/cursor.h @@ -0,0 +1,246 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../pch.h" + +#include "jsobj.h" +#include "diskloc.h" +#include "matcher.h" + +namespace mongo { + + class NamespaceDetails; + class Record; + class CoveredIndexMatcher; + + /* Query cursors, base class. This is for our internal cursors. "ClientCursor" is a separate + concept and is for the user's cursor. + + WARNING concurrency: the vfunctions below are called back from within a + ClientCursor::ccmutex. Don't cause a deadlock, you've been warned. + */ + class Cursor : boost::noncopyable { + public: + virtual ~Cursor() {} + virtual bool ok() = 0; + bool eof() { return !ok(); } + virtual Record* _current() = 0; + virtual BSONObj current() = 0; + virtual DiskLoc currLoc() = 0; + virtual bool advance() = 0; /*true=ok*/ + virtual BSONObj currKey() const { return BSONObj(); } + + // DiskLoc the cursor requires for continued operation. Before this + // DiskLoc is deleted, the cursor must be incremented or destroyed. + virtual DiskLoc refLoc() = 0; + + /* Implement these if you want the cursor to be "tailable" */ + + /* Request that the cursor starts tailing after advancing past last record. */ + /* The implementation may or may not honor this request. */ + virtual void setTailable() {} + /* indicates if tailing is enabled. */ + virtual bool tailable() { + return false; + } + + virtual void aboutToDeleteBucket(const DiskLoc& b) { } + + /* optional to implement. if implemented, means 'this' is a prototype */ + virtual Cursor* clone() { + return 0; + } + + virtual BSONObj indexKeyPattern() { + return BSONObj(); + } + + virtual bool supportGetMore() = 0; + + /* called after every query block is iterated -- i.e. between getMore() blocks + so you can note where we are, if necessary. + */ + virtual void noteLocation() { } + + /* called before query getmore block is iterated */ + virtual void checkLocation() { } + + /** + * Called before a document pointed at by an earlier iterate of this cursor is to be + * modified. It is ok if the current iterate also points to the document to be modified. + */ + virtual void prepareToTouchEarlierIterate() { noteLocation(); } + + /** Recover from a previous call to prepareToTouchEarlierIterate(). 
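+ A typical call sequence (an illustrative sketch, not in the original source) is: + prepareToTouchEarlierIterate(); ...modify or delete the earlier document...; + recoverFromTouchingEarlierIterate(); -- letting the cursor note its position and + re-find it after the mutation.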
*/ + virtual void recoverFromTouchingEarlierIterate() { checkLocation(); } + + virtual bool supportYields() = 0; + + /** Called before a ClientCursor yield. */ + virtual bool prepareToYield() { noteLocation(); return supportYields(); } + + /** Called after a ClientCursor yield. Recovers from a previous call to prepareToYield(). */ + virtual void recoverFromYield() { checkLocation(); } + + virtual string toString() { return "abstract?"; } + + /* used for multikey index traversal to avoid sending back dups. see Matcher::matches(). + if a multikey index traversal: + if loc has already been sent, returns true. + otherwise, marks loc as sent. + */ + virtual bool getsetdup(DiskLoc loc) = 0; + + virtual bool isMultiKey() const = 0; + + virtual bool autoDedup() const { return true; } + + /** + * return true if the keys in the index have been modified from the main doc + * if you have { a : 1 , b : [ 1 , 2 ] } + * an index on { a : 1 } would not be modified + * an index on { b : 1 } would be since the values of the array are put in the index + * not the array + */ + virtual bool modifiedKeys() const = 0; + + virtual BSONObj prettyIndexBounds() const { return BSONArray(); } + + virtual bool capped() const { return false; } + + virtual long long nscanned() = 0; + + // The implementation may return different matchers depending on the + // position of the cursor. If matcher() is nonzero at the start, + // matcher() should be checked each time advance() is called. + // Implementations which generate their own matcher should return this + // to avoid a matcher being set manually. + // Note that the return values differ subtly here: + + // Used when we want fast matcher lookup + virtual CoveredIndexMatcher *matcher() const { return 0; } + // Used when we need to share this matcher with someone else + virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return shared_ptr< CoveredIndexMatcher >(); } + + virtual bool currentMatches( MatchDetails *details = 0 ) { + return !matcher() || matcher()->matchesCurrent( this, details ); + } + + // A convenience function for setting the value of matcher() manually + // so it may be accessed later. Implementations which must generate + // their own matcher() should assert here. + virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { + massert( 13285, "manual matcher config not allowed", false ); + } + + virtual void explainDetails( BSONObjBuilder& b ) { return; } + }; + + // strategy object implementing direction of traversal. + class AdvanceStrategy { + public: + virtual ~AdvanceStrategy() { } + virtual DiskLoc next( const DiskLoc &prev ) const = 0; + }; + + const AdvanceStrategy *forward(); + const AdvanceStrategy *reverse(); + + /* table-scan style cursor */ + class BasicCursor : public Cursor { + public: + BasicCursor(DiskLoc dl, const AdvanceStrategy *_s = forward()) : curr(dl), s( _s ), _nscanned() { + incNscanned(); + init(); + } + BasicCursor(const AdvanceStrategy *_s = forward()) : s( _s ), _nscanned() { + init(); + } + bool ok() { return !curr.isNull(); } + Record* _current() { + assert( ok() ); + return curr.rec(); + } + BSONObj current() { + Record *r = _current(); + BSONObj j(r); + return j; + } + virtual DiskLoc currLoc() { return curr; } + virtual DiskLoc refLoc() { return curr.isNull() ?
last : curr; } + bool advance(); + virtual string toString() { return "BasicCursor"; } + virtual void setTailable() { + if ( !curr.isNull() || !last.isNull() ) + tailable_ = true; + } + virtual bool tailable() { return tailable_; } + virtual bool getsetdup(DiskLoc loc) { return false; } + virtual bool isMultiKey() const { return false; } + virtual bool modifiedKeys() const { return false; } + virtual bool supportGetMore() { return true; } + virtual bool supportYields() { return true; } + virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); } + virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; } + virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; } + virtual long long nscanned() { return _nscanned; } + + protected: + DiskLoc curr, last; + const AdvanceStrategy *s; + void incNscanned() { if ( !curr.isNull() ) { ++_nscanned; } } + private: + bool tailable_; + shared_ptr< CoveredIndexMatcher > _matcher; + long long _nscanned; + void init() { tailable_ = false; } + }; + + /* used for order { $natural: -1 } */ + class ReverseCursor : public BasicCursor { + public: + ReverseCursor(DiskLoc dl) : BasicCursor( dl, reverse() ) { } + ReverseCursor() : BasicCursor( reverse() ) { } + virtual string toString() { return "ReverseCursor"; } + }; + + class ForwardCappedCursor : public BasicCursor, public AdvanceStrategy { + public: + ForwardCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() ); + virtual string toString() { + return "ForwardCappedCursor"; + } + virtual DiskLoc next( const DiskLoc &prev ) const; + virtual bool capped() const { return true; } + private: + NamespaceDetails *nsd; + }; + + class ReverseCappedCursor : public BasicCursor, public AdvanceStrategy { + public: + ReverseCappedCursor( NamespaceDetails *nsd = 0, const DiskLoc &startLoc = DiskLoc() ); + virtual string toString() { + return "ReverseCappedCursor"; + } + virtual DiskLoc next( const DiskLoc &prev ) const; + virtual bool capped() const { return true; } + private: + NamespaceDetails *nsd; + }; + +} // namespace mongo diff --git a/src/mongo/db/d_concurrency.cpp b/src/mongo/db/d_concurrency.cpp new file mode 100755 index 00000000000..e3ad974cbfc --- /dev/null +++ b/src/mongo/db/d_concurrency.cpp @@ -0,0 +1,231 @@ +// @file d_concurrency.cpp + +#include "pch.h" +#include "d_concurrency.h" +#include "../util/concurrency/threadlocal.h" +#include "../util/concurrency/rwlock.h" +#include "../util/concurrency/value.h" +#include "../util/assert_util.h" +#include "client.h" +#include "namespacestring.h" +#include "d_globals.h" + +// oplog locking +// no top level read locks +// system.profile writing +// oplog now +// yielding +// commitIfNeeded + +namespace mongo { + + using namespace clcimpl; + + Client::LockStatus::LockStatus() { + excluder=global=collection=0; + } + + namespace clcimpl { + Shared::Shared(unsigned& _state, RWLock& lock) : state(_state) { + rw = 0; + if( state ) { + // already locked + dassert( (state & (AcquireShared|AcquireExclusive)) == 0 ); + return; + } + rw = &lock; + state = AcquireShared; + rw->lock_shared(); + state = LockedShared; + } + Shared::~Shared() { + if( rw ) { + state = Unlocked; + rw->unlock_shared(); + } + } + Exclusive::Exclusive(unsigned& _state, RWLock& lock) : state(_state) { + rw = 0; + if( state ) { + // already locked + dassert( (state & (AcquireShared|AcquireExclusive)) == 0 ); + assert( state == LockedExclusive ); // can't be in shared state + return; + } + rw = &lock; + state = 
AcquireExclusive; + rw->lock(); + state = LockedExclusive; + } + Exclusive::~Exclusive() { + if( rw ) { + state = Unlocked; + rw->unlock(); + } + } + } // clcimpl namespace + + // this tie-in is temporary until MongoMutex is folded in more directly. + // called when the lock has been achieved + void MongoMutex::lockedExclusively() { + Client& c = cc(); + curopGotLock(&c); // hopefully lockStatus replaces one day + c.lockStatus.global = clcimpl::LockedExclusive; + _minfo.entered(); // hopefully eliminate one day + } + + void MongoMutex::unlockingExclusively() { + Client& c = cc(); + _minfo.leaving(); + c.lockStatus.global = Unlocked; + } + + MongoMutex::MongoMutex(const char *name) : _m(name) { + static int n = 0; + assert( ++n == 1 ); // in releasingWriteLock below we assume MongoMutex is a singleton, and use the dbMutex ref above + _remapPrivateViewRequested = false; + } + + bool subcollectionOf(const string& parent, const char *child) { + if( parent == child ) + return true; + if( !str::startsWith(child, parent) ) + return false; + const char *p = child + parent.size(); + uassert(15963, str::stream() << "bad collection name: " << child, !str::endsWith(p, '.')); + return *p == '.' && p[1] == '$'; + } + + // (maybe tbd) ... + // we will use the global write lock for writing to system.* collections for simplicity + // for now; this has some advantages in speed as we don't need to latch just for that then; + // also there are cases to be handled carefully otherwise such as namespacedetails methods + // reaching into system.indexes implicitly + // exception : system.profile + static bool lkspecial(const string& ns) { + NamespaceString s(ns); + return s.isSystem() && s.coll != "system.profile"; + } + + /** Notes on d.writeExcluder + we want to be able to block any attempted write while allowing reads; additionally + force non-greedy acquisition so that reads can continue -- + that is, disallow greediness of write lock acquisitions. This is for that purpose. The + #1 need is by groupCommitWithLimitedLocks() but useful elsewhere such as for lock and fsync. + */ + + ExcludeAllWrites::ExcludeAllWrites() : + lk(cc().lockStatus.excluder, d.writeExcluder), + gslk() + { + LOG(3) << "ExcludeAllWrites" << endl; + wassert( !d.dbMutex.isWriteLocked() ); + } + ExcludeAllWrites::~ExcludeAllWrites() { + } + + // CLC turns on the "collection level concurrency" code + // (which is under development and not finished) +#if defined(CLC) + // called after a context is set. check that the correct collection is locked + void Client::checkLocks() const { + DEV { + if( !d.dbMutex.isWriteLocked() ) { + const char *n = ns(); + if( lockStatus.whichCollection.empty() ) { + log() << "DEBUG checkLocks error expected to already be locked: " << n << endl; + dassert(false); + } + dassert( subcollectionOf(lockStatus.whichCollection, n) || lkspecial(n) ); + } + } + } +#endif + + // we don't keep these locks in the namespacedetailstransient and Database + // objects -- that makes things safer as we need not prove to ourselves that they + // are always in scope when we need them. + // todo: we don't clean these locks up yet. + // todo: avoiding the mutex here might be nice.
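+ + // Editorial illustration (not part of this commit): the RAII types declared in + // d_concurrency.h are meant to nest -- a reader takes the global shared lock, then + // a per-collection shared lock, both released at scope exit. A minimal usage + // sketch, assuming a collection named "test.foo": + // + // void readFromFoo() { + // LockCollectionForReading lk("test.foo"); // gslk, then shared clk + // // ... safe to read test.foo here; locks drop when lk leaves scope + // } + // + // ExcludeAllWrites instead acquires d.writeExcluder exclusively before the global + // shared lock, so new writers block while existing readers continue.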
+ class LockObjectForEachCollection { + //mapsf<string,RWLock*> dblocks; + mapsf<string,RWLock*> nslocks; + public: + /*RWLock& fordb(string db) { + mapsf<string,RWLock*>::ref r(dblocks); + RWLock*& rw = r[db]; + if( rw == 0 ) + rw = new RWLock(0); + return *rw; + }*/ + RWLock& forns(string ns) { + mapsf<string,RWLock*>::ref r(nslocks); +#if defined(CLC) + massert(15964, str::stream() << "bad collection name to lock: " << ns, str::contains(ns, '.')); +#endif + RWLock*& rw = r[ns]; + if( rw == 0 ) { + rw = new RWLock(0); + } + return *rw; + } + } theLocks; + +#if defined(CLC) + LockCollectionForWriting::Locks::Locks(string ns) : + excluder(d.writeExcluder), + gslk(), + clk(theLocks.forns(ns),true) + { } + LockCollectionForWriting::~LockCollectionForWriting() { + if( locks.get() ) { + Client::LockStatus& s = cc().lockStatus; + s.whichCollection.clear(); + } + } + LockCollectionForWriting::LockCollectionForWriting(string coll) + { + Client::LockStatus& s = cc().lockStatus; + LockBits b(s.state); + if( !s.whichCollection.empty() ) { + if( !subcollectionOf(s.whichCollection, coll.c_str()) ) { + massert(15937, str::stream() << "can't nest lock of " << coll << " beneath " << s.whichCollection, false); + } + if( b.get(LockBits::Collection) != LockBits::Exclusive ) { + massert(15938, str::stream() << "want collection write lock but it is already read locked " << s.state, false); + } + return; + } + verify(15965, !lkspecial(coll)); // you must global write lock for writes to special's + s.whichCollection = coll; + b.set(LockBits::Collection, LockBits::NotLocked, LockBits::Exclusive); + locks.reset( new Locks(coll) ); + } +#endif + + LockCollectionForReading::LockCollectionForReading(string ns) : + gslk(), + clk( cc().lockStatus.collection, theLocks.forns(ns) ) + { + Client::LockStatus& s = cc().lockStatus; + if( s.whichCollection.empty() ) { + s.whichCollection = ns; + } + else { + if( !subcollectionOf(s.whichCollection, ns.c_str()) ) { + if( lkspecial(ns) ) + return; + massert(15939, + str::stream() << "can't nest lock of " << ns << " beneath " << s.whichCollection, + false); + } + } + } + LockCollectionForReading::~LockCollectionForReading() { + if( !clk.recursed() ) { + Client::LockStatus& s = cc().lockStatus; + s.whichCollection.clear(); + } + } + +} diff --git a/src/mongo/db/d_concurrency.h b/src/mongo/db/d_concurrency.h new file mode 100644 index 00000000000..ba2f64f5126 --- /dev/null +++ b/src/mongo/db/d_concurrency.h @@ -0,0 +1,67 @@ +// @file d_concurrency.h + +#pragma once + +#include "../util/concurrency/rwlock.h" +#include "db/mongomutex.h" + +namespace mongo { + + namespace clcimpl { + enum LockStates { Unlocked, AcquireShared=1, LockedShared=2, AcquireExclusive=4, LockedExclusive=8 }; + class Shared : boost::noncopyable { + unsigned& state; + RWLock *rw; + public: + Shared(unsigned& state, RWLock& lock); + ~Shared(); + bool recursed() const { return rw == 0; } + }; + class Exclusive : boost::noncopyable { + unsigned& state; + RWLock *rw; + public: + Exclusive(unsigned& state, RWLock& lock); + ~Exclusive(); + }; + } + + typedef readlock GlobalSharedLock; + + class ExcludeAllWrites : boost::noncopyable { + clcimpl::Exclusive lk; + GlobalSharedLock gslk; + public: + ExcludeAllWrites(); + ~ExcludeAllWrites(); + }; + + class todoGlobalWriteLock : boost::noncopyable { + public: + }; + + class LockCollectionForReading : boost::noncopyable { + GlobalSharedLock gslk; + clcimpl::Shared clk; + public: + LockCollectionForReading(string coll); + ~LockCollectionForReading(); + }; + +#if defined(CLC) 
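+ // Editorial note (illustrative, not in the original header): under CLC a write + // lock may be re-entered only for the same collection or one of its + // subcollections -- having locked "test.foo", locking "test.foo.$bar" nests, + // while locking "test.baz", or requesting a write lock where only a read lock is + // held, trips the 15937/15938 masserts in d_concurrency.cpp.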
+ class LockCollectionForWriting : boost::noncopyable { + struct Locks { + Locks(string ns); + SimpleRWLock::Shared excluder; + GlobalSharedLock gslk; + rwlock clk; + }; + scoped_ptr<Locks> locks; + public: + LockCollectionForWriting(string db); + ~LockCollectionForWriting(); + }; +#else +#endif + +} diff --git a/src/mongo/db/d_globals.cpp b/src/mongo/db/d_globals.cpp new file mode 100644 index 00000000000..7e0fd9e8cb0 --- /dev/null +++ b/src/mongo/db/d_globals.cpp @@ -0,0 +1,20 @@ +// @file d_globals.cpp + +#include "pch.h" +#include "d_globals.h" +#include "../util/concurrency/rwlock.h" +#include "clientcursor.h" +#include "mongomutex.h" + +namespace mongo { + + DGlobals::DGlobals() : + writeExcluder( *(new RWLock("writeexcluder")) ), + dbMutex( *(new MongoMutex("dbMutex")) ), + clientCursorMonitor( *(new ClientCursorMonitor()) ) + { + } + + DGlobals d; + +} diff --git a/src/mongo/db/d_globals.h b/src/mongo/db/d_globals.h new file mode 100644 index 00000000000..7c95d463cc3 --- /dev/null +++ b/src/mongo/db/d_globals.h @@ -0,0 +1,27 @@ +// @file d_globals.h +// +// these are global variables used in mongod ("d"). also used in test binary as that is effectively a variation on mongod code. +// that is, these are not in mongos. +// + +#pragma once + +namespace mongo { + + class RWLock; + class MongoMutex; + class ClientCursorMonitor; + + struct DGlobals : boost::noncopyable { + DGlobals(); + + // these are intentionally never deleted: + RWLock& writeExcluder; + MongoMutex &dbMutex; + ClientCursorMonitor& clientCursorMonitor; + + }; + + extern DGlobals d; + +}; diff --git a/src/mongo/db/database.cpp b/src/mongo/db/database.cpp new file mode 100644 index 00000000000..2d55fd35626 --- /dev/null +++ b/src/mongo/db/database.cpp @@ -0,0 +1,423 @@ +// database.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "pdfile.h" +#include "database.h" +#include "instance.h" +#include "clientcursor.h" +#include "databaseholder.h" + +namespace mongo { + + bool Database::_openAllFiles = true; + + void assertDbAtLeastReadLocked(const Database *) { + // temp impl + d.dbMutex.assertAtLeastReadLocked(); + } + + void assertDbWriteLocked(const Database *) { + // temp impl + d.dbMutex.assertWriteLocked(); + } + + Database::~Database() { + d.dbMutex.assertWriteLocked(); + magic = 0; + size_t n = _files.size(); + for ( size_t i = 0; i < n; i++ ) + delete _files[i]; + if( ccByLoc.size() ) { + log() << "\n\n\nWARNING: ccByLoc not empty on database close! " << ccByLoc.size() << ' ' << name << endl; + } + } + + Database::Database(const char *nm, bool& newDb, const string& _path ) + : name(nm), path(_path), namespaceIndex( path, name ), + profileName(name + ".system.profile") + { + try { + { + // check db name is valid + size_t L = strlen(nm); + uassert( 10028 , "db name is empty", L > 0 ); + uassert( 10032 , "db name too long", L < 64 ); + uassert( 10029 , "bad db name [1]", *nm != '.' 
); + uassert( 10030 , "bad db name [2]", nm[L-1] != '.' ); + uassert( 10031 , "bad char(s) in db name", strchr(nm, ' ') == 0 ); + } + newDb = namespaceIndex.exists(); + profile = cmdLine.defaultProfile; + checkDuplicateUncasedNames(true); + // If already exists, open. Otherwise behave as if empty until + // there's a write, then open. + if ( ! newDb || cmdLine.defaultProfile ) { + namespaceIndex.init(); + if( _openAllFiles ) + openAllFiles(); + } + magic = 781231; + } catch(std::exception& e) { + log() << "warning database " << path << ' ' << nm << " could not be opened" << endl; + log() << e.what() << endl; + // since destructor won't be called: + for ( size_t i = 0; i < _files.size(); i++ ) + delete _files[i]; + throw; + } + } + + void Database::checkDuplicateUncasedNames(bool inholderlock) const { + string duplicate = duplicateUncasedName(inholderlock, name, path ); + if ( !duplicate.empty() ) { + stringstream ss; + ss << "db already exists with different case other: [" << duplicate << "] me [" << name << "]"; + uasserted( DatabaseDifferCaseCode , ss.str() ); + } + } + + /*static*/ + string Database::duplicateUncasedName( bool inholderlock, const string &name, const string &path, set< string > *duplicates ) { + d.dbMutex.assertAtLeastReadLocked(); + + if ( duplicates ) { + duplicates->clear(); + } + + vector<string> others; + getDatabaseNames( others , path ); + + set<string> allShortNames; + dbHolder().getAllShortNames( inholderlock, allShortNames ); + + others.insert( others.end(), allShortNames.begin(), allShortNames.end() ); + + for ( unsigned i=0; i<others.size(); i++ ) { + + if ( strcasecmp( others[i].c_str() , name.c_str() ) ) + continue; + + if ( strcmp( others[i].c_str() , name.c_str() ) == 0 ) + continue; + + if ( duplicates ) { + duplicates->insert( others[i] ); + } else { + return others[i]; + } + } + if ( duplicates ) { + return duplicates->empty() ? "" : *duplicates->begin(); + } + return ""; + } + + boost::filesystem::path Database::fileName( int n ) const { + stringstream ss; + ss << name << '.' << n; + boost::filesystem::path fullName; + fullName = boost::filesystem::path(path); + if ( directoryperdb ) + fullName /= name; + fullName /= ss.str(); + return fullName; + } + + bool Database::openExistingFile( int n ) { + assert(this); + d.dbMutex.assertWriteLocked(); + { + // must not yet be visible to others as we aren't in the db's write lock and + // we will write to _files vector - thus this assert. + bool loaded = dbHolder().__isLoaded(name, path); + assert( !loaded ); + } + // additionally must be in the dbholder mutex (no assert for that yet) + + // todo: why here? that could be bad as we may be read locked only here + namespaceIndex.init(); + + if ( n < 0 || n >= DiskLoc::MaxFiles ) { + massert( 15924 , str::stream() << "getFile(): bad file number value " << n << " (corrupt db?): run repair", false); + } + + { + if( n < (int) _files.size() && _files[n] ) { + dlog(2) << "openExistingFile " << n << " is already open" << endl; + return true; + } + } + + { + boost::filesystem::path fullName = fileName( n ); + string fullNameString = fullName.string(); + MongoDataFile *df = new MongoDataFile(n); + try { + if( !df->openExisting( fullNameString.c_str() ) ) { + delete df; + return false; + } + } + catch ( AssertionException& ) { + delete df; + throw; + } + while ( n >= (int) _files.size() ) { + _files.push_back(0); + } + _files[n] = df; + } + + return true; + } + + // todo : we stop once a datafile dne. 
+ // if one datafile were missing we should keep going for + // repair purposes yet we do not. + void Database::openAllFiles() { + //log() << "TEMP openallfiles " << path << ' ' << name << endl; + assert(this); + int n = 0; + while( openExistingFile(n) ) { + n++; + } + + /* + int n = 0; + while( exists(n) ) { + getFile(n); + n++; + } + // If last file is empty, consider it preallocated and make sure it's not mapped + // until a write is requested + if ( n > 1 && getFile( n - 1 )->getHeader()->isEmpty() ) { + delete _files[ n - 1 ]; + _files.pop_back(); + }*/ + } + + // todo: this is called a lot. streamline the common case + MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) { + assert(this); + DEV assertDbAtLeastReadLocked(this); + + namespaceIndex.init(); + if ( n < 0 || n >= DiskLoc::MaxFiles ) { + out() << "getFile(): n=" << n << endl; + massert( 10295 , "getFile(): bad file number value (corrupt db?): run repair", false); + } + DEV { + if ( n > 100 ) { + out() << "getFile(): n=" << n << endl; + } + } + MongoDataFile* p = 0; + if ( !preallocateOnly ) { + while ( n >= (int) _files.size() ) { + DEV if( !d.dbMutex.isWriteLocked() ) { + log() << "error: getFile() called in a read lock, yet file to return is not yet open" << endl; + log() << " getFile(" << n << ") _files.size:" <<_files.size() << ' ' << fileName(n).string() << endl; + log() << " context ns: " << cc().ns() << " openallfiles:" << _openAllFiles << endl; + } + assertDbWriteLocked(this); + _files.push_back(0); + } + p = _files[n]; + } + if ( p == 0 ) { + assertDbWriteLocked(this); + boost::filesystem::path fullName = fileName( n ); + string fullNameString = fullName.string(); + p = new MongoDataFile(n); + int minSize = 0; + if ( n != 0 && _files[ n - 1 ] ) + minSize = _files[ n - 1 ]->getHeader()->fileLength; + if ( sizeNeeded + DataFileHeader::HeaderSize > minSize ) + minSize = sizeNeeded + DataFileHeader::HeaderSize; + try { + p->open( fullNameString.c_str(), minSize, preallocateOnly ); + } + catch ( AssertionException& ) { + delete p; + throw; + } + if ( preallocateOnly ) + delete p; + else + _files[n] = p; + } + return preallocateOnly ? 0 : p; + } + + MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) { + assertDbWriteLocked(this); + int n = (int) _files.size(); + MongoDataFile *ret = getFile( n, sizeNeeded ); + if ( preallocateNextFile ) + preallocateAFile(); + return ret; + } + + bool fileIndexExceedsQuota( const char *ns, int fileIndex, bool enforceQuota ) { + return + cmdLine.quota && + enforceQuota && + fileIndex >= cmdLine.quotaFiles && + // we don't enforce the quota on "special" namespaces as that could lead to problems -- e.g. + // rejecting an index insert after inserting the main record. + !NamespaceString::special( ns ) && + NamespaceString( ns ).db != "local"; + } + + MongoDataFile* Database::suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ) { + + // check existing files + for ( int i=numFiles()-1; i>=0; i-- ) { + MongoDataFile* f = getFile( i ); + if ( f->getHeader()->unusedLength >= sizeNeeded ) { + if ( fileIndexExceedsQuota( ns, i-1, enforceQuota ) ) // NOTE i-1 is the value used historically for this check. 
+ ; + else + return f; + } + } + + if ( fileIndexExceedsQuota( ns, numFiles(), enforceQuota ) ) + uasserted(12501, "quota exceeded"); + + // allocate files until we either get one big enough or hit maxSize + for ( int i = 0; i < 8; i++ ) { + MongoDataFile* f = addAFile( sizeNeeded, preallocate ); + + if ( f->getHeader()->unusedLength >= sizeNeeded ) + return f; + + if ( f->getHeader()->fileLength >= MongoDataFile::maxSize() ) // this is as big as they get so might as well stop + return f; + } + + uasserted(14810, "couldn't allocate space (suitableFile)"); // callers don't check for null return code + return 0; + } + + MongoDataFile* Database::newestFile() { + int n = numFiles(); + if ( n == 0 ) + return 0; + return getFile(n-1); + } + + + Extent* Database::allocExtent( const char *ns, int size, bool capped, bool enforceQuota ) { + // todo: when profiling, these may be worth logging into profile collection + bool fromFreeList = true; + Extent *e = DataFileMgr::allocFromFreeList( ns, size, capped ); + if( e == 0 ) { + fromFreeList = false; + e = suitableFile( ns, size, !capped, enforceQuota )->createExtent( ns, size, capped ); + } + LOG(1) << "allocExtent " << ns << " size " << size << ' ' << fromFreeList << endl; + return e; + } + + + bool Database::setProfilingLevel( int newLevel , string& errmsg ) { + if ( profile == newLevel ) + return true; + + if ( newLevel < 0 || newLevel > 2 ) { + errmsg = "profiling level has to be >=0 and <= 2"; + return false; + } + + if ( newLevel == 0 ) { + profile = 0; + return true; + } + + assert( cc().database() == this ); + + if ( ! namespaceIndex.details( profileName.c_str() ) ) { + log() << "creating profile collection: " << profileName << endl; + BSONObjBuilder spec; + spec.appendBool( "capped", true ); + spec.append( "size", 1024*1024 ); + if ( ! userCreateNS( profileName.c_str(), spec.done(), errmsg , false /* we don't replicate profile messages */ ) ) { + return false; + } + } + profile = newLevel; + return true; + } + + bool Database::exists(int n) const { + return boost::filesystem::exists( fileName( n ) ); + } + + int Database::numFiles() const { + DEV assertDbAtLeastReadLocked(this); + return (int) _files.size(); + } + + void Database::flushFiles( bool sync ) { + assertDbAtLeastReadLocked(this); + for( vector<MongoDataFile*>::iterator i = _files.begin(); i != _files.end(); i++ ) { + MongoDataFile *f = *i; + f->flush(sync); + } + } + + long long Database::fileSize() const { + long long size=0; + for (int n=0; exists(n); n++) + size += boost::filesystem::file_size( fileName(n) ); + return size; + } + + Database* DatabaseHolder::getOrCreate( const string& ns , const string& path , bool& justCreated ) { + d.dbMutex.assertAtLeastReadLocked(); + + DBs& m = _paths[path]; + + string dbname = _todb( ns ); + + { + DBs::iterator i = m.find(dbname); + if( i != m.end() ) { + justCreated = false; + return i->second; + } + } + + // todo: protect against getting sprayed with requests for different db names that DNE - + // that would make the DBs map very large. not clear what to do to handle though, + // perhaps just log it, which is what we do here with the "> 40" : + bool cant = !d.dbMutex.isWriteLocked(); + if( logLevel >= 1 || m.size() > 40 || cant || DEBUG_BUILD ) { + log() << "opening db: " << (path==dbpath?"":path) << ' ' << dbname << endl; + } + massert(15927, "can't open database in a read lock. if db was just closed, consider retrying the query. might otherwise indicate an internal error", !cant); + + Database *db = new Database( dbname.c_str() , justCreated , path ); + m[dbname] = db; + _size++; + return db; + } + +} // namespace mongo diff --git a/src/mongo/db/database.h b/src/mongo/db/database.h new file mode 100644 index 00000000000..a7867e20e8c --- /dev/null +++ b/src/mongo/db/database.h @@ -0,0 +1,145 @@ +// database.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "cmdline.h" +#include "namespace.h" + +namespace mongo { + + class Extent; + class MongoDataFile; + class ClientCursor; + struct ByLocKey; + typedef map<ByLocKey, ClientCursor*> CCByLoc; + + /** + * Database represents a single database + * Each database has its own set of files -- dbname.ns, dbname.0, dbname.1, ... + * NOT memory mapped + */ + class Database { + public: + static bool _openAllFiles; + + // you probably need to be in dbHolderMutex when constructing this + Database(const char *nm, /*out*/ bool& newDb, const string& _path = dbpath); + private: + ~Database(); // closes files and other cleanup see below. + public: + /* you must use this to close - there is essential code in this method that is not in the ~Database destructor. + thus the destructor is private. this could be cleaned up one day... + */ + static void closeDatabase( const char *db, const string& path ); + + void openAllFiles(); + + /** + * tries to make sure that this hasn't been deleted + */ + bool isOk() const { return magic == 781231; } + + bool isEmpty() { return ! namespaceIndex.allocated(); } + + /** + * total file size of Database in bytes + */ + long long fileSize() const; + + int numFiles() const; + + /** + * returns the filename for file number n + */ + boost::filesystem::path fileName( int n ) const; + + private: + bool exists(int n) const; + bool openExistingFile( int n ); + + public: + /** + * return file n. if it doesn't exist, create it + */ + MongoDataFile* getFile( int n, int sizeNeeded = 0, bool preallocateOnly = false ); + + MongoDataFile* addAFile( int sizeNeeded, bool preallocateNextFile ); + + /** + * makes sure we have an extra file at the end that is empty + * safe to call this multiple times - the implementation will only preallocate one file + */ + void preallocateAFile() { getFile( numFiles() , 0, true ); } + + MongoDataFile* suitableFile( const char *ns, int sizeNeeded, bool preallocate, bool enforceQuota ); + + Extent* allocExtent( const char *ns, int size, bool capped, bool enforceQuota ); + + MongoDataFile* newestFile(); + + /** + * @return true if success. false if bad level or error creating profile ns + */ + bool setProfilingLevel( int newLevel , string& errmsg ); + + void flushFiles( bool sync ); + + /** + * @return true if ns is part of the database + * ns=foo.bar, db=foo returns true + */ + bool ownsNS( const string& ns ) const { + if ( !
startsWith( ns , name ) ) + return false; + return ns[name.size()] == '.'; + } + private: + /** + * @throws DatabaseDifferCaseCode if the name is a duplicate based on + * case insensitive matching. + */ + void checkDuplicateUncasedNames(bool inholderlockalready) const; + public: + /** + * @return name of an existing database with same text name but different + * casing, if one exists. Otherwise the empty string is returned. If + * 'duplicates' is specified, it is filled with all duplicate names. + */ + static string duplicateUncasedName( bool inholderlockalready, const string &name, const string &path, set< string > *duplicates = 0 ); + + const string name; // "alleyinsider" + const string path; + + private: + + // must be in the dbLock when touching this (and write locked when writing to of course) + // however during Database object construction we aren't, which is ok as it isn't yet visible + // to others and we are in the dbholder lock then. + vector<MongoDataFile*> _files; + + public: // this should be private later + + NamespaceIndex namespaceIndex; + int profile; // 0=off. + const string profileName; // "alleyinsider.system.profile" + CCByLoc ccByLoc; + int magic; // used for making sure the object is still loaded in memory + }; + +} // namespace mongo diff --git a/src/mongo/db/databaseholder.h b/src/mongo/db/databaseholder.h new file mode 100644 index 00000000000..7c878c4ed63 --- /dev/null +++ b/src/mongo/db/databaseholder.h @@ -0,0 +1,126 @@ +// @file databaseholder.h + +#pragma once + +namespace mongo { + + /** + * path + dbname -> Database + */ + class DatabaseHolder { + typedef map<string,Database*> DBs; + typedef map<string,DBs> Paths; + public: + DatabaseHolder() : _size(0) { } + + bool __isLoaded( const string& ns , const string& path ) const { + Paths::const_iterator x = _paths.find( path ); + if ( x == _paths.end() ) + return false; + const DBs& m = x->second; + + string db = _todb( ns ); + + DBs::const_iterator it = m.find(db); + return it != m.end(); + } + // must be write locked as otherwise isLoaded could go false->true on you + // in the background and you might not expect that. 
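+ // (Editorial example, not in the original source: _put() below asserts only a + // read lock, so while we hold just a read lock another reader thread may + // concurrently register a Database and flip __isLoaded from false to true.)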
+ bool _isLoaded( const string& ns , const string& path ) const { + d.dbMutex.assertWriteLocked(); + return __isLoaded(ns,path); + } + + Database * get( const string& ns , const string& path ) const { + d.dbMutex.assertAtLeastReadLocked(); + Paths::const_iterator x = _paths.find( path ); + if ( x == _paths.end() ) + return 0; + const DBs& m = x->second; + string db = _todb( ns ); + DBs::const_iterator it = m.find(db); + if ( it != m.end() ) + return it->second; + return 0; + } + + void _put( const string& ns , const string& path , Database * db ) { + d.dbMutex.assertAtLeastReadLocked(); + DBs& m = _paths[path]; + Database*& d = m[_todb(ns)]; + if( d ) { + dlog(2) << "info dbholder put db was already set " << ns << endl; + } + else { + _size++; + } + d = db; + } + + Database* getOrCreate( const string& ns , const string& path , bool& justCreated ); + + void erase( const string& ns , const string& path ) { + d.dbMutex.assertWriteLocked(); // write lock req'd as a Database obj can be in use dbHolderMutex is mainly just to control the holder itself + DBs& m = _paths[path]; + _size -= (int)m.erase( _todb( ns ) ); + } + + /** @param force - force close even if something underway - use at shutdown */ + bool closeAll( const string& path , BSONObjBuilder& result, bool force ); + + // "info" as this is informational only could change on you if you are not write locked + int sizeInfo() const { return _size; } + + void forEach(boost::function<void(Database *)> f) const { + d.dbMutex.assertWriteLocked(); + for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) { + DBs m = i->second; + for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) { + f(j->second); + } + } + } + + /** + * gets all unique db names, ignoring paths + */ + void getAllShortNames( bool locked, set<string>& all ) const { + d.dbMutex.assertAtLeastReadLocked(); + for ( Paths::const_iterator i=_paths.begin(); i!=_paths.end(); i++ ) { + DBs m = i->second; + for( DBs::const_iterator j=m.begin(); j!=m.end(); j++ ) { + all.insert( j->first ); + } + } + } + + private: + static string _todb( const string& ns ) { + string d = __todb( ns ); + uassert( 13280 , (string)"invalid db name: " + ns , NamespaceString::validDBName( d ) ); + return d; + } + static string __todb( const string& ns ) { + size_t i = ns.find( '.' ); + if ( i == string::npos ) { + uassert( 13074 , "db name can't be empty" , ns.size() ); + return ns; + } + uassert( 13075 , "db name can't be empty" , i > 0 ); + return ns.substr( 0 , i ); + } + Paths _paths; + int _size; + }; + + DatabaseHolder& dbHolderUnchecked(); + inline const DatabaseHolder& dbHolder() { + dassert( d.dbMutex.atLeastReadLocked() ); + return dbHolderUnchecked(); + } + inline DatabaseHolder& dbHolderW() { + dassert( d.dbMutex.isWriteLocked() ); + return dbHolderUnchecked(); + } + +} diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp new file mode 100644 index 00000000000..af03b447976 --- /dev/null +++ b/src/mongo/db/db.cpp @@ -0,0 +1,1309 @@ +// @file db.cpp : Defines main() for the mongod program. + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. 
+* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "db.h" +#include "introspect.h" +#include "repl.h" +#include "../util/unittest.h" +#include "../util/file_allocator.h" +#include "../util/background.h" +#include "../util/text.h" +#include "dbmessage.h" +#include "instance.h" +#include "clientcursor.h" +#include "pdfile.h" +#include "stats/counters.h" +#include "repl/rs.h" +#include "../scripting/engine.h" +#include "module.h" +#include "cmdline.h" +#include "stats/snapshots.h" +#include "../util/concurrency/task.h" +#include "../util/version.h" +#include "../util/ramlog.h" +#include "../util/net/message_server.h" +#include "client.h" +#include "restapi.h" +#include "dbwebserver.h" +#include "dur.h" +#include "concurrency.h" +#include "../s/d_writeback.h" +#include "d_globals.h" + +#if defined(_WIN32) +# include "../util/ntservice.h" +#else +# include <sys/file.h> +#endif + +namespace mongo { + + namespace dur { + extern unsigned long long DataLimitPerJournalFile; + } + + /* only off if --nocursors which is for debugging. */ + extern bool useCursors; + + /* only off if --nohints */ + extern bool useHints; + + extern int diagLogging; + extern unsigned lenForNewNsFiles; + extern int lockFile; + extern bool checkNsFilesOnLoad; + extern string repairpath; + + void setupSignals( bool inFork ); + void startReplication(); + void exitCleanly( ExitCode code ); + + CmdLine cmdLine; + static bool scriptingEnabled = true; + bool noHttpInterface = false; + bool shouldRepairDatabases = 0; + static bool forceRepair = 0; + Timer startupSrandTimer; + + const char *ourgetns() { + Client *c = currentClient.get(); + if ( ! c ) + return ""; + Client::Context* cc = c->getContext(); + return cc ? cc->ns() : ""; + } + + struct MyStartupTests { + MyStartupTests() { + assert( sizeof(OID) == 12 ); + } + } mystartupdbcpp; + + QueryResult* emptyMoreResult(long long); + + + /* todo: make this a real test. the stuff in dbtests/ seem to do all dbdirectclient which exhaust doesn't support yet. */ +// QueryOption_Exhaust +#define TESTEXHAUST 0 +#if( TESTEXHAUST ) + void testExhaust() { + sleepsecs(1); + unsigned n = 0; + auto f = [&n](const BSONObj& o) { + assert( o.valid() ); + //cout << o << endl; + n++; + bool testClosingSocketOnError = false; + if( testClosingSocketOnError ) + assert(false); + }; + DBClientConnection db(false); + db.connect("localhost"); + const char *ns = "local.foo"; + if( db.count(ns) < 10000 ) + for( int i = 0; i < 20000; i++ ) + db.insert(ns, BSON("aaa" << 3 << "b" << "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + + try { + db.query(f, ns, Query() ); + } + catch(...) { + cout << "hmmm" << endl; + } + + try { + db.query(f, ns, Query() ); + } + catch(...) 
{ + cout << "caught" << endl; + } + + cout << n << endl; + }; +#endif + + void sysRuntimeInfo() { + out() << "sysinfo:" << endl; +#if defined(_SC_PAGE_SIZE) + out() << " page size: " << (int) sysconf(_SC_PAGE_SIZE) << endl; +#endif +#if defined(_SC_PHYS_PAGES) + out() << " _SC_PHYS_PAGES: " << sysconf(_SC_PHYS_PAGES) << endl; +#endif +#if defined(_SC_AVPHYS_PAGES) + out() << " _SC_AVPHYS_PAGES: " << sysconf(_SC_AVPHYS_PAGES) << endl; +#endif + } + + /* if server is really busy, wait a bit */ + void beNice() { + sleepmicros( Client::recommendedYieldMicros() ); + } + + class MyMessageHandler : public MessageHandler { + public: + virtual void connected( AbstractMessagingPort* p ) { + Client& c = Client::initThread("conn", p); + c.getAuthenticationInfo()->isLocalHost = p->remote().isLocalHost(); + } + + virtual void process( Message& m , AbstractMessagingPort* port , LastError * le) { + while ( true ) { + if ( inShutdown() ) { + log() << "got request after shutdown()" << endl; + break; + } + + lastError.startRequest( m , le ); + + DbResponse dbresponse; + assembleResponse( m, dbresponse, port->remote() ); + + if ( dbresponse.response ) { + port->reply(m, *dbresponse.response, dbresponse.responseTo); + if( dbresponse.exhaust ) { + MsgData *header = dbresponse.response->header(); + QueryResult *qr = (QueryResult *) header; + long long cursorid = qr->cursorId; + if( cursorid ) { + assert( dbresponse.exhaust && *dbresponse.exhaust != 0 ); + string ns = dbresponse.exhaust; // before reset() free's it... + m.reset(); + BufBuilder b(512); + b.appendNum((int) 0 /*size set later in appendData()*/); + b.appendNum(header->id); + b.appendNum(header->responseTo); + b.appendNum((int) dbGetMore); + b.appendNum((int) 0); + b.appendStr(ns); + b.appendNum((int) 0); // ntoreturn + b.appendNum(cursorid); + m.appendData(b.buf(), b.len()); + b.decouple(); + DEV log() << "exhaust=true sending more" << endl; + beNice(); + continue; // this goes back to top loop + } + } + } + break; + } + } + + virtual void disconnected( AbstractMessagingPort* p ) { + Client * c = currentClient.get(); + if( c ) c->shutdown(); + globalScriptEngine->threadDone(); + } + + }; + + void listen(int port) { + //testTheDb(); + MessageServer::Options options; + options.port = port; + options.ipList = cmdLine.bind_ip; + + MessageServer * server = createServer( options , new MyMessageHandler() ); + server->setAsTimeTracker(); + + startReplication(); + if ( !noHttpInterface ) + boost::thread web( boost::bind(&webServerThread, new RestAdminAccess() /* takes ownership */)); + +#if(TESTEXHAUST) + boost::thread thr(testExhaust); +#endif + server->run(); + } + + + bool doDBUpgrade( const string& dbName , string errmsg , DataFileHeader * h ) { + static DBDirectClient db; + + if ( h->version == 4 && h->versionMinor == 4 ) { + assert( PDFILE_VERSION == 4 ); + assert( PDFILE_VERSION_MINOR == 5 ); + + list<string> colls = db.getCollectionNames( dbName ); + for ( list<string>::iterator i=colls.begin(); i!=colls.end(); i++) { + string c = *i; + log() << "\t upgrading collection:" << c << endl; + BSONObj out; + bool ok = db.runCommand( dbName , BSON( "reIndex" << c.substr( dbName.size() + 1 ) ) , out ); + if ( ! ok ) { + errmsg = "reindex failed"; + log() << "\t\t reindex failed: " << out << endl; + return false; + } + } + + h->versionMinor = 5; + return true; + } + + // do this in the general case + return repairDatabase( dbName.c_str(), errmsg ); + } + + // ran at startup. 
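+ // Editorial sketch (an assumed shape, inferred from the fields used below; the + // real definition presumably lives in pdfile.h, included above): each datafile + // starts with a header roughly like + // + // struct DataFileHeader { + // int version; // PDFILE_VERSION when current + // int versionMinor; // PDFILE_VERSION_MINOR when current + // int fileLength; + // // ... + // }; + // + // The startup pass below inspects file 0's header for each database and offers + // repair/upgrade when the version is stale or --repair was requested.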
+ static void repairDatabasesAndCheckVersion() { + // LastError * le = lastError.get( true ); + Client::GodScope gs; + log(1) << "enter repairDatabases (to check pdfile version #)" << endl; + + //assert(checkNsFilesOnLoad); + checkNsFilesOnLoad = false; // we are mainly just checking the header - don't scan the whole .ns file for every db here. + + dblock lk; + vector< string > dbNames; + getDatabaseNames( dbNames ); + for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { + string dbName = *i; + log(1) << "\t" << dbName << endl; + Client::Context ctx( dbName ); + MongoDataFile *p = cc().database()->getFile( 0 ); + DataFileHeader *h = p->getHeader(); + if ( !h->isCurrentVersion() || forceRepair ) { + + if( h->version <= 0 ) { + uasserted(14026, + str::stream() << "db " << dbName << " appears corrupt pdfile version: " << h->version + << " info: " << h->versionMinor << ' ' << h->fileLength); + } + + log() << "****" << endl; + log() << "****" << endl; + log() << "need to upgrade database " << dbName << " with pdfile version " << h->version << "." << h->versionMinor << ", " + << "new version: " << PDFILE_VERSION << "." << PDFILE_VERSION_MINOR << endl; + if ( shouldRepairDatabases ) { + // QUESTION: Repair even if file format is higher version than code? + log() << "\t starting upgrade" << endl; + string errmsg; + assert( doDBUpgrade( dbName , errmsg , h ) ); + } + else { + log() << "\t Not upgrading, exiting" << endl; + log() << "\t run --upgrade to upgrade dbs, then start again" << endl; + log() << "****" << endl; + dbexit( EXIT_NEED_UPGRADE ); + shouldRepairDatabases = 1; + return; + } + } + else { + Database::closeDatabase( dbName.c_str(), dbpath ); + } + } + + log(1) << "done repairDatabases" << endl; + + if ( shouldRepairDatabases ) { + log() << "finished checking dbs" << endl; + cc().shutdown(); + dbexit( EXIT_CLEAN ); + } + + checkNsFilesOnLoad = true; + } + + void clearTmpFiles() { + boost::filesystem::path path( dbpath ); + for ( boost::filesystem::directory_iterator i( path ); + i != boost::filesystem::directory_iterator(); ++i ) { + string fileName = boost::filesystem::path(*i).leaf(); + if ( boost::filesystem::is_directory( *i ) && + fileName.length() && fileName[ 0 ] == '$' ) + boost::filesystem::remove_all( *i ); + } + } + + void checkIfReplMissingFromCommandLine() { + if( !cmdLine.usingReplSets() ) { + Client::GodScope gs; + DBDirectClient c; + unsigned long long x = + c.count("local.system.replset"); + if( x ) { + log() << endl; + log() << "** warning: mongod started without --replSet yet " << x << " documents are present in local.system.replset" << endl; + log() << "** restart with --replSet unless you are doing maintenance and no other clients are connected" << endl; + log() << endl; + } + } + } + + void clearTmpCollections() { + writelock lk; // _openAllFiles is false at this point, so this is helpful for the query below to work as you can't open files when readlocked + Client::GodScope gs; + vector< string > toDelete; + DBDirectClient cli; + auto_ptr< DBClientCursor > c = cli.query( "local.system.namespaces", Query( fromjson( "{name:/^local.temp./}" ) ) ); + while( c->more() ) { + BSONObj o = c->next(); + toDelete.push_back( o.getStringField( "name" ) ); + } + for( vector< string >::iterator i = toDelete.begin(); i != toDelete.end(); ++i ) { + log() << "Dropping old temporary collection: " << *i << endl; + cli.dropCollection( *i ); + } + } + + /** + * does background async flushes of mmapped files + */ + class DataFileSync : public BackgroundJob { 
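+ // Editorial note (illustrative arithmetic, not part of this commit): run() below + // aims for a fixed flush cadence rather than a fixed gap between flushes -- it + // sleeps max(0, syncdelay*1000 - time_flushing) ms, so at the default + // --syncdelay 60 a flush that took 2000ms is followed by a ~58000ms sleep.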
+ public: + string name() const { return "DataFileSync"; } + void run() { + if( cmdLine.syncdelay == 0 ) + log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl; + else if( cmdLine.syncdelay == 1 ) + log() << "--syncdelay 1" << endl; + else if( cmdLine.syncdelay != 60 ) + log(1) << "--syncdelay " << cmdLine.syncdelay << endl; + int time_flushing = 0; + while ( ! inShutdown() ) { + _diaglog.flush(); + if ( cmdLine.syncdelay == 0 ) { + // in case at some point we add an option to change at runtime + sleepsecs(5); + continue; + } + + sleepmillis( (long long) std::max(0.0, (cmdLine.syncdelay * 1000) - time_flushing) ); + + if ( inShutdown() ) { + // occasional issue trying to flush during shutdown when sleep interrupted + break; + } + + Date_t start = jsTime(); + int numFiles = MemoryMappedFile::flushAll( true ); + time_flushing = (int) (jsTime() - start); + + globalFlushCounters.flushed(time_flushing); + + if( logLevel >= 1 || time_flushing >= 10000 ) { + log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl; + } + } + } + + } dataFileSync; + + const char * jsInterruptCallback() { + // should be safe to interrupt in js code, even if we have a write lock + return killCurrentOp.checkForInterruptNoAssert(); + } + + unsigned jsGetInterruptSpecCallback() { + return cc().curop()->opNum(); + } + + void _initAndListen(int listenPort ) { + + Client::initThread("initandlisten"); + + Database::_openAllFiles = false; + + Logstream::get().addGlobalTee( new RamLog("global") ); + + bool is32bit = sizeof(int*) == 4; + + { +#if !defined(_WIN32) + pid_t pid = getpid(); +#else + DWORD pid=GetCurrentProcessId(); +#endif + Nullstream& l = log(); + l << "MongoDB starting : pid=" << pid << " port=" << cmdLine.port << " dbpath=" << dbpath; + if( replSettings.master ) l << " master=" << replSettings.master; + if( replSettings.slave ) l << " slave=" << (int) replSettings.slave; + l << ( is32bit ? " 32" : " 64" ) << "-bit host=" << getHostNameCached() << endl; + } + DEV log() << "_DEBUG build (which is slower)" << endl; + show_warnings(); + log() << mongodVersion() << endl; + printGitVersion(); + printSysInfo(); + printCommandLineOpts(); + + { + stringstream ss; + ss << endl; + ss << "*********************************************************************" << endl; + ss << " ERROR: dbpath (" << dbpath << ") does not exist." << endl; + ss << " Create this directory or give existing directory in --dbpath." 
<< endl; + ss << " See http://www.mongodb.org/display/DOCS/Starting+and+Stopping+Mongo" << endl; + ss << "*********************************************************************" << endl; + uassert( 10296 , ss.str().c_str(), boost::filesystem::exists( dbpath ) ); + } + { + stringstream ss; + ss << "repairpath (" << repairpath << ") does not exist"; + uassert( 12590 , ss.str().c_str(), boost::filesystem::exists( repairpath ) ); + } + + acquirePathLock(forceRepair); + remove_all( dbpath + "/_tmp/" ); + + FileAllocator::get()->start(); + + MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( clearTmpFiles(), "clear tmp files" ); + + dur::startup(); + + if( cmdLine.durOptions & CmdLine::DurRecoverOnly ) + return; + + // comes after getDur().startup() because this reads from the database + clearTmpCollections(); + + checkIfReplMissingFromCommandLine(); + + Module::initAll(); + + if ( scriptingEnabled ) { + ScriptEngine::setup(); + globalScriptEngine->setCheckInterruptCallback( jsInterruptCallback ); + globalScriptEngine->setGetInterruptSpecCallback( jsGetInterruptSpecCallback ); + } + + repairDatabasesAndCheckVersion(); + + /* we didn't want to pre-open all files for the repair check above. for regular + operation we do for read/write lock concurrency reasons. + */ + Database::_openAllFiles = true; + + if ( shouldRepairDatabases ) + return; + + /* this is for security on certain platforms (nonce generation) */ + srand((unsigned) (curTimeMicros() ^ startupSrandTimer.micros())); + + snapshotThread.go(); + d.clientCursorMonitor.go(); + PeriodicTask::theRunner->go(); + +#ifndef _WIN32 + CmdLine::launchOk(); +#endif + listen(listenPort); + + // listen() will return when exit code closes its socket. + exitCleanly(EXIT_NET_ERROR); + } + + void testPretouch(); + + void initAndListen(int listenPort) { + try { + _initAndListen(listenPort); + } + catch ( DBException &e ) { + log() << "exception in initAndListen: " << e.toString() << ", terminating" << endl; + dbexit( EXIT_UNCAUGHT ); + } + catch ( std::exception &e ) { + log() << "exception in initAndListen std::exception: " << e.what() << ", terminating" << endl; + dbexit( EXIT_UNCAUGHT ); + } + catch ( int& n ) { + log() << "exception in initAndListen int: " << n << ", terminating" << endl; + dbexit( EXIT_UNCAUGHT ); + } + catch(...) { + log() << "exception in initAndListen, terminating" << endl; + dbexit( EXIT_UNCAUGHT ); + } + } + +#if defined(_WIN32) + bool initService() { + ServiceController::reportStatus( SERVICE_RUNNING ); + initAndListen( cmdLine.port ); + return true; + } +#endif + +} // namespace mongo + +using namespace mongo; + +#include <boost/program_options.hpp> +#undef assert +#define assert MONGO_assert + +namespace po = boost::program_options; + +void show_help_text(po::options_description options) { + show_warnings(); + cout << options << endl; +}; + +/* Return error string or "" if no errors. 
*/ +string arg_error_check(int argc, char* argv[]) { + return ""; +} + +int main(int argc, char* argv[]) { + static StaticObserver staticObserver; + doPreServerStartupInits(); + getcurns = ourgetns; + + po::options_description general_options("General options"); +#if defined(_WIN32) + po::options_description windows_scm_options("Windows Service Control Manager options"); +#endif + po::options_description replication_options("Replication options"); + po::options_description ms_options("Master/slave options"); + po::options_description rs_options("Replica set options"); + po::options_description sharding_options("Sharding options"); + po::options_description visible_options("Allowed options"); + po::options_description hidden_options("Hidden options"); + + po::positional_options_description positional_options; + + CmdLine::addGlobalOptions( general_options , hidden_options ); + + general_options.add_options() + ("auth", "run with security") + ("cpu", "periodically show cpu and iowait utilization") + ("dbpath", po::value<string>() , "directory for datafiles") + ("diaglog", po::value<int>(), "0=off 1=W 2=R 3=both 7=W+some reads") + ("directoryperdb", "each database will be stored in a separate directory") + ("journal", "enable journaling") + ("journalOptions", po::value<int>(), "journal diagnostic options") + ("journalCommitInterval", po::value<unsigned>(), "how often to group/batch commit (ms)") + ("ipv6", "enable IPv6 support (disabled by default)") + ("jsonp","allow JSONP access via http (has security implications)") + ("noauth", "run without security") + ("nohttpinterface", "disable http interface") + ("nojournal", "disable journaling (journaling is on by default for 64 bit)") + ("noprealloc", "disable data file preallocation - will often hurt performance") + ("noscripting", "disable scripting engine") + ("notablescan", "do not allow table scans") + ("nssize", po::value<int>()->default_value(16), ".ns file size (in MB) for new databases") + ("profile",po::value<int>(), "0=off 1=slow, 2=all") + ("quota", "limits each database to a certain number of files (8 default)") + ("quotaFiles", po::value<int>(), "number of files allower per db, requires --quota") + ("rest","turn on simple rest api") + ("repair", "run repair on all dbs") + ("repairpath", po::value<string>() , "root directory for repair files - defaults to dbpath" ) + ("slowms",po::value<int>(&cmdLine.slowMS)->default_value(100), "value of slow for profile and console log" ) + ("smallfiles", "use a smaller default file size") +#if defined(__linux__) + ("shutdown", "kill a running server (for init scripts)") +#endif + ("syncdelay",po::value<double>(&cmdLine.syncdelay)->default_value(60), "seconds between disk syncs (0=never, but not recommended)") + ("sysinfo", "print some diagnostic system information") + ("upgrade", "upgrade db if needed") + ; + +#if defined(_WIN32) + CmdLine::addWindowsOptions( windows_scm_options, hidden_options ); +#endif + + replication_options.add_options() + ("oplogSize", po::value<int>(), "size limit (in MB) for op log") + ; + + ms_options.add_options() + ("master", "master mode") + ("slave", "slave mode") + ("source", po::value<string>(), "when slave: specify master as <server:port>") + ("only", po::value<string>(), "when slave: specify a single database to replicate") + ("slavedelay", po::value<int>(), "specify delay (in seconds) to be used when applying master ops to slave") + ("autoresync", "automatically resync if slave data is stale") + ; + + rs_options.add_options() + ("replSet", po::value<string>(), 
"arg is <setname>[/<optionalseedhostlist>]") + ; + + sharding_options.add_options() + ("configsvr", "declare this is a config db of a cluster; default port 27019; default dir /data/configdb") + ("shardsvr", "declare this is a shard db of a cluster; default port 27018") + ("noMoveParanoia" , "turn off paranoid saving of data for moveChunk. this is on by default for now, but default will switch" ) + ; + + hidden_options.add_options() + ("fastsync", "indicate that this instance is starting from a dbpath snapshot of the repl peer") + ("pretouch", po::value<int>(), "n pretouch threads for applying replicationed operations") // experimental + ("command", po::value< vector<string> >(), "command") + ("cacheSize", po::value<long>(), "cache size (in MB) for rec store") + ("nodur", "disable journaling") + // things we don't want people to use + ("nocursors", "diagnostic/debugging option that turns off cursors DO NOT USE IN PRODUCTION") + ("nohints", "ignore query hints") + ("nopreallocj", "don't preallocate journal files") + ("dur", "enable journaling") // old name for --journal + ("durOptions", po::value<int>(), "durability diagnostic options") // deprecated name + // deprecated pairing command line options + ("pairwith", "DEPRECATED") + ("arbiter", "DEPRECATED") + ("opIdMem", "DEPRECATED") + ; + + + positional_options.add("command", 3); + visible_options.add(general_options); +#if defined(_WIN32) + visible_options.add(windows_scm_options); +#endif + visible_options.add(replication_options); + visible_options.add(ms_options); + visible_options.add(rs_options); + visible_options.add(sharding_options); + Module::addOptions( visible_options ); + + setupCoreSignals(); + setupSignals( false ); + + dbExecCommand = argv[0]; + + srand(curTimeMicros()); +#if( BOOST_VERSION >= 104500 ) + boost::filesystem::path::default_name_check( boost::filesystem2::no_check ); +#else + boost::filesystem::path::default_name_check( boost::filesystem::no_check ); +#endif + + { + unsigned x = 0x12345678; + unsigned char& b = (unsigned char&) x; + if ( b != 0x78 ) { + out() << "big endian cpus not yet supported" << endl; + return 33; + } + } + + if( argc == 1 ) + cout << dbExecCommand << " --help for help and startup options" << endl; + + { + po::variables_map params; + + string error_message = arg_error_check(argc, argv); + if (error_message != "") { + cout << error_message << endl << endl; + show_help_text(visible_options); + return 0; + } + + if ( ! 
+            return 0;
+
+        if (params.count("help")) {
+            show_help_text(visible_options);
+            return 0;
+        }
+        if (params.count("version")) {
+            cout << mongodVersion() << endl;
+            printGitVersion();
+            return 0;
+        }
+        if ( params.count( "dbpath" ) ) {
+            dbpath = params["dbpath"].as<string>();
+            if ( params.count( "fork" ) && dbpath[0] != '/' ) {
+                // we need to change dbpath if we fork, since we change
+                // cwd to "/"
+                // fork only exists on *nix
+                // so '/' is safe
+                dbpath = cmdLine.cwd + "/" + dbpath;
+            }
+        }
+        else {
+            dbpath = "/data/db/";
+        }
+#ifdef _WIN32
+        if (dbpath.size() > 1 && dbpath[dbpath.size()-1] == '/') {
+            // size() check is for the unlikely possibility of --dbpath "/"
+            dbpath = dbpath.erase(dbpath.size()-1);
+        }
+#endif
+
+        if ( params.count("directoryperdb")) {
+            directoryperdb = true;
+        }
+        if (params.count("cpu")) {
+            cmdLine.cpu = true;
+        }
+        if (params.count("noauth")) {
+            noauth = true;
+        }
+        if (params.count("auth")) {
+            noauth = false;
+        }
+        if (params.count("quota")) {
+            cmdLine.quota = true;
+        }
+        if (params.count("quotaFiles")) {
+            cmdLine.quota = true;
+            cmdLine.quotaFiles = params["quotaFiles"].as<int>() - 1;
+        }
+        bool journalExplicit = false;
+        if( params.count("nodur") || params.count( "nojournal" ) ) {
+            journalExplicit = true;
+            cmdLine.dur = false;
+        }
+        if( params.count("dur") || params.count( "journal" ) ) {
+            if (journalExplicit) {
+                log() << "Can't specify both --journal and --nojournal options." << endl;
+                return EXIT_BADOPTIONS;
+            }
+            journalExplicit = true;
+            cmdLine.dur = true;
+        }
+        if (params.count("durOptions")) {
+            cmdLine.durOptions = params["durOptions"].as<int>();
+        }
+        if( params.count("journalCommitInterval") ) {
+            // don't check whether dur is false here: many users simply take the default,
+            // which is off on win32, so there is no point complicating a dev environment
+            // with an error.
+            cmdLine.journalCommitInterval = params["journalCommitInterval"].as<unsigned>();
+            if( cmdLine.journalCommitInterval <= 1 || cmdLine.journalCommitInterval > 300 ) {
+                out() << "--journalCommitInterval out of allowed range (2-300ms)" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+        }
+        if (params.count("journalOptions")) {
+            cmdLine.durOptions = params["journalOptions"].as<int>();
+        }
+        if (params.count("repairpath")) {
+            repairpath = params["repairpath"].as<string>();
+            if (!repairpath.size()) {
+                out() << "repairpath is empty" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+        }
+        if (params.count("nocursors")) {
+            useCursors = false;
+        }
+        if (params.count("nohints")) {
+            useHints = false;
+        }
+        if (params.count("nopreallocj")) {
+            cmdLine.preallocj = false;
+        }
+        if (params.count("nohttpinterface")) {
+            noHttpInterface = true;
+        }
+        if (params.count("rest")) {
+            cmdLine.rest = true;
+        }
+        if (params.count("jsonp")) {
+            cmdLine.jsonp = true;
+        }
+        if (params.count("noscripting")) {
+            scriptingEnabled = false;
+        }
+        if (params.count("noprealloc")) {
+            cmdLine.prealloc = false;
+            cout << "note: noprealloc may hurt performance in many applications" << endl;
+        }
+        if (params.count("smallfiles")) {
+            cmdLine.smallfiles = true;
+            assert( dur::DataLimitPerJournalFile >= 128 * 1024 * 1024 );
+            dur::DataLimitPerJournalFile = 128 * 1024 * 1024;
+        }
+        if (params.count("diaglog")) {
+            int x = params["diaglog"].as<int>();
+            if ( x < 0 || x > 7 ) {
+                out() << "can't interpret --diaglog setting" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+            _diaglog.setLevel(x);
+        }
+        if (params.count("sysinfo")) {
+            sysRuntimeInfo();
+            return 0;
+        }
+        if (params.count("repair")) {
+            Record::MemoryTrackingEnabled = false;
+            shouldRepairDatabases = 1;
+            forceRepair = 1;
+        }
+        if (params.count("upgrade")) {
+            Record::MemoryTrackingEnabled = false;
+            shouldRepairDatabases = 1;
+        }
+        if (params.count("notablescan")) {
+            cmdLine.noTableScan = true;
+        }
+        if (params.count("master")) {
+            replSettings.master = true;
+        }
+        if (params.count("slave")) {
+            replSettings.slave = SimpleSlave;
+        }
+        if (params.count("slavedelay")) {
+            replSettings.slavedelay = params["slavedelay"].as<int>();
+        }
+        if (params.count("fastsync")) {
+            replSettings.fastsync = true;
+        }
+        if (params.count("autoresync")) {
+            replSettings.autoresync = true;
+            if( params.count("replSet") ) {
+                out() << "--autoresync is not used with --replSet" << endl;
+                out() << "see http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+        }
+        if (params.count("source")) {
+            /* specifies what the source in local.sources should be */
+            cmdLine.source = params["source"].as<string>().c_str();
+        }
+        if( params.count("pretouch") ) {
+            cmdLine.pretouch = params["pretouch"].as<int>();
+        }
+        if (params.count("replSet")) {
+            if (params.count("slavedelay")) {
+                out() << "--slavedelay cannot be used with --replSet" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+            else if (params.count("only")) {
+                out() << "--only cannot be used with --replSet" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+            /* seed list of hosts for the repl set */
+            cmdLine._replSet = params["replSet"].as<string>().c_str();
+        }
+        if (params.count("only")) {
+            cmdLine.only = params["only"].as<string>().c_str();
+        }
+        if( params.count("nssize") ) {
+            int x = params["nssize"].as<int>();
+            if (x <= 0 || x > (0x7fffffff/1024/1024)) {
+                out() << "bad --nssize arg" << endl;
+                dbexit( EXIT_BADOPTIONS );
+            }
+            lenForNewNsFiles = x * 1024 * 1024;
+            assert(lenForNewNsFiles > 0);
+        }
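Aside: every block above and below follows the same boost::program_options flow, which is: declare each option in an options_description, parse argv into a po::variables_map, then branch on params.count(). A minimal standalone sketch of that flow, using hypothetical option names rather than mongod's real wiring:

// Minimal boost::program_options sketch (hypothetical options, not mongod code).
#include <iostream>
#include <string>
#include <boost/program_options.hpp>

namespace po = boost::program_options;

int main(int argc, char* argv[]) {
    po::options_description visible("Allowed options");
    visible.add_options()
        ("help", "show this help")
        ("port", po::value<int>()->default_value(27017), "listen port")
        ("dbpath", po::value<std::string>(), "directory for datafiles");

    po::variables_map params;
    po::store(po::parse_command_line(argc, argv, visible), params);
    po::notify(params);

    if (params.count("help")) {            // flag present?
        std::cout << visible << std::endl; // streaming the description prints the help text
        return 0;
    }
    if (params.count("dbpath"))            // typed value, extracted with as<T>()
        std::cout << "dbpath=" << params["dbpath"].as<std::string>() << std::endl;
    std::cout << "port=" << params["port"].as<int>() << std::endl;
    return 0;
}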
if (params.count("oplogSize")) { + long long x = params["oplogSize"].as<int>(); + if (x <= 0) { + out() << "bad --oplogSize arg" << endl; + dbexit( EXIT_BADOPTIONS ); + } + // note a small size such as x==1 is ok for an arbiter. + if( x > 1000 && sizeof(void*) == 4 ) { + out() << "--oplogSize of " << x << "MB is too big for 32 bit version. Use 64 bit build instead." << endl; + dbexit( EXIT_BADOPTIONS ); + } + cmdLine.oplogSize = x * 1024 * 1024; + assert(cmdLine.oplogSize > 0); + } + if (params.count("cacheSize")) { + long x = params["cacheSize"].as<long>(); + if (x <= 0) { + out() << "bad --cacheSize arg" << endl; + dbexit( EXIT_BADOPTIONS ); + } + log() << "--cacheSize option not currently supported" << endl; + } + if (params.count("port") == 0 ) { + if( params.count("configsvr") ) { + cmdLine.port = CmdLine::ConfigServerPort; + } + if( params.count("shardsvr") ) { + if( params.count("configsvr") ) { + log() << "can't do --shardsvr and --configsvr at the same time" << endl; + dbexit( EXIT_BADOPTIONS ); + } + cmdLine.port = CmdLine::ShardServerPort; + } + } + else { + if ( cmdLine.port <= 0 || cmdLine.port > 65535 ) { + out() << "bad --port number" << endl; + dbexit( EXIT_BADOPTIONS ); + } + } + if ( params.count("configsvr" ) ) { + cmdLine.configsvr = true; + if (cmdLine.usingReplSets() || replSettings.master || replSettings.slave) { + log() << "replication should not be enabled on a config server" << endl; + ::exit(-1); + } + if ( params.count( "nodur" ) == 0 && params.count( "nojournal" ) == 0 ) + cmdLine.dur = true; + if ( params.count( "dbpath" ) == 0 ) + dbpath = "/data/configdb"; + } + if ( params.count( "profile" ) ) { + cmdLine.defaultProfile = params["profile"].as<int>(); + } + if (params.count("ipv6")) { + enableIPv6(); + } + if (params.count("noMoveParanoia")) { + cmdLine.moveParanoia = false; + } + if (params.count("pairwith") || params.count("arbiter") || params.count("opIdMem")) { + out() << "****" << endl; + out() << "Replica Pairs have been deprecated. Invalid options: --pairwith, --arbiter, and/or --opIdMem" << endl; + out() << "<http://www.mongodb.org/display/DOCS/Replica+Pairs>" << endl; + out() << "****" << endl; + dbexit( EXIT_BADOPTIONS ); + } + + // needs to be after things like --configsvr parsing, thus here. 
+        if( repairpath.empty() )
+            repairpath = dbpath;
+
+        Module::configAll( params );
+        dataFileSync.go();
+
+        if (params.count("command")) {
+            vector<string> command = params["command"].as< vector<string> >();
+
+            if (command[0].compare("run") == 0) {
+                if (command.size() > 1) {
+                    cout << "Too many parameters to 'run' command" << endl;
+                    cout << visible_options << endl;
+                    return 0;
+                }
+
+                initAndListen(cmdLine.port);
+                return 0;
+            }
+
+            if (command[0].compare("dbpath") == 0) {
+                cout << dbpath << endl;
+                return 0;
+            }
+
+            cout << "Invalid command: " << command[0] << endl;
+            cout << visible_options << endl;
+            return 0;
+        }
+
+        if( cmdLine.pretouch )
+            log() << "--pretouch " << cmdLine.pretouch << endl;
+
+#ifdef __linux__
+        if (params.count("shutdown")){
+            bool failed = false;
+
+            string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string();
+            if ( !boost::filesystem::exists( name ) || boost::filesystem::file_size( name ) == 0 )
+                failed = true;
+
+            pid_t pid;
+            string procPath;
+            if (!failed){
+                try {
+                    ifstream f (name.c_str());
+                    f >> pid;
+                    procPath = (str::stream() << "/proc/" << pid);
+                    if (!boost::filesystem::exists(procPath))
+                        failed = true;
+
+                    string exePath = procPath + "/exe";
+                    if (boost::filesystem::exists(exePath)){
+                        char buf[256];
+                        int ret = readlink(exePath.c_str(), buf, sizeof(buf)-1);
+                        if (ret == -1) {
+                            int e = errno;
+                            cerr << "Error resolving " << exePath << ": " << errnoWithDescription(e);
+                            failed = true;
+                        }
+                        else {
+                            buf[ret] = '\0'; // readlink doesn't null-terminate the string
+                            if (!endsWith(buf, "mongod")){
+                                cerr << "Process " << pid << " is running " << buf << " not mongod" << endl;
+                                ::exit(-1);
+                            }
+                        }
+                    }
+                }
+                catch (const std::exception& e){
+                    cerr << "Error reading pid from lock file [" << name << "]: " << e.what() << endl;
+                    failed = true;
+                }
+            }
+
+            if (failed) {
+                cerr << "There doesn't seem to be a server running with dbpath: " << dbpath << endl;
+                ::exit(-1);
+            }
+
+            cout << "killing process with pid: " << pid << endl;
+            int ret = kill(pid, SIGTERM);
+            if (ret) {
+                int e = errno;
+                cerr << "failed to kill process: " << errnoWithDescription(e) << endl;
+                ::exit(-1);
+            }
+
+            while (boost::filesystem::exists(procPath)) {
+                sleepsecs(1);
+            }
+
+            ::exit(0);
+        }
+#endif
+
+#if defined(_WIN32)
+        if (serviceParamsCheck( params, dbpath, argc, argv )) {
+            return 0;
+        }
+#endif
+
+
+        if (sizeof(void*) == 4 && !journalExplicit){
+            // trying to make this stand out more like startup warnings
+            log() << endl;
+            warning() << "32-bit servers don't have journaling enabled by default. Please use --journal if you want durability." << endl;
+            log() << endl;
+        }
+
+    }
+
+    UnitTest::runTests();
+    initAndListen(cmdLine.port);
+    dbexit(EXIT_CLEAN);
+    return 0;
+}
+
+namespace mongo {
+
+    string getDbContext();
+
+#undef out
+
+
+#if !defined(_WIN32)
+
+} // namespace mongo
+
+#include <signal.h>
+#include <string.h>
+
+namespace mongo {
+
+    void pipeSigHandler( int signal ) {
+#ifdef psignal
+        psignal( signal, "Signal Received : ");
+#else
+        cout << "got pipe signal:" << signal << endl;
+#endif
+    }
+
+    void abruptQuit(int x) {
+        ostringstream ossSig;
+        ossSig << "Got signal: " << x << " (" << strsignal( x ) << ")." << endl;
+        rawOut( ossSig.str() );
+
+        /*
+        ostringstream ossOp;
+        ossOp << "Last op: " << currentOp.infoNoauth() << endl;
+        rawOut( ossOp.str() );
+        */
+
+        ostringstream oss;
+        oss << "Backtrace:" << endl;
+        printStackTrace( oss );
+        rawOut( oss.str() );
+
+        // Don't go through normal shutdown procedure. It may make things worse.
+        ::exit(EXIT_ABRUPT);
+
+    }
+
+    void abruptQuitWithAddrSignal( int signal, siginfo_t *siginfo, void * ) {
+        ostringstream oss;
+        oss << "Invalid";
+        if ( signal == SIGSEGV || signal == SIGBUS ) {
+            oss << " access";
+        } else {
+            oss << " operation";
+        }
+        oss << " at address: " << siginfo->si_addr << endl;
+        rawOut( oss.str() );
+        abruptQuit( signal );
+    }
+
+    sigset_t asyncSignals;
+    // The signals in asyncSignals are processed by this dedicated thread only,
+    // so that the db and log mutexes aren't held when they are handled.
+    void interruptThread() {
+        int x;
+        sigwait( &asyncSignals, &x );
+        log() << "got kill or ctrl c or hup signal " << x << " (" << strsignal( x ) << "), will terminate after current cmd ends" << endl;
+        Client::initThread( "interruptThread" );
+        exitCleanly( EXIT_KILL );
+    }
+
+    // this will be called in certain c++ error cases, for example if there are two active
+    // exceptions
+    void myterminate() {
+        rawOut( "terminate() called, printing stack:" );
+        printStackTrace();
+        ::abort();
+    }
+
+    // this gets called when new fails to allocate memory
+    void my_new_handler() {
+        rawOut( "out of memory, printing stack and exiting:" );
+        printStackTrace();
+        ::exit(EXIT_ABRUPT);
+    }
+
+    void setupSignals_ignoreHelper( int signal ) {}
+
+    void setupSignals( bool inFork ) {
+        struct sigaction addrSignals;
+        memset( &addrSignals, 0, sizeof( struct sigaction ) );
+        addrSignals.sa_sigaction = abruptQuitWithAddrSignal;
+        sigemptyset( &addrSignals.sa_mask );
+        addrSignals.sa_flags = SA_SIGINFO;
+
+        assert( sigaction(SIGSEGV, &addrSignals, 0) == 0 );
+        assert( sigaction(SIGBUS, &addrSignals, 0) == 0 );
+        assert( sigaction(SIGILL, &addrSignals, 0) == 0 );
+        assert( sigaction(SIGFPE, &addrSignals, 0) == 0 );
+
+        assert( signal(SIGABRT, abruptQuit) != SIG_ERR );
+        assert( signal(SIGQUIT, abruptQuit) != SIG_ERR );
+        assert( signal(SIGPIPE, pipeSigHandler) != SIG_ERR );
+
+        setupSIGTRAPforGDB();
+
+        sigemptyset( &asyncSignals );
+
+        if ( inFork )
+            assert( signal( SIGHUP , setupSignals_ignoreHelper ) != SIG_ERR );
+        else
+            sigaddset( &asyncSignals, SIGHUP );
+
+        sigaddset( &asyncSignals, SIGINT );
+        sigaddset( &asyncSignals, SIGTERM );
+        assert( pthread_sigmask( SIG_SETMASK, &asyncSignals, 0 ) == 0 );
+        boost::thread it( interruptThread );
+
+        set_terminate( myterminate );
+        set_new_handler( my_new_handler );
+    }
+
+#else
+    void consoleTerminate( const char* controlCodeName ) {
+        Client::initThread( "consoleTerminate" );
+        log() << "got " << controlCodeName << ", will terminate after current cmd ends" << endl;
+        exitCleanly( EXIT_KILL );
+    }
+
+    BOOL CtrlHandler( DWORD fdwCtrlType ) {
+
+        switch( fdwCtrlType ) {
+
+        case CTRL_C_EVENT:
+            rawOut( "Ctrl-C signal" );
+            consoleTerminate( "CTRL_C_EVENT" );
+            return TRUE ;
+
+        case CTRL_CLOSE_EVENT:
+            rawOut( "CTRL_CLOSE_EVENT signal" );
+            consoleTerminate( "CTRL_CLOSE_EVENT" );
+            return TRUE ;
+
+        case CTRL_BREAK_EVENT:
+            rawOut( "CTRL_BREAK_EVENT signal" );
+            consoleTerminate( "CTRL_BREAK_EVENT" );
+            return TRUE;
+
+        case CTRL_LOGOFF_EVENT:
+            rawOut( "CTRL_LOGOFF_EVENT signal" );
+            consoleTerminate( "CTRL_LOGOFF_EVENT" );
+            return TRUE;
+
+        case CTRL_SHUTDOWN_EVENT:
+            rawOut( "CTRL_SHUTDOWN_EVENT signal" );
+            consoleTerminate( "CTRL_SHUTDOWN_EVENT" );
+            return TRUE;
+
+        default:
+            return FALSE;
+        }
+    }
+
+    LPTOP_LEVEL_EXCEPTION_FILTER filtLast = 0;
+    ::HANDLE standardOut = GetStdHandle(STD_OUTPUT_HANDLE);
+    LONG WINAPI exceptionFilter(struct _EXCEPTION_POINTERS *ExceptionInfo) {
+        {
+            // given the severity of the event we write to console in addition to the
+            // --logFile (rawOut writes to the logfile, if one was specified)
+            DWORD written;
+            WriteFile(standardOut, "unhandled windows exception\n", 28, &written, 0); // 28 == length of the message
+            FlushFileBuffers(standardOut);
+        }
+
+        DWORD ec = ExceptionInfo->ExceptionRecord->ExceptionCode;
+        if( ec == EXCEPTION_ACCESS_VIOLATION ) {
+            rawOut("access violation");
+        }
+        else {
+            rawOut("unhandled windows exception");
+            char buf[64];
+            strcpy(buf, "ec=0x");
+            _ui64toa(ec, buf+5, 16);
+            rawOut(buf);
+        }
+        if( filtLast )
+            return filtLast(ExceptionInfo);
+        return EXCEPTION_EXECUTE_HANDLER;
+    }
+
+    // called by mongoAbort()
+    extern void (*reportEventToSystem)(const char *msg);
+    void reportEventToSystemImpl(const char *msg) {
+        static ::HANDLE hEventLog = RegisterEventSource( NULL, TEXT("mongod") );
+        if( hEventLog ) {
+            std::wstring s = toNativeString(msg);
+            LPCTSTR txt = s.c_str();
+            BOOL ok = ReportEvent(
+                hEventLog, EVENTLOG_ERROR_TYPE,
+                0, 0, NULL,
+                1,
+                0,
+                &txt,
+                0);
+            wassert(ok);
+        }
+    }
+
+    void myPurecallHandler() {
+        printStackTrace();
+        mongoAbort("pure virtual");
+    }
+
+    void setupSignals( bool inFork ) {
+        reportEventToSystem = reportEventToSystemImpl;
+        filtLast = SetUnhandledExceptionFilter(exceptionFilter);
+        massert(10297 , "Couldn't register Windows Ctrl-C handler", SetConsoleCtrlHandler((PHANDLER_ROUTINE) CtrlHandler, TRUE));
+        _set_purecall_handler( myPurecallHandler );
+    }
+
+#endif
+
+} // namespace mongo
diff --git a/src/mongo/db/db.h b/src/mongo/db/db.h
new file mode 100644
index 00000000000..6a31a06f77c
--- /dev/null
+++ b/src/mongo/db/db.h
@@ -0,0 +1,120 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../util/net/message.h"
+#include "concurrency.h"
+#include "pdfile.h"
+#include "curop.h"
+#include "client.h"
+#include "databaseholder.h"
+
+namespace mongo {
+
+    struct dbtemprelease {
+        Client::Context * _context;
+        int _locktype;
+
+        dbtemprelease() {
+            const Client& c = cc();
+            _context = c.getContext();
+            _locktype = d.dbMutex.getState();
+            assert( _locktype );
+
+            if ( _locktype > 0 ) {
+                massert( 10298 , "can't temprelease nested write lock", _locktype == 1);
+                if ( _context ) _context->unlocked();
+                d.dbMutex.unlock();
+            }
+            else {
+                massert( 10299 , "can't temprelease nested read lock", _locktype == -1);
+                if ( _context ) _context->unlocked();
+                d.dbMutex.unlock_shared();
+            }
+
+            verify( 14814 , c.curop() );
+            c.curop()->yielded();
+
+        }
+        ~dbtemprelease() {
+            if ( _locktype > 0 )
+                d.dbMutex.lock();
+            else
+                d.dbMutex.lock_shared();
+
+            if ( _context ) _context->relocked();
+        }
+    };
+
+    /** must be write locked
+        no assert (and no release) if nested write lock
+        a lot like dbtempreleasecond but no malloc so should be a tiny bit faster
+    */
+    struct dbtempreleasewritelock {
+        Client::Context * _context;
+        int _locktype;
+        dbtempreleasewritelock() {
+            const Client& c = cc();
+            _context = c.getContext();
+            _locktype = d.dbMutex.getState();
+            assert( _locktype >= 1 );
+            if( _locktype > 1 )
+                return; // nested
+            if ( _context )
+                _context->unlocked();
+            d.dbMutex.unlock();
+            verify( 14845 , c.curop() );
+            c.curop()->yielded();
+        }
+        ~dbtempreleasewritelock() {
+            if ( _locktype == 1 )
+                d.dbMutex.lock();
+            if ( _context )
+                _context->relocked();
+        }
+    };
+
+    /**
+       only does a temp release if we're not nested and have a lock
+     */
+    struct dbtempreleasecond {
+        dbtemprelease * real;
+        int locktype;
+
+        dbtempreleasecond() {
+            real = 0;
+            locktype = d.dbMutex.getState();
+            if ( locktype == 1 || locktype == -1 )
+                real = new dbtemprelease();
+        }
+
+        ~dbtempreleasecond() {
+            if ( real ) {
+                delete real;
+                real = 0;
+            }
+        }
+
+        bool unlocked() {
+            return real != 0;
+        }
+    };
+
+} // namespace mongo
+
+#include "concurrency.h"
diff --git a/src/mongo/db/db.rc b/src/mongo/db/db.rc
new file mode 100755
index 00000000000..b589458cf73
--- /dev/null
+++ b/src/mongo/db/db.rc
@@ -0,0 +1,12 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Icon
+//
+// Icon with lowest ID value placed first to ensure application icon
+// remains consistent on all systems.
+IDI_ICON2 ICON "mongo.ico"
+/////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
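The temp-release helpers in db.h above implement cooperative yielding of the global database mutex: constructing one releases the lock, and its destructor reacquires it. A minimal usage sketch, assuming the caller already holds the write lock as db.h requires; longRunningScan and processOneDocument are hypothetical names, not code from this commit:

// Hypothetical caller illustrating the yield pattern from db.h above.
// Assumes the global write lock (d.dbMutex) is already held.
void longRunningScan() {
    for ( int i = 0; i < 100000; i++ ) {
        processOneDocument( i );       // hypothetical per-document work
        if ( i % 1000 == 0 ) {
            dbtempreleasecond yield;   // releases d.dbMutex only if not nested
            if ( yield.unlocked() )
                sleepmillis( 1 );      // other readers/writers may run here
        }                              // destructor reacquires the lock
    }
}

dbtempreleasecond (rather than dbtemprelease) is the safe default in such a caller because it degrades to a no-op when the lock turns out to be nested.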
diff --git a/src/mongo/db/db.vcxproj b/src/mongo/db/db.vcxproj
new file mode 100755
index 00000000000..8963f0af580
--- /dev/null
+++ b/src/mongo/db/db.vcxproj
@@ -0,0 +1,934 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectName>mongod</ProjectName>
+ <ProjectGuid>{215B2D68-0A70-4D10-8E75-B31010C62A91}</ProjectGuid>
+ <RootNamespace>db</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <UseOfAtl>false</UseOfAtl>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.;..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..;$(IncludePath)</IncludePath>
+ <IncludePath Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..;$(IncludePath)</IncludePath>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>No</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;Psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ <IgnoreSpecificDefaultLibraries>%(IgnoreSpecificDefaultLibraries)</IgnoreSpecificDefaultLibraries>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>_UNICODE;UNICODE;MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_32;\boost\lib\vs2010_32;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <IgnoreAllDefaultLibraries>false</IgnoreAllDefaultLibraries>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <AdditionalIncludeDirectories>..\..\js\src;..\third_party\pcre-7.4;c:\boost;\boost</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>MONGO_EXPOSE_MACROS;OLDJS;STATIC_JS_API;XP_WIN;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;HAVE_CONFIG_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>Use</PrecompiledHeader>
+ <PrecompiledHeaderFile>pch.h</PrecompiledHeaderFile>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ <DisableSpecificWarnings>4355;4800;4267;4244;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+ <MultiProcessorCompilation>true</MultiProcessorCompilation>
+ <MinimalRebuild>No</MinimalRebuild>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>ws2_32.lib;psapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <AdditionalLibraryDirectories>c:\boost\lib\vs2010_64;\boost\lib\vs2010_64;\boost\lib</AdditionalLibraryDirectories>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ </Link>
+ <PreBuildEvent>
+ <Command>cscript //Nologo ..\shell\msvc\createCPPfromJavaScriptFiles.js "$(ProjectDir).."</Command>
+ <Message>Create mongo.cpp and mongo-server.cpp from JavaScript source files</Message>
+ </PreBuildEvent>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\alignedbuilder.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\compress.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\cloud.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\distinct.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="curop.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="ops\count.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="querypattern.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp">
+ <PrecompiledHeader>NotUsing</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\md5.c">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeader>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ </PrecompiledHeaderFile>
+ <PrecompiledHeaderFile Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ </PrecompiledHeaderFile>
+ </ClCompile>
+ <ClCompile Include="..\util\md5main.cpp">
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Use</PrecompiledHeader>
+ <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
+ </ClCompile>
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="..\third_party\snappy\config.h" />
+ <ClInclude Include="..\third_party\snappy\snappy.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="d_globals.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js32r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64d.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ <Library Include="..\..\js\js64r.lib">
+ <FileType>Document</FileType>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+ </Library>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/src/mongo/db/db.vcxproj.filters b/src/mongo/db/db.vcxproj.filters
new file mode 100755
index 00000000000..a39df0dc796
--- /dev/null
+++ b/src/mongo/db/db.vcxproj.filters
@@ -0,0 +1,432 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="..\bson\oid.cpp" />
+ <ClCompile Include="..\client\dbclientcursor.cpp" />
+ <ClCompile Include="..\client\dbclient_rs.cpp" />
+ <ClCompile Include="..\client\distlock.cpp" />
+ <ClCompile Include="..\client\model.cpp" />
+ <ClCompile Include="..\scripting\bench.cpp" />
+ <ClCompile Include="..\shell\mongo.cpp" />
+ <ClCompile Include="..\s\chunk.cpp" />
+ <ClCompile Include="..\s\config.cpp" />
+ <ClCompile Include="..\s\d_chunk_manager.cpp" />
+ <ClCompile Include="..\s\d_migrate.cpp" />
+ <ClCompile Include="..\s\d_split.cpp" />
+ <ClCompile Include="..\s\d_state.cpp" />
+ <ClCompile Include="..\s\d_writeback.cpp" />
+ <ClCompile Include="..\s\grid.cpp" />
+ <ClCompile Include="..\s\shard.cpp" />
+ <ClCompile Include="..\s\shardconnection.cpp" />
+ <ClCompile Include="..\s\shardkey.cpp" />
+ <ClCompile Include="..\util\alignedbuilder.cpp" />
+ <ClCompile Include="..\util\concurrency\spin_lock.cpp" />
+ <ClCompile Include="..\util\concurrency\synchronization.cpp" />
+ <ClCompile Include="..\util\concurrency\task.cpp" />
+ <ClCompile Include="..\util\concurrency\thread_pool.cpp" />
+ <ClCompile Include="..\util\concurrency\vars.cpp" />
+ <ClCompile Include="..\util\log.cpp" />
+ <ClCompile Include="..\util\logfile.cpp" />
+ <ClCompile Include="..\util\processinfo.cpp" />
+ <ClCompile Include="..\util\stringutils.cpp" />
+ <ClCompile Include="..\util\text.cpp" />
+ <ClCompile Include="..\util\version.cpp" />
+ <ClCompile Include="cap.cpp" />
+ <ClCompile Include="commands\distinct.cpp" />
+ <ClCompile Include="commands\group.cpp" />
+ <ClCompile Include="commands\isself.cpp" />
+ <ClCompile Include="commands\mr.cpp" />
+ <ClCompile Include="compact.cpp" />
+ <ClCompile Include="dbcommands_generic.cpp" />
+ <ClCompile Include="dur.cpp" />
+ <ClCompile Include="durop.cpp" />
+ <ClCompile Include="dur_commitjob.cpp" />
+ <ClCompile Include="dur_journal.cpp" />
+ <ClCompile Include="dur_preplogbuffer.cpp" />
+ <ClCompile Include="dur_recover.cpp" />
+ <ClCompile Include="dur_writetodatafiles.cpp" />
+ <ClCompile Include="geo\2d.cpp" />
+ <ClCompile Include="geo\haystack.cpp" />
+ <ClCompile Include="mongommf.cpp" />
+ <ClCompile Include="oplog.cpp" />
+ <ClCompile Include="projection.cpp" />
+ <ClCompile Include="repl.cpp" />
+ <ClCompile Include="repl\consensus.cpp" />
+ <ClCompile Include="repl\heartbeat.cpp" />
+ <ClCompile Include="repl\manager.cpp" />
+ <ClCompile Include="repl\rs_initialsync.cpp" />
+ <ClCompile Include="repl\rs_initiate.cpp" />
+ <ClCompile Include="repl\rs_rollback.cpp" />
+ <ClCompile Include="repl\rs_sync.cpp" />
+ <ClCompile Include="repl_block.cpp" />
+ <ClCompile Include="restapi.cpp" />
+ <ClCompile Include="..\client\connpool.cpp" />
+ <ClCompile Include="..\client\dbclient.cpp" />
+ <ClCompile Include="..\client\syncclusterconnection.cpp" />
+ <ClCompile Include="..\pch.cpp" />
+ <ClCompile Include="client.cpp" />
+ <ClCompile Include="clientcursor.cpp" />
+ <ClCompile Include="cloner.cpp" />
+ <ClCompile Include="commands.cpp" />
+ <ClCompile Include="common.cpp" />
+ <ClCompile Include="cursor.cpp" />
+ <ClCompile Include="database.cpp" />
+ <ClCompile Include="db.cpp" />
+ <ClCompile Include="dbcommands.cpp" />
+ <ClCompile Include="dbcommands_admin.cpp" />
+ <ClCompile Include="dbeval.cpp" />
+ <ClCompile Include="dbhelpers.cpp" />
+ <ClCompile Include="dbwebserver.cpp" />
+ <ClCompile Include="extsort.cpp" />
+ <ClCompile Include="index.cpp" />
+ <ClCompile Include="indexkey.cpp" />
+ <ClCompile Include="instance.cpp" />
+ <ClCompile Include="introspect.cpp" />
+ <ClCompile Include="jsobj.cpp" />
+ <ClCompile Include="json.cpp" />
+ <ClCompile Include="lasterror.cpp" />
+ <ClCompile Include="matcher.cpp" />
+ <ClCompile Include="matcher_covered.cpp" />
+ <ClCompile Include="..\util\mmap_win.cpp" />
+ <ClCompile Include="modules\mms.cpp" />
+ <ClCompile Include="module.cpp" />
+ <ClCompile Include="namespace.cpp" />
+ <ClCompile Include="nonce.cpp" />
+ <ClCompile Include="..\client\parallel.cpp" />
+ <ClCompile Include="pdfile.cpp" />
+ <ClCompile Include="queryoptimizer.cpp" />
+ <ClCompile Include="security.cpp" />
+ <ClCompile Include="security_commands.cpp" />
+ <ClCompile Include="tests.cpp" />
+ <ClCompile Include="cmdline.cpp" />
+ <ClCompile Include="queryutil.cpp" />
+ <ClCompile Include="..\util\assert_util.cpp" />
+ <ClCompile Include="..\util\background.cpp" />
+ <ClCompile Include="..\util\base64.cpp" />
+ <ClCompile Include="..\util\mmap.cpp" />
+ <ClCompile Include="..\util\ntservice.cpp" />
+ <ClCompile Include="..\util\processinfo_win32.cpp" />
+ <ClCompile Include="..\util\util.cpp" />
+ <ClCompile Include="..\util\md5.c" />
+ <ClCompile Include="..\util\md5main.cpp" />
+ <ClCompile Include="..\s\d_logic.cpp" />
+ <ClCompile Include="..\scripting\engine.cpp" />
+ <ClCompile Include="..\scripting\engine_spidermonkey.cpp" />
+ <ClCompile Include="..\scripting\utils.cpp" />
+ <ClCompile Include="stats\counters.cpp" />
+ <ClCompile Include="stats\snapshots.cpp" />
+ <ClCompile Include="stats\top.cpp" />
+ <ClCompile Include="btree.cpp" />
+ <ClCompile Include="btreecursor.cpp" />
+ <ClCompile Include="repl\health.cpp" />
+ <ClCompile Include="repl\rs.cpp" />
+ <ClCompile Include="repl\replset_commands.cpp" />
+ <ClCompile Include="repl\rs_config.cpp" />
+ <ClCompile Include="..\util\file_allocator.cpp" />
+ <ClCompile Include="querypattern.cpp" />
+ <ClCompile Include="..\util\ramlog.cpp" />
+ <ClCompile Include="key.cpp" />
+ <ClCompile Include="btreebuilder.cpp" />
+ <ClCompile Include="queryoptimizercursor.cpp" />
+ <ClCompile Include="record.cpp" />
+ <ClCompile Include="ops\delete.cpp" />
+ <ClCompile Include="ops\update.cpp" />
+ <ClCompile Include="security_common.cpp" />
+ <ClCompile Include="ops\query.cpp" />
+ <ClCompile Include="..\util\net\httpclient.cpp" />
+ <ClCompile Include="..\util\net\message.cpp" />
+ <ClCompile Include="..\util\net\message_server_port.cpp" />
+ <ClCompile Include="..\util\net\sock.cpp" />
+ <ClCompile Include="..\util\net\miniwebserver.cpp" />
+ <ClCompile Include="..\util\net\listen.cpp" />
+ <ClCompile Include="..\util\net\message_port.cpp" />
+ <ClCompile Include="dbmessage.cpp" />
+ <ClCompile Include="commands\find_and_modify.cpp" />
+ <ClCompile Include="..\util\compress.cpp">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy-sinksource.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="..\third_party\snappy\snappy.cc">
+ <Filter>snappy</Filter>
+ </ClCompile>
+ <ClCompile Include="scanandorder.cpp" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcrecpp.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_chartables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_compile.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_config.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_dfa_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_exec.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_fullinfo.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_get.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_globals.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_info.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_maketables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_newline.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ord2utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_refcount.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_scanner.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_stringpiece.cc" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_study.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_tables.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_try_flipped.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_ucp_searchfuncs.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_valid_utf8.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_version.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcre_xclass.c" />
+ <ClCompile Include="..\third_party\pcre-7.4\pcreposix.c" />
+ <ClCompile Include="commands\cloud.cpp" />
+ <ClCompile Include="commands\pipeline_command.cpp" />
+ <ClCompile Include="commands\pipeline.cpp" />
+ <ClCompile Include="pipeline\accumulator.cpp" />
+ <ClCompile Include="pipeline\accumulator_add_to_set.cpp" />
+ <ClCompile Include="pipeline\accumulator_avg.cpp" />
+ <ClCompile Include="pipeline\accumulator_first.cpp" />
+ <ClCompile Include="pipeline\accumulator_last.cpp" />
+ <ClCompile Include="pipeline\accumulator_min_max.cpp" />
+ <ClCompile Include="pipeline\accumulator_push.cpp" />
+ <ClCompile Include="pipeline\accumulator_single_value.cpp" />
+ <ClCompile Include="pipeline\accumulator_sum.cpp" />
+ <ClCompile Include="pipeline\builder.cpp" />
+ <ClCompile Include="pipeline\doc_mem_monitor.cpp" />
+ <ClCompile Include="pipeline\document.cpp" />
+ <ClCompile Include="pipeline\document_source.cpp" />
+ <ClCompile Include="pipeline\document_source_bson_array.cpp" />
+ <ClCompile Include="pipeline\document_source_command_futures.cpp" />
+ <ClCompile Include="pipeline\document_source_filter.cpp" />
+ <ClCompile Include="pipeline\document_source_filter_base.cpp" />
+ <ClCompile Include="pipeline\document_source_group.cpp" />
+ <ClCompile Include="pipeline\document_source_limit.cpp" />
+ <ClCompile Include="pipeline\document_source_match.cpp" />
+ <ClCompile Include="pipeline\document_source_out.cpp" />
+ <ClCompile Include="pipeline\document_source_project.cpp" />
+ <ClCompile Include="pipeline\document_source_skip.cpp" />
+ <ClCompile Include="pipeline\document_source_sort.cpp" />
+ <ClCompile Include="pipeline\document_source_unwind.cpp" />
+ <ClCompile Include="pipeline\expression.cpp" />
+ <ClCompile Include="pipeline\expression_context.cpp" />
+ <ClCompile Include="pipeline\field_path.cpp" />
+ <ClCompile Include="pipeline\value.cpp" />
+ <ClCompile Include="..\util\intrusive_counter.cpp" />
+ <ClCompile Include="..\util\systeminfo_win32.cpp" />
+ <ClCompile Include="commands\document_source_cursor.cpp" />
+ <ClCompile Include="d_concurrency.cpp" />
+ <ClCompile Include="..\s\default_version.cpp" />
+ <ClCompile Include="ops\count.cpp" />
+ <ClCompile Include="pagefault.cpp" />
+ <ClCompile Include="d_globals.cpp" />
+ <ClCompile Include="curop.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\client\dbclientcursor.h" />
+ <ClInclude Include="..\client\distlock.h" />
+ <ClInclude Include="..\client\gridfs.h" />
+ <ClInclude Include="..\client\parallel.h" />
+ <ClInclude Include="..\s\d_logic.h" />
+ <ClInclude Include="..\targetver.h" />
+ <ClInclude Include="..\util\concurrency\rwlock.h" />
+ <ClInclude Include="..\util\concurrency\msg.h" />
+ <ClInclude Include="..\util\concurrency\mutex.h" />
+ <ClInclude Include="..\util\concurrency\mvar.h" />
+ <ClInclude Include="..\util\concurrency\task.h" />
+ <ClInclude Include="..\util\concurrency\thread_pool.h" />
+ <ClInclude Include="..\util\logfile.h" />
+ <ClInclude Include="..\util\mongoutils\checksum.h" />
+ <ClInclude Include="..\util\mongoutils\html.h" />
+ <ClInclude Include="..\util\mongoutils\str.h" />
+ <ClInclude Include="..\util\paths.h" />
+ <ClInclude Include="..\util\ramlog.h" />
+ <ClInclude Include="..\util\text.h" />
+ <ClInclude Include="..\util\time_support.h" />
+ <ClInclude Include="durop.h" />
+ <ClInclude Include="dur_commitjob.h" />
+ <ClInclude Include="dur_journal.h" />
+ <ClInclude Include="dur_journalformat.h" />
+ <ClInclude Include="dur_stats.h" />
+ <ClInclude Include="geo\core.h" />
+ <ClInclude Include="helpers\dblogger.h" />
+ <ClInclude Include="instance.h" />
+ <ClInclude Include="mongommf.h" />
+ <ClInclude Include="mongomutex.h" />
+ <ClInclude Include="namespace-inl.h" />
+ <ClInclude Include="oplogreader.h" />
+ <ClInclude Include="projection.h" />
+ <ClInclude Include="repl.h" />
+ <ClInclude Include="replpair.h" />
+ <ClInclude Include="repl\connections.h" />
+ <ClInclude Include="repl\multicmd.h" />
+ <ClInclude Include="repl\rsmember.h" />
+ <ClInclude Include="repl\rs_optime.h" />
+ <ClInclude Include="stats\counters.h" />
+ <ClInclude Include="stats\snapshots.h" />
+ <ClInclude Include="stats\top.h" />
+ <ClInclude Include="..\client\connpool.h" />
+ <ClInclude Include="..\client\dbclient.h" />
+ <ClInclude Include="..\client\model.h" />
+ <ClInclude Include="..\client\redef_macros.h" />
+ <ClInclude Include="..\client\syncclusterconnection.h" />
+ <ClInclude Include="..\client\undef_macros.h" />
+ <ClInclude Include="background.h" />
+ <ClInclude Include="client.h" />
+ <ClInclude Include="clientcursor.h" />
+ <ClInclude Include="cmdline.h" />
+ <ClInclude Include="commands.h" />
+ <ClInclude Include="concurrency.h" />
+ <ClInclude Include="curop.h" />
+ <ClInclude Include="cursor.h" />
+ <ClInclude Include="database.h" />
+ <ClInclude Include="db.h" />
+ <ClInclude Include="dbhelpers.h" />
+ <ClInclude Include="dbinfo.h" />
+ <ClInclude Include="dbmessage.h" />
+ <ClInclude Include="diskloc.h" />
+ <ClInclude Include="index.h" />
+ <ClInclude Include="indexkey.h" />
+ <ClInclude Include="introspect.h" />
+ <ClInclude Include="json.h" />
+ <ClInclude Include="matcher.h" />
+ <ClInclude Include="namespace.h" />
+ <ClInclude Include="..\pch.h" />
+ <ClInclude Include="pdfile.h" />
+ <ClInclude Include="..\grid\protocol.h" />
+ <ClInclude Include="query.h" />
+ <ClInclude Include="queryoptimizer.h" />
+ <ClInclude Include="resource.h" />
+ <ClInclude Include="scanandorder.h" />
+ <ClInclude Include="security.h" />
+ <ClInclude Include="..\util\allocator.h" />
+ <ClInclude Include="..\util\array.h" />
+ <ClInclude Include="..\util\assert_util.h" />
+ <ClInclude Include="..\util\background.h" />
+ <ClInclude Include="..\util\base64.h" />
+ <ClInclude Include="..\util\builder.h" />
+ <ClInclude Include="..\util\debug_util.h" />
+ <ClInclude Include="..\util\embedded_builder.h" />
+ <ClInclude Include="..\util\file.h" />
+ <ClInclude Include="..\util\file_allocator.h" />
+ <ClInclude Include="..\util\goodies.h" />
+ <ClInclude Include="..\util\hashtab.h" />
+ <ClInclude Include="..\util\hex.h" />
+ <ClInclude Include="lasterror.h" />
+ <ClInclude Include="..\util\log.h" />
+ <ClInclude Include="..\util\lruishmap.h" />
+ <ClInclude Include="..\util\mmap.h" />
+ <ClInclude Include="..\util\ntservice.h" />
+ <ClInclude Include="..\util\optime.h" />
+ <ClInclude Include="..\util\processinfo.h" />
+ <ClInclude Include="..\util\queue.h" />
+ <ClInclude Include="..\util\ramstore.h" />
+ <ClInclude Include="..\util\unittest.h" />
+ <ClInclude Include="..\util\concurrency\list.h" />
+ <ClInclude Include="..\util\concurrency\value.h" />
+ <ClInclude Include="..\util\web\html.h" />
+ <ClInclude Include="..\util\md5.h" />
+ <ClInclude Include="..\util\md5.hpp" />
+ <ClInclude Include="..\scripting\engine.h" />
+ <ClInclude Include="..\scripting\engine_spidermonkey.h" />
+ <ClInclude Include="..\scripting\engine_v8.h" />
+ <ClInclude Include="..\scripting\v8_db.h" />
+ <ClInclude Include="..\scripting\v8_utils.h" />
+ <ClInclude Include="..\scripting\v8_wrapper.h" />
+ <ClInclude Include="btree.h" />
+ <ClInclude Include="repl\health.h" />
+ <ClInclude Include="repl\rs.h" />
+ <ClInclude Include="repl\rs_config.h" />
+ <ClInclude Include="..\bson\bsonelement.h" />
+ <ClInclude Include="..\bson\bsoninlines.h" />
+ <ClInclude Include="..\bson\bsonmisc.h" />
+ <ClInclude Include="..\bson\bsonobj.h" />
+ <ClInclude Include="..\bson\bsonobjbuilder.h" />
+ <ClInclude Include="..\bson\bsonobjiterator.h" />
+ <ClInclude Include="..\bson\bsontypes.h" />
+ <ClInclude Include="jsobj.h" />
+ <ClInclude Include="..\bson\oid.h" />
+ <ClInclude Include="..\bson\ordering.h" />
+ <ClInclude Include="dur_journalimpl.h" />
+ <ClInclude Include="..\util\concurrency\race.h" />
+ <ClInclude Include="..\util\alignedbuilder.h" />
+ <ClInclude Include="queryutil.h" />
+ <ClInclude Include="..\bson\bson.h" />
+ <ClInclude Include="..\bson\bson_db.h" />
+ <ClInclude Include="..\bson\bson-inl.h" />
+ <ClInclude Include="..\bson\inline_decls.h" />
+ <ClInclude Include="..\bson\stringdata.h" />
+ <ClInclude Include="..\bson\util\atomic_int.h" />
+ <ClInclude Include="..\bson\util\builder.h" />
+ <ClInclude Include="..\bson\util\misc.h" />
+ <ClInclude Include="ops\delete.h" />
+ <ClInclude Include="ops\update.h" />
+ <ClInclude Include="..\util\net\httpclient.h" />
+ <ClInclude Include="..\util\net\message.h" />
+ <ClInclude Include="..\util\net\message_server.h" />
+ <ClInclude Include="..\util\net\sock.h" />
+ <ClInclude Include="..\third_party\snappy\config.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\snappy\snappy.h">
+ <Filter>snappy</Filter>
+ </ClInclude>
+ <ClInclude Include="..\third_party\pcre-7.4\config.h" />
+ <ClInclude Include="..\third_party\pcre-7.4\pcre.h" />
+ <ClInclude Include="globals.h" />
+ <ClInclude Include="..\util\net\hostandport.h" />
+ <ClInclude Include="..\util\net\listen.h" />
+ <ClInclude Include="..\util\net\message_port.h" />
+ <ClInclude Include="..\util\net\miniwebserver.h" />
+ <ClInclude Include="databaseholder.h" />
+ <ClInclude Include="pipeline\accumulator.h" />
+ <ClInclude Include="pipeline\builder.h" />
+ <ClInclude Include="pipeline\doc_mem_monitor.h" />
+ <ClInclude Include="pipeline\document.h" />
+ <ClInclude Include="pipeline\document_source.h" />
+ <ClInclude Include="pipeline\expression.h" />
+ <ClInclude Include="pipeline\expression_context.h" />
+ <ClInclude Include="pipeline\field_path.h" />
+ <ClInclude Include="pipeline\value.h" />
+ <ClInclude Include="..\util\intrusive_counter.h" />
+ <ClInclude Include="..\util\systeminfo.h" />
+ <ClInclude Include="namespacestring.h" />
+ <ClInclude Include="ops\count.h" />
+ <ClInclude Include="pagefault.h" />
+ <ClInclude Include="d_globals.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="db.rc" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\jstests\dur\basic1.sh" />
+ <None Include="..\jstests\dur\dur1.js" />
+ <None Include="..\jstests\replsets\replset1.js" />
+ <None Include="..\jstests\replsets\replset2.js" />
+ <None Include="..\jstests\replsets\replset3.js" />
+ <None Include="..\jstests\replsets\replset4.js" />
+ <None Include="..\jstests\replsets\replset5.js" />
+ <None Include="..\jstests\replsets\replsetadd.js" />
+ <None Include="..\jstests\replsets\replsetarb1.js" />
+ <None Include="..\jstests\replsets\replsetarb2.js" />
+ <None Include="..\jstests\replsets\replsetprio1.js" />
+ <None Include="..\jstests\replsets\replsetrestart1.js" />
+ <None Include="..\jstests\replsets\replsetrestart2.js" />
+ <None Include="..\jstests\replsets\replset_remove_node.js" />
+ <None Include="..\jstests\replsets\rollback.js" />
+ <None Include="..\jstests\replsets\rollback2.js" />
+ <None Include="..\jstests\replsets\sync1.js" />
+ <None Include="..\jstests\replsets\twosets.js" />
+ <None Include="..\SConstruct" />
+ <None Include="..\util\mongoutils\README" />
+ <None Include="mongo.ico" />
+ <None Include="repl\notes.txt" />
+ </ItemGroup>
+ <ItemGroup>
+ <Library Include="..\..\js\js32d.lib" />
+ <Library Include="..\..\js\js32r.lib" />
+ <Library Include="..\..\js\js64d.lib" />
+ <Library Include="..\..\js\js64r.lib" />
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="snappy">
+ <UniqueIdentifier>{bb99c086-7926-4f50-838d-f5f0c18397c0}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/src/mongo/db/db_10.sln b/src/mongo/db/db_10.sln
new file mode 100755
index 00000000000..c1d83f3901a
--- /dev/null
+++ b/src/mongo/db/db_10.sln
@@ -0,0 +1,168 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{4082881B-EB00-486F-906C-843B8EC06E18}"
+ ProjectSection(SolutionItems) = preProject
+ driverHelpers.cpp = driverHelpers.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{2B262D59-9DC7-4BF1-A431-1BD4966899A5}"
+ ProjectSection(SolutionItems) = preProject
+ ..\shell\msvc\createCPPfromJavaScriptFiles.js = ..\shell\msvc\createCPPfromJavaScriptFiles.js
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "unix files", "unix files", "{2F760952-C71B-4865-998F-AABAE96D1373}"
+ ProjectSection(SolutionItems) = preProject
+ ..\util\processinfo_darwin.cpp = ..\util\processinfo_darwin.cpp
+ ..\util\processinfo_linux2.cpp = ..\util\processinfo_linux2.cpp
+ ..\util\processinfo_none.cpp = ..\util\processinfo_none.cpp
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "other", "other", "{12B11474-2D74-48C3-BB3D-F03249BEA88F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongod", "db.vcxproj", "{215B2D68-0A70-4D10-8E75-B31010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongos", "..\s\dbgrid.vcxproj", "{E03717ED-69B4-4D21-BC55-DF6690B585C6}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "..\dbtests\test.vcxproj", "{215B2D68-0A70-4D10-8E75-B33010C62A91}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bsondemo", "..\bson\bsondemo\bsondemo.vcxproj", "{C9DB5EB7-81AA-4185-BAA1-DA035654402F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoutils test program", "..\util\mongoutils\mongoutils.vcxproj", "{7B84584E-92BC-4DB9-971B-A1A8F93E5053}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple_client_demo", "..\client\examples\simple_client_demo.vcxproj", "{89C30BC3-2874-4F2C-B4DA-EB04E9782236}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongo", "..\shell\msvc\mongo.vcxproj", "{FE959BD8-8EE2-4555-AE59-9FA14FFD410E}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mongoperf", "..\client\examples\mongoperf.vcxproj", "{79D4E297-BFB7-4FF2-9B13-08A146582E46}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Debug|Mixed Platforms = Debug|Mixed Platforms
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Any CPU = Release|Any CPU
+ Release|Mixed Platforms = Release|Mixed Platforms
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B31010C62A91}.Release|x64.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|Win32.Build.0 = Debug|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.ActiveCfg = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Debug|x64.Build.0 = Debug|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Any CPU.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Mixed Platforms.Build.0 = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.ActiveCfg = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|Win32.Build.0 = Release|Win32
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.ActiveCfg = Release|x64
+ {E03717ED-69B4-4D21-BC55-DF6690B585C6}.Release|x64.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.ActiveCfg = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|Win32.Build.0 = Debug|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.ActiveCfg = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Debug|x64.Build.0 = Debug|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Any CPU.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Mixed Platforms.Build.0 = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.ActiveCfg = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|Win32.Build.0 = Release|Win32
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.ActiveCfg = Release|x64
+ {215B2D68-0A70-4D10-8E75-B33010C62A91}.Release|x64.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Any CPU.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Mixed Platforms.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.ActiveCfg = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|Win32.Build.0 = Debug|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.ActiveCfg = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Debug|x64.Build.0 = Debug|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Any CPU.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Mixed Platforms.Build.0 = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.ActiveCfg = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|Win32.Build.0 = Release|Win32
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.ActiveCfg = Release|x64
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F}.Release|x64.Build.0 = Release|x64
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|Win32.Build.0 = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Debug|x64.ActiveCfg = Debug|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Any CPU.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.ActiveCfg = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|Win32.Build.0 = Release|Win32
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053}.Release|x64.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|Win32.Build.0 = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Debug|x64.ActiveCfg = Debug|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Any CPU.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.ActiveCfg = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|Win32.Build.0 = Release|Win32
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236}.Release|x64.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|Win32.Build.0 = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Debug|x64.ActiveCfg = Debug|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Any CPU.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.ActiveCfg = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|Win32.Build.0 = Release|Win32
+ {FE959BD8-8EE2-4555-AE59-9FA14FFD410E}.Release|x64.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Any CPU.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Mixed Platforms.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|Win32.Build.0 = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Debug|x64.ActiveCfg = Debug|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Any CPU.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Mixed Platforms.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.ActiveCfg = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|Win32.Build.0 = Release|Win32
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46}.Release|x64.ActiveCfg = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(NestedProjects) = preSolution
+ {2B262D59-9DC7-4BF1-A431-1BD4966899A5} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {2F760952-C71B-4865-998F-AABAE96D1373} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {4082881B-EB00-486F-906C-843B8EC06E18} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {C9DB5EB7-81AA-4185-BAA1-DA035654402F} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {7B84584E-92BC-4DB9-971B-A1A8F93E5053} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {89C30BC3-2874-4F2C-B4DA-EB04E9782236} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ {79D4E297-BFB7-4FF2-9B13-08A146582E46} = {12B11474-2D74-48C3-BB3D-F03249BEA88F}
+ EndGlobalSection
+EndGlobal
diff --git a/src/mongo/db/dbcommands.cpp b/src/mongo/db/dbcommands.cpp new file mode 100644 index 00000000000..570c897fae4 --- /dev/null +++ b/src/mongo/db/dbcommands.cpp @@ -0,0 +1,1955 @@ +// dbcommands.cpp + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* SHARDING: + I believe this file is for mongod only. + See s/commnands_public.cpp for mongos. +*/ + +#include "pch.h" +#include "ops/count.h" +#include "ops/query.h" +#include "pdfile.h" +#include "jsobj.h" +#include "../bson/util/builder.h" +#include <time.h> +#include "introspect.h" +#include "btree.h" +#include "../util/lruishmap.h" +#include "../util/md5.hpp" +#include "../util/processinfo.h" +#include "../util/ramlog.h" +#include "json.h" +#include "repl.h" +#include "repl_block.h" +#include "replutil.h" +#include "commands.h" +#include "db.h" +#include "instance.h" +#include "lasterror.h" +#include "security.h" +#include "queryoptimizer.h" +#include "../scripting/engine.h" +#include "stats/counters.h" +#include "background.h" +#include "../util/version.h" +#include "../s/d_writeback.h" +#include "dur_stats.h" + +namespace mongo { + + namespace dur { + void setAgeOutJournalFiles(bool rotate); + } + /** @return true if fields found */ + bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + BSONElement e = cmdObj["ageOutJournalFiles"]; + if( !e.eoo() ) { + bool r = e.trueValue(); + log() << "ageOutJournalFiles " << r << endl; + dur::setAgeOutJournalFiles(r); + return true; + } + return false; + } + + /* reset any errors so that getlasterror comes back clean. + + useful before performing a long series of operations where we want to + see if any of the operations triggered an error, but don't want to check + after each op as that woudl be a client/server turnaround. + */ + class CmdResetError : public Command { + public: + virtual LockType locktype() const { return NONE; } + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return true; + } + virtual void help( stringstream& help ) const { + help << "reset error state (used with getpreverror)"; + } + CmdResetError() : Command("resetError", false, "reseterror") {} + bool run(const string& db, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + LastError *le = lastError.get(); + assert( le ); + le->reset(); + return true; + } + } cmdResetError; + + /* set by replica sets if specified in the configuration. + a pointer is used to avoid any possible locking issues with lockless reading (see below locktype() is NONE + and would like to keep that) + (for now, it simply orphans any old copy as config changes should be extremely rare). + note: once non-null, never goes to null again. 
+ */ + BSONObj *getLastErrorDefault = 0; + + class CmdGetLastError : public Command { + public: + CmdGetLastError() : Command("getLastError", false, "getlasterror") { } + virtual LockType locktype() const { return NONE; } + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { return true; } + virtual void help( stringstream& help ) const { + help << "return error status of the last operation on this connection\n" + << "options:\n" + << " { fsync:true } - fsync before returning, or wait for journal commit if running with --journal\n" + << " { j:true } - wait for journal commit if running with --journal\n" + << " { w:n } - await replication to n servers (including self) before returning\n" + << " { wtimeout:m} - timeout for w in m milliseconds"; + } + bool run(const string& dbname, BSONObj& _cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + LastError *le = lastError.disableForCommand(); + + bool err = false; + + if ( le->nPrev != 1 ) + err = LastError::noError.appendSelf( result , false ); + else + err = le->appendSelf( result , false ); + + Client& c = cc(); + c.appendLastOp( result ); + + result.appendNumber( "connectionId" , c.getConnectionId() ); // for sharding; also useful in general for debugging + + BSONObj cmdObj = _cmdObj; + { + BSONObj::iterator i(_cmdObj); + i.next(); + if( !i.more() ) { + /* empty, use default */ + BSONObj *def = getLastErrorDefault; + if( def ) + cmdObj = *def; + } + } + + if ( cmdObj["j"].trueValue() ) { + if( !getDur().awaitCommit() ) { + // --journal is off + result.append("jnote", "journaling not enabled on this server"); + } + if( cmdObj["fsync"].trueValue() ) { + errmsg = "fsync and j options are not used together"; + return false; + } + } + else if ( cmdObj["fsync"].trueValue() ) { + Timer t; + if( !getDur().awaitCommit() ) { + // if get here, not running with --journal + log() << "fsync from getlasterror" << endl; + result.append( "fsyncFiles" , MemoryMappedFile::flushAll( true ) ); + } + else { + // this perhaps is temp. how long we wait for the group commit to occur. + result.append( "waited", t.millis() ); + } + } + + if ( err ) { + // doesn't make sense to wait for replication + // if there was an error + return true; + } + + BSONElement e = cmdObj["w"]; + if ( e.ok() ) { + int timeout = cmdObj["wtimeout"].numberInt(); + Timer t; + + long long passes = 0; + char buf[32]; + while ( 1 ) { + OpTime op(c.getLastOp()); + + if ( op.isNull() ) { + if ( anyReplEnabled() ) { + result.append( "wnote" , "no write has been done on this connection" ); + } + else if ( e.isNumber() && e.numberInt() <= 1 ) { + // don't do anything + // w=1 and no repl, so this is fine + } + else { + // w=2 and no repl + result.append( "wnote" , "no replication has been enabled, so w=2+ won't work" ); + result.append( "err", "norepl" ); + return true; + } + break; + } + + // check this first for w=0 or w=1 + if ( opReplicatedEnough( op, e ) ) { + break; + } + + // if replication isn't enabled (e.g., config servers) + if ( ! 
anyReplEnabled() ) { + result.append( "err", "norepl" ); + return true; + } + + + if ( timeout > 0 && t.millis() >= timeout ) { + result.append( "wtimeout" , true ); + errmsg = "timed out waiting for slaves"; + result.append( "waited" , t.millis() ); + result.append( "err" , "timeout" ); + return true; + } + + assert( sprintf( buf , "w block pass: %lld" , ++passes ) < 30 ); + c.curop()->setMessage( buf ); + sleepmillis(1); + killCurrentOp.checkForInterrupt(); + } + result.appendNumber( "wtime" , t.millis() ); + } + + result.appendNull( "err" ); + return true; + } + } cmdGetLastError; + + class CmdGetPrevError : public Command { + public: + virtual LockType locktype() const { return NONE; } + virtual bool logTheOp() { + return false; + } + virtual void help( stringstream& help ) const { + help << "check for errors since last reseterror commandcal"; + } + virtual bool slaveOk() const { + return true; + } + CmdGetPrevError() : Command("getPrevError", false, "getpreverror") {} + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + LastError *le = lastError.disableForCommand(); + le->appendSelf( result ); + if ( le->valid ) + result.append( "nPrev", le->nPrev ); + else + result.append( "nPrev", -1 ); + return true; + } + } cmdGetPrevError; + + CmdShutdown cmdShutdown; + + void CmdShutdown::help( stringstream& help ) const { + help << "shutdown the database. must be ran against admin db and " + << "either (1) ran from localhost or (2) authenticated. If " + << "this is a primary in a replica set and there is no member " + << "within 10 seconds of its optime, it will not shutdown " + << "without force : true. You can also specify timeoutSecs : " + << "N to wait N seconds for other members to catch up."; + } + + bool CmdShutdown::run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue(); + + if (!force && theReplSet && theReplSet->isPrimary()) { + long long timeout, now, start; + timeout = now = start = curTimeMicros64()/1000000; + if (cmdObj.hasField("timeoutSecs")) { + timeout += cmdObj["timeoutSecs"].numberLong(); + } + + OpTime lastOp = theReplSet->lastOpTimeWritten; + OpTime closest = theReplSet->lastOtherOpTime(); + long long int diff = lastOp.getSecs() - closest.getSecs(); + while (now <= timeout && (diff < 0 || diff > 10)) { + sleepsecs(1); + now++; + + lastOp = theReplSet->lastOpTimeWritten; + closest = theReplSet->lastOtherOpTime(); + diff = lastOp.getSecs() - closest.getSecs(); + } + + if (diff < 0 || diff > 10) { + errmsg = "no secondaries within 10 seconds of my optime"; + result.append("closest", closest.getSecs()); + result.append("difference", diff); + return false; + } + + // step down + theReplSet->stepDown(120); + + log() << "waiting for secondaries to catch up" << endl; + + lastOp = theReplSet->lastOpTimeWritten; + while (lastOp != closest && now - start < 60) { + closest = theReplSet->lastOtherOpTime(); + + now++; + sleepsecs(1); + } + + // regardless of whether they caught up, we'll shut down + } + + return shutdownHelper(); + } + + class CmdDropDatabase : public Command { + public: + virtual bool logTheOp() { + return true; + } + virtual void help( stringstream& help ) const { + help << "drop (delete) this database"; + } + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + CmdDropDatabase() : Command("dropDatabase") {} + bool run(const string& 
dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + BSONElement e = cmdObj.firstElement(); + log() << "dropDatabase " << dbname << endl; + int p = (int) e.number(); + if ( p != 1 ) + return false; + dropDatabase(dbname); + result.append( "dropped" , dbname ); + return true; + } + } cmdDropDatabase; + + class CmdRepairDatabase : public Command { + public: + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return true; + } + virtual bool maintenanceMode() const { return true; } + virtual void help( stringstream& help ) const { + help << "repair database. also compacts. note: slow."; + } + virtual LockType locktype() const { return WRITE; } + CmdRepairDatabase() : Command("repairDatabase") {} + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + BSONElement e = cmdObj.firstElement(); + log() << "repairDatabase " << dbname << endl; + int p = (int) e.number(); + if ( p != 1 ) { + errmsg = "bad option"; + return false; + } + e = cmdObj.getField( "preserveClonedFilesOnFailure" ); + bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean(); + e = cmdObj.getField( "backupOriginalFiles" ); + bool backupOriginalFiles = e.isBoolean() && e.boolean(); + return repairDatabase( dbname, errmsg, preserveClonedFilesOnFailure, backupOriginalFiles ); + } + } cmdRepairDatabase; + + /* set db profiling level + todo: how do we handle profiling information put in the db with replication? + sensibly or not? + */ + class CmdProfile : public Command { + public: + virtual bool slaveOk() const { + return true; + } + virtual void help( stringstream& help ) const { + help << "enable or disable performance profiling\n"; + help << "{ profile : <n> }\n"; + help << "0=off 1=log slow ops 2=log all\n"; + help << "-1 to get current values\n"; + help << "http://www.mongodb.org/display/DOCS/Database+Profiler"; + } + virtual LockType locktype() const { return WRITE; } + CmdProfile() : Command("profile") {} + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + BSONElement e = cmdObj.firstElement(); + result.append("was", cc().database()->profile); + result.append("slowms", cmdLine.slowMS ); + + int p = (int) e.number(); + bool ok = false; + + if ( p == -1 ) + ok = true; + else if ( p >= 0 && p <= 2 ) { + ok = cc().database()->setProfilingLevel( p , errmsg ); + } + + BSONElement slow = cmdObj["slowms"]; + if ( slow.isNumber() ) + cmdLine.slowMS = slow.numberInt(); + + return ok; + } + } cmdProfile; + + class CmdServerStatus : public Command { + public: + virtual bool slaveOk() const { + return true; + } + CmdServerStatus() : Command("serverStatus", true) {} + + virtual LockType locktype() const { return NONE; } + + virtual void help( stringstream& help ) const { + help << "returns lots of administrative server statistics"; + } + + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + long long start = Listener::getElapsedTimeMillis(); + BSONObjBuilder timeBuilder(128); + + + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + + result.append( "host" , prettyHostName() ); + result.append("version", versionString); + result.append("process","mongod"); + result.append("uptime",(double) (time(0)-cmdLine.started)); + result.append("uptimeEstimate",(double) (start/1000)); + result.appendDate( "localTime" , jsTime() ); + + { + BSONObjBuilder t; + + unsigned long long 
last, start, timeLocked; + d.dbMutex.info().getTimingInfo(start, timeLocked); + last = curTimeMicros64(); + double tt = (double) last-start; + double tl = (double) timeLocked; + t.append("totalTime", tt); + t.append("lockTime", tl); + t.append("ratio", (tt ? tl/tt : 0)); + + { + BSONObjBuilder ttt( t.subobjStart( "currentQueue" ) ); + int w=0, r=0; + Client::recommendedYieldMicros( &w , &r ); + ttt.append( "total" , w + r ); + ttt.append( "readers" , r ); + ttt.append( "writers" , w ); + ttt.done(); + } + + { + BSONObjBuilder ttt( t.subobjStart( "activeClients" ) ); + int w=0, r=0; + Client::getActiveClientCount( w , r ); + ttt.append( "total" , w + r ); + ttt.append( "readers" , r ); + ttt.append( "writers" , w ); + ttt.done(); + } + + + + result.append( "globalLock" , t.obj() ); + } + timeBuilder.appendNumber( "after basic" , Listener::getElapsedTimeMillis() - start ); + + { + + BSONObjBuilder t( result.subobjStart( "mem" ) ); + + t.append("bits", ( sizeof(int*) == 4 ? 32 : 64 ) ); + + ProcessInfo p; + int v = 0; + if ( p.supported() ) { + t.appendNumber( "resident" , p.getResidentSize() ); + v = p.getVirtualMemorySize(); + t.appendNumber( "virtual" , v ); + t.appendBool( "supported" , true ); + } + else { + result.append( "note" , "not all mem info supported on this platform" ); + t.appendBool( "supported" , false ); + } + + timeBuilder.appendNumber( "middle of mem" , Listener::getElapsedTimeMillis() - start ); + + int m = (int) (MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 )); + t.appendNumber( "mapped" , m ); + + if ( cmdLine.dur ) { + m *= 2; + t.appendNumber( "mappedWithJournal" , m ); + } + + int overhead = v - m - connTicketHolder.used(); + + if( overhead > 4000 ) { + t.append("note", "virtual minus mapped is large. could indicate a memory leak"); + log() << "warning: virtual size (" << v << "MB) - mapped size (" << m << "MB) is large (" << overhead << "MB). could indicate a memory leak" << endl; + } + + t.done(); + + } + timeBuilder.appendNumber( "after mem" , Listener::getElapsedTimeMillis() - start ); + + { + BSONObjBuilder bb( result.subobjStart( "connections" ) ); + bb.append( "current" , connTicketHolder.used() ); + bb.append( "available" , connTicketHolder.available() ); + bb.done(); + } + timeBuilder.appendNumber( "after connections" , Listener::getElapsedTimeMillis() - start ); + + { + BSONObjBuilder bb( result.subobjStart( "extra_info" ) ); + bb.append("note", "fields vary by platform"); + ProcessInfo p; + p.getExtraInfo(bb); + bb.done(); + timeBuilder.appendNumber( "after extra info" , Listener::getElapsedTimeMillis() - start ); + + } + + { + BSONObjBuilder bb( result.subobjStart( "indexCounters" ) ); + globalIndexCounters.append( bb ); + bb.done(); + } + + { + BSONObjBuilder bb( result.subobjStart( "backgroundFlushing" ) ); + globalFlushCounters.append( bb ); + bb.done(); + } + + { + BSONObjBuilder bb( result.subobjStart( "cursors" ) ); + ClientCursor::appendStats( bb ); + bb.done(); + } + + { + BSONObjBuilder bb( result.subobjStart( "network" ) ); + networkCounter.append( bb ); + bb.done(); + } + + + timeBuilder.appendNumber( "after counters" , Listener::getElapsedTimeMillis() - start ); + + if ( anyReplEnabled() ) { + BSONObjBuilder bb( result.subobjStart( "repl" ) ); + appendReplicationInfo( bb , authed , cmdObj["repl"].numberInt() ); + bb.done(); + + if ( !
_isMaster() ) { + result.append( "opcountersRepl" , replOpCounters.getObj() ); + } + + } + + timeBuilder.appendNumber( "after repl" , Listener::getElapsedTimeMillis() - start ); + + result.append( "opcounters" , globalOpCounters.getObj() ); + + { + BSONObjBuilder asserts( result.subobjStart( "asserts" ) ); + asserts.append( "regular" , assertionCount.regular ); + asserts.append( "warning" , assertionCount.warning ); + asserts.append( "msg" , assertionCount.msg ); + asserts.append( "user" , assertionCount.user ); + asserts.append( "rollovers" , assertionCount.rollovers ); + asserts.done(); + } + + timeBuilder.appendNumber( "after asserts" , Listener::getElapsedTimeMillis() - start ); + + result.append( "writeBacksQueued" , ! writeBackManager.queuesEmpty() ); + + if( cmdLine.dur ) { + result.append("dur", dur::stats.asObj()); + } + + timeBuilder.appendNumber( "after dur" , Listener::getElapsedTimeMillis() - start ); + + { + RamLog* rl = RamLog::get( "warnings" ); + verify(15880, rl); + + if (rl->lastWrite() >= time(0)-(10*60)){ // only show warnings from last 10 minutes + vector<const char*> lines; + rl->get( lines ); + + BSONArrayBuilder arr( result.subarrayStart( "warnings" ) ); + for ( unsigned i=std::max(0,(int)lines.size()-10); i<lines.size(); i++ ) + arr.append( lines[i] ); + arr.done(); + } + } + + if ( ! authed ) + result.append( "note" , "run against admin for more info" ); + + timeBuilder.appendNumber( "at end" , Listener::getElapsedTimeMillis() - start ); + if ( Listener::getElapsedTimeMillis() - start > 1000 ) { + BSONObj t = timeBuilder.obj(); + log() << "serverStatus was very slow: " << t << endl; + result.append( "timing" , t ); + } + + return true; + } + } cmdServerStatus; + + class CmdGetOpTime : public Command { + public: + virtual bool slaveOk() const { + return true; + } + virtual void help( stringstream& help ) const { help << "internal"; } + virtual LockType locktype() const { return NONE; } + CmdGetOpTime() : Command("getoptime") { } + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + writelock l( "" ); + result.appendDate("optime", OpTime::now().asDate()); + return true; + } + } cmdgetoptime; + + /* + class Cmd : public Command { + public: + Cmd() : Command("") { } + bool adminOnly() const { return true; } + bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result) { + return true; + } + } cmd; + */ + + class CmdDiagLogging : public Command { + public: + virtual bool slaveOk() const { + return true; + } + CmdDiagLogging() : Command("diagLogging") { } + bool adminOnly() const { + return true; + } + void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Monitoring+and+Diagnostics#MonitoringandDiagnostics-DatabaseRecord%2FReplay"; } + virtual LockType locktype() const { return WRITE; } + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + int was = _diaglog.setLevel( cmdObj.firstElement().numberInt() ); + _diaglog.flush(); + if ( !cmdLine.quiet ) + tlog() << "CMD: diagLogging set to " << _diaglog.getLevel() << " from: " << was << endl; + result.append( "was" , was ); + return true; + } + } cmddiaglogging; + + /* remove bit from a bit array - actually remove its slot, not a clear + note: this function does not work with x == 63 -- that is ok + but keep in mind in the future if max indexes were extended to + exactly 64 it would be a problem + */ + unsigned long long removeBit(unsigned long long b, int x) { + 
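// worked example, matching the unit test below: removeBit(0b10101, 2) keeps + // the low bits (0b01), drops bit slot 2, and shifts the high bits (0b10) + // down one slot, yielding 0b1001 -- that is, removeBit(21, 2) == 9. +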
unsigned long long tmp = b; + return + (tmp & ((((unsigned long long) 1) << x)-1)) | + ((tmp >> (x+1)) << x); + } + + struct DBCommandsUnitTest { + DBCommandsUnitTest() { + assert( removeBit(1, 0) == 0 ); + assert( removeBit(2, 0) == 1 ); + assert( removeBit(2, 1) == 0 ); + assert( removeBit(255, 1) == 127 ); + assert( removeBit(21, 2) == 9 ); + assert( removeBit(0x4000000000000001ULL, 62) == 1 ); + } + } dbc_unittest; + + void assureSysIndexesEmptied(const char *ns, IndexDetails *exceptForIdIndex); + int removeFromSysIndexes(const char *ns, const char *idxName); + + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool mayDeleteIdIndex ) { + + BackgroundOperation::assertNoBgOpInProgForNs(ns); + + d = d->writingWithExtra(); + d->aboutToDeleteAnIndex(); + + /* there may be pointers pointing at keys in the btree(s). kill them. */ + ClientCursor::invalidate(ns); + + // delete a specific index or all? + if ( *name == '*' && name[1] == 0 ) { + log(4) << " d->nIndexes was " << d->nIndexes << '\n'; + anObjBuilder.append("nIndexesWas", (double)d->nIndexes); + IndexDetails *idIndex = 0; + if( d->nIndexes ) { + for ( int i = 0; i < d->nIndexes; i++ ) { + if ( !mayDeleteIdIndex && d->idx(i).isIdIndex() ) { + idIndex = &d->idx(i); + } + else { + d->idx(i).kill_idx(); + } + } + d->nIndexes = 0; + } + if ( idIndex ) { + d->addIndex(ns) = *idIndex; + wassert( d->nIndexes == 1 ); + } + /* assuming here that id index is not multikey: */ + d->multiKeyIndexBits = 0; + assureSysIndexesEmptied(ns, idIndex); + anObjBuilder.append("msg", mayDeleteIdIndex ? + "indexes dropped for collection" : + "non-_id indexes dropped for collection"); + } + else { + // delete just one index + int x = d->findIndexByName(name); + if ( x >= 0 ) { + log(4) << " d->nIndexes was " << d->nIndexes << endl; + anObjBuilder.append("nIndexesWas", (double)d->nIndexes); + + /* note it is important we remove the IndexDetails with this + call, otherwise, on recreate, the old one would be reused, and its + IndexDetails::info ptr would be bad info. + */ + IndexDetails *id = &d->idx(x); + if ( !mayDeleteIdIndex && id->isIdIndex() ) { + errmsg = "may not delete _id index"; + return false; + } + id->kill_idx(); + d->multiKeyIndexBits = removeBit(d->multiKeyIndexBits, x); + d->nIndexes--; + for ( int i = x; i < d->nIndexes; i++ ) + d->idx(i) = d->idx(i+1); + } + else { + int n = removeFromSysIndexes(ns, name); // just in case an orphaned listing there - i.e. should have been repaired but wasn't + if( n ) { + log() << "info: removeFromSysIndexes cleaned up " << n << " entries" << endl; + } + log() << "dropIndexes: " << name << " not found" << endl; + errmsg = "index not found"; + return false; + } + } + return true; + } + + /* drop collection */ + class CmdDrop : public Command { + public: + CmdDrop() : Command("drop") { } + virtual bool logTheOp() { + return true; + } + virtual bool slaveOk() const { + return false; + } + virtual bool adminOnly() const { + return false; + } + virtual void help( stringstream& help ) const { help << "drop a collection\n{drop : <collectionName>}"; } + virtual LockType locktype() const { return WRITE; } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + string nsToDrop = dbname + '.' 
+ cmdObj.firstElement().valuestr(); + NamespaceDetails *d = nsdetails(nsToDrop.c_str()); + if ( !cmdLine.quiet ) + tlog() << "CMD: drop " << nsToDrop << endl; + if ( d == 0 ) { + errmsg = "ns not found"; + return false; + } + uassert( 10039 , "can't drop collection with reserved $ character in name", strchr(nsToDrop.c_str(), '$') == 0 ); + dropCollection( nsToDrop, errmsg, result ); + return true; + } + } cmdDrop; + + /* select count(*) */ + class CmdCount : public Command { + public: + virtual LockType locktype() const { return READ; } + CmdCount() : Command("count") { } + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { + // ok on --slave setups + return replSettings.slave == SimpleSlave; + } + virtual bool slaveOverrideOk() { return true; } + virtual bool maintenanceOk() const { return false; } + virtual bool adminOnly() const { return false; } + virtual void help( stringstream& help ) const { help << "count objects in collection"; } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + string ns = parseNs(dbname, cmdObj); + string err; + long long n = runCount(ns.c_str(), cmdObj, err); + long long nn = n; + bool ok = true; + if ( n == -1 ) { + nn = 0; + result.appendBool( "missing" , true ); + } + else if ( n < 0 ) { + nn = 0; + ok = false; + if ( !err.empty() ) + errmsg = err; + } + result.append("n", (double) nn); + return ok; + } + } cmdCount; + + /* create collection */ + class CmdCreate : public Command { + public: + CmdCreate() : Command("create") { } + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return false; + } + virtual bool adminOnly() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream& help ) const { + help << "create a collection explicitly\n" + "{ create: <ns>[, capped: <bool>, size: <collSizeInBytes>, max: <nDocs>] }"; + } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + uassert(15888, "must pass name of collection to create", cmdObj.firstElement().valuestrsafe()[0] != '\0'); + string ns = dbname + '.' + cmdObj.firstElement().valuestr(); + string err; + uassert(14832, "specify size:<n> when capped is true", !cmdObj["capped"].trueValue() || cmdObj["size"].isNumber() || cmdObj.hasField("$nExtents")); + bool ok = userCreateNS(ns.c_str(), cmdObj, err, ! fromRepl ); + if ( !ok && !err.empty() ) + errmsg = err; + return ok; + } + } cmdCreate; + + /* "dropIndexes" is now the preferred form - "deleteIndexes" deprecated */ + class CmdDropIndexes : public Command { + public: + virtual bool logTheOp() { + return true; + } + virtual bool slaveOk() const { + return false; + } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream& help ) const { + help << "drop indexes for a collection"; + } + CmdDropIndexes() : Command("dropIndexes", false, "deleteIndexes") { } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& anObjBuilder, bool /*fromRepl*/) { + BSONElement e = jsobj.firstElement(); + string toDeleteNs = dbname + '.' 
+ e.valuestr(); + NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); + if ( !cmdLine.quiet ) + tlog() << "CMD: dropIndexes " << toDeleteNs << endl; + if ( d ) { + BSONElement f = jsobj.getField("index"); + if ( f.type() == String ) { + return dropIndexes( d, toDeleteNs.c_str(), f.valuestr(), errmsg, anObjBuilder, false ); + } + else if ( f.type() == Object ) { + int idxId = d->findIndexByKeyPattern( f.embeddedObject() ); + if ( idxId < 0 ) { + errmsg = "can't find index with key:"; + errmsg += f.embeddedObject().toString(); + return false; + } + else { + IndexDetails& ii = d->idx( idxId ); + string iName = ii.indexName(); + return dropIndexes( d, toDeleteNs.c_str(), iName.c_str() , errmsg, anObjBuilder, false ); + } + } + else { + errmsg = "invalid index name spec"; + return false; + } + } + else { + errmsg = "ns not found"; + return false; + } + } + } cmdDropIndexes; + + class CmdReIndex : public Command { + public: + virtual bool logTheOp() { return false; } // only reindexes on the one node + virtual bool slaveOk() const { return true; } // can reindex on a secondary + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream& help ) const { + help << "re-index a collection"; + } + CmdReIndex() : Command("reIndex") { } + bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + static DBDirectClient db; + + BSONElement e = jsobj.firstElement(); + string toDeleteNs = dbname + '.' + e.valuestr(); + NamespaceDetails *d = nsdetails(toDeleteNs.c_str()); + tlog() << "CMD: reIndex " << toDeleteNs << endl; + BackgroundOperation::assertNoBgOpInProgForNs(toDeleteNs.c_str()); + + if ( ! d ) { + errmsg = "ns not found"; + return false; + } + + list<BSONObj> all; + auto_ptr<DBClientCursor> i = db.query( dbname + ".system.indexes" , BSON( "ns" << toDeleteNs ) , 0 , 0 , 0 , QueryOption_SlaveOk ); + BSONObjBuilder b; + while ( i->more() ) { + BSONObj o = i->next().removeField("v").getOwned(); + b.append( BSONObjBuilder::numStr( all.size() ) , o ); + all.push_back( o ); + } + + + bool ok = dropIndexes( d, toDeleteNs.c_str(), "*" , errmsg, result, true ); + if ( ! 
ok ) { + errmsg = "dropIndexes failed"; + return false; + } + + for ( list<BSONObj>::iterator i=all.begin(); i!=all.end(); i++ ) { + BSONObj o = *i; + log(1) << "reIndex ns: " << toDeleteNs << " index: " << o << endl; + theDataFileMgr.insertWithObjMod( Namespace( toDeleteNs.c_str() ).getSisterNS( "system.indexes" ).c_str() , o , true ); + } + + result.append( "nIndexes" , (int)all.size() ); + result.appendArray( "indexes" , b.obj() ); + return true; + } + } cmdReIndex; + + class CmdListDatabases : public Command { + public: + virtual bool slaveOk() const { + return true; + } + virtual bool slaveOverrideOk() { + return true; + } + virtual bool adminOnly() const { + return true; + } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream& help ) const { help << "list databases on this server"; } + CmdListDatabases() : Command("listDatabases" , true ) {} + bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + vector< string > dbNames; + getDatabaseNames( dbNames ); + vector< BSONObj > dbInfos; + + set<string> seen; + boost::intmax_t totalSize = 0; + for ( vector< string >::iterator i = dbNames.begin(); i != dbNames.end(); ++i ) { + BSONObjBuilder b; + b.append( "name", *i ); + + boost::intmax_t size = dbSize( i->c_str() ); + b.append( "sizeOnDisk", (double) size ); + totalSize += size; + + { + Client::ReadContext rc( *i + ".system.namespaces" ); + b.appendBool( "empty", rc.ctx().db()->isEmpty() ); + } + + dbInfos.push_back( b.obj() ); + + seen.insert( i->c_str() ); + } + + // TODO: erh 1/1/2010 I think this is broken where path != dbpath ?? + set<string> allShortNames; + { + readlock lk; + dbHolder().getAllShortNames( false, allShortNames ); + } + + for ( set<string>::iterator i = allShortNames.begin(); i != allShortNames.end(); i++ ) { + string name = *i; + + if ( seen.count( name ) ) + continue; + + BSONObjBuilder b; + b.append( "name" , name ); + b.append( "sizeOnDisk" , (double)1.0 ); + + { + readlock lk( name ); + Client::Context ctx( name ); + b.appendBool( "empty", ctx.db()->isEmpty() ); + } + + dbInfos.push_back( b.obj() ); + } + + result.append( "databases", dbInfos ); + result.append( "totalSize", double( totalSize ) ); + return true; + } + } cmdListDatabases; + + /* note an access to a database right after this will open it back up - so this is mainly + for diagnostic purposes. + */ + class CmdCloseAllDatabases : public Command { + public: + virtual void help( stringstream& help ) const { help << "Close all database files.\nA new request will cause an immediate reopening; thus, this is mostly for testing purposes."; } + virtual bool adminOnly() const { return true; } + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + + CmdCloseAllDatabases() : Command( "closeAllDatabases" ) {} + bool run(const string& dbname , BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + bool ok; + try { + ok = dbHolderW().closeAll( dbpath , result, false ); + } + catch(DBException&) { + throw; + } + catch(...) 
{ + log() << "ERROR uncaught exception in command closeAllDatabases" << endl; + errmsg = "unexpected uncaught exception"; + return false; + } + return ok; + } + } cmdCloseAllDatabases; + + class CmdFileMD5 : public Command { + public: + CmdFileMD5() : Command( "filemd5" ) {} + virtual bool slaveOk() const { + return true; + } + virtual void help( stringstream& help ) const { + help << " example: { filemd5 : ObjectId(aaaaaaa) , root : \"fs\" }"; + } + virtual LockType locktype() const { return READ; } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + string ns = dbname; + ns += "."; + { + string root = jsobj.getStringField( "root" ); + if ( root.size() == 0 ) + root = "fs"; + ns += root; + } + ns += ".chunks"; // make this an option in jsobj + + md5digest d; + md5_state_t st; + md5_init(&st); + + BSONObj query = BSON( "files_id" << jsobj["filemd5"] ); + BSONObj sort = BSON( "files_id" << 1 << "n" << 1 ); + + shared_ptr<Cursor> cursor = bestGuessCursor(ns.c_str(), query, sort); + if ( ! cursor ) { + errmsg = "need an index on { files_id : 1 , n : 1 }"; + return false; + } + auto_ptr<ClientCursor> cc (new ClientCursor(QueryOption_NoCursorTimeout, cursor, ns.c_str())); + + int n = 0; + while ( cursor->ok() ) { + if ( ! cursor->matcher()->matchesCurrent( cursor.get() ) ) { + log() << "**** NOT MATCHING ****" << endl; + PRINT(cursor->current()); + cursor->advance(); + continue; + } + + BSONObj obj = cursor->current(); + cursor->advance(); + + BSONElement ne = obj["n"]; + assert(ne.isNumber()); + int myn = ne.numberInt(); + if ( n != myn ) { + log() << "should have chunk: " << n << " have:" << myn << endl; + dumpChunks( ns , query , sort ); + uassert( 10040 , "chunks out of order" , n == myn ); + } + + int len; + const char * data = obj["data"].binDataClean( len ); + + ClientCursor::YieldLock yield (cc.get()); + try { + md5_append( &st , (const md5_byte_t*)(data) , len ); + n++; + } + catch (...) { + if ( ! yield.stillOk() ) // relocks + cc.release(); + throw; + } + + if ( ! yield.stillOk() ) { + cc.release(); + uasserted(13281, "File deleted during filemd5 command"); + } + } + + md5_finish(&st, d); + + result.append( "numChunks" , n ); + result.append( "md5" , digestToString( d ) ); + return true; + } + + void dumpChunks( const string& ns , const BSONObj& query , const BSONObj& sort ) { + DBDirectClient client; + Query q(query); + q.sort(sort); + auto_ptr<DBClientCursor> c = client.query(ns, q); + while(c->more()) + PRINT(c->nextSafe()); + } + } cmdFileMD5; + + static IndexDetails *cmdIndexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) { + if ( ns[ 0 ] == '\0' || min.isEmpty() || max.isEmpty() ) { + errmsg = "invalid command syntax (note: min and max are required)"; + return 0; + } + return indexDetailsForRange( ns, errmsg, min, max, keyPattern ); + } + + class CmdDatasize : public Command { + virtual string parseNs(const string& dbname, const BSONObj& cmdObj) const { + return parseNsFullyQualified(dbname, cmdObj); + } + public: + CmdDatasize() : Command( "dataSize", false, "datasize" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << + "determine data size for a set of data in a certain range" + "\nexample: { dataSize:\"blog.posts\", keyPattern:{x:1}, min:{x:10}, max:{x:55} }" + "\nkeyPattern, min, and max parameters are optional." 
+ "\nnote: This command may take a while to run"; + } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + Timer timer; + + string ns = jsobj.firstElement().String(); + BSONObj min = jsobj.getObjectField( "min" ); + BSONObj max = jsobj.getObjectField( "max" ); + BSONObj keyPattern = jsobj.getObjectField( "keyPattern" ); + bool estimate = jsobj["estimate"].trueValue(); + + Client::Context ctx( ns ); + NamespaceDetails *d = nsdetails(ns.c_str()); + + if ( ! d || d->stats.nrecords == 0 ) { + result.appendNumber( "size" , 0 ); + result.appendNumber( "numObjects" , 0 ); + result.append( "millis" , timer.millis() ); + return true; + } + + result.appendBool( "estimate" , estimate ); + + shared_ptr<Cursor> c; + if ( min.isEmpty() && max.isEmpty() ) { + if ( estimate ) { + result.appendNumber( "size" , d->stats.datasize ); + result.appendNumber( "numObjects" , d->stats.nrecords ); + result.append( "millis" , timer.millis() ); + return 1; + } + c = theDataFileMgr.findAll( ns.c_str() ); + } + else if ( min.isEmpty() || max.isEmpty() ) { + errmsg = "only one of min or max specified"; + return false; + } + else { + IndexDetails *idx = cmdIndexDetailsForRange( ns.c_str(), errmsg, min, max, keyPattern ); + if ( idx == 0 ) + return false; + + c.reset( BtreeCursor::make( d, d->idxNo(*idx), *idx, min, max, false, 1 ) ); + } + + long long avgObjSize = d->stats.datasize / d->stats.nrecords; + + long long maxSize = jsobj["maxSize"].numberLong(); + long long maxObjects = jsobj["maxObjects"].numberLong(); + + long long size = 0; + long long numObjects = 0; + while( c->ok() ) { + + if ( estimate ) + size += avgObjSize; + else + size += c->currLoc().rec()->netLength(); + + numObjects++; + + if ( ( maxSize && size > maxSize ) || + ( maxObjects && numObjects > maxObjects ) ) { + result.appendBool( "maxReached" , true ); + break; + } + + c->advance(); + } + + ostringstream os; + os << "Finding size for ns: " << ns; + if ( ! min.isEmpty() ) { + os << " between " << min << " and " << max; + } + logIfSlow( timer , os.str() ); + + result.appendNumber( "size", size ); + result.appendNumber( "numObjects" , numObjects ); + result.append( "millis" , timer.millis() ); + return true; + } + } cmdDatasize; + + namespace { + long long getIndexSizeForCollection(string db, string ns, BSONObjBuilder* details=NULL, int scale = 1 ) { + d.dbMutex.assertAtLeastReadLocked(); + + NamespaceDetails * nsd = nsdetails( ns.c_str() ); + if ( ! nsd ) + return 0; + + long long totalSize = 0; + + NamespaceDetails::IndexIterator ii = nsd->ii(); + while ( ii.more() ) { + IndexDetails& d = ii.next(); + string collNS = d.indexNamespace(); + NamespaceDetails * mine = nsdetails( collNS.c_str() ); + if ( ! mine ) { + log() << "error: have index [" << collNS << "] but no NamespaceDetails" << endl; + continue; + } + totalSize += mine->stats.datasize; + if ( details ) + details->appendNumber( d.indexName() , mine->stats.datasize / scale ); + } + return totalSize; + } + } + + class CollectionStats : public Command { + public: + CollectionStats() : Command( "collStats", false, "collstats" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << "{ collStats:\"blog.posts\" , scale : 1 } scale divides sizes e.g. 
for KB use 1024\n" + " avgObjSize - in bytes"; + } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + string ns = dbname + "." + jsobj.firstElement().valuestr(); + Client::Context cx( ns ); + + NamespaceDetails * nsd = nsdetails( ns.c_str() ); + if ( ! nsd ) { + errmsg = "ns not found"; + return false; + } + + result.append( "ns" , ns.c_str() ); + + int scale = 1; + if ( jsobj["scale"].isNumber() ) { + scale = jsobj["scale"].numberInt(); + if ( scale <= 0 ) { + errmsg = "scale has to be > 0"; + return false; + } + } + else if ( jsobj["scale"].trueValue() ) { + errmsg = "scale has to be a number > 0"; + return false; + } + + bool verbose = jsobj["verbose"].trueValue(); + + long long size = nsd->stats.datasize / scale; + result.appendNumber( "count" , nsd->stats.nrecords ); + result.appendNumber( "size" , size ); + if( nsd->stats.nrecords ) + result.append ( "avgObjSize" , double(size) / double(nsd->stats.nrecords) ); + + int numExtents; + BSONArrayBuilder extents; + + result.appendNumber( "storageSize" , nsd->storageSize( &numExtents , verbose ? &extents : 0 ) / scale ); + result.append( "numExtents" , numExtents ); + result.append( "nindexes" , nsd->nIndexes ); + result.append( "lastExtentSize" , nsd->lastExtentSize / scale ); + result.append( "paddingFactor" , nsd->paddingFactor ); + result.append( "flags" , nsd->flags ); + + BSONObjBuilder indexSizes; + result.appendNumber( "totalIndexSize" , getIndexSizeForCollection(dbname, ns, &indexSizes, scale) / scale ); + result.append("indexSizes", indexSizes.obj()); + + if ( nsd->capped ) { + result.append( "capped" , nsd->capped ); + result.append( "max" , nsd->max ); + } + + if ( verbose ) + result.appendArray( "extents" , extents.arr() ); + + return true; + } + } cmdCollectionStats; + + class DBStats : public Command { + public: + DBStats() : Command( "dbStats", false, "dbstats" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return READ; } + virtual void help( stringstream &help ) const { + help << + "Get stats on a database. Not instantaneous. Slower for databases with large .ns files.\n" << + "Example: { dbStats:1, scale:1 }"; + } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + int scale = 1; + if ( jsobj["scale"].isNumber() ) { + scale = jsobj["scale"].numberInt(); + if ( scale <= 0 ) { + errmsg = "scale has to be > 0"; + return false; + } + } + else if ( jsobj["scale"].trueValue() ) { + errmsg = "scale has to be a number > 0"; + return false; + } + + list<string> collections; + Database* d = cc().database(); + if ( d ) + d->namespaceIndex.getNamespaces( collections ); + + long long ncollections = 0; + long long objects = 0; + long long size = 0; + long long storageSize = 0; + long long numExtents = 0; + long long indexes = 0; + long long indexSize = 0; + + for (list<string>::const_iterator it = collections.begin(); it != collections.end(); ++it) { + const string ns = *it; + + NamespaceDetails * nsd = nsdetails( ns.c_str() ); + if ( ! 
nsd ) { + errmsg = "missing ns: "; + errmsg += ns; + return false; + } + + ncollections += 1; + objects += nsd->stats.nrecords; + size += nsd->stats.datasize; + + int temp; + storageSize += nsd->storageSize( &temp ); + numExtents += temp; + + indexes += nsd->nIndexes; + indexSize += getIndexSizeForCollection(dbname, ns); + } + + result.append ( "db" , dbname ); + result.appendNumber( "collections" , ncollections ); + result.appendNumber( "objects" , objects ); + result.append ( "avgObjSize" , objects == 0 ? 0 : double(size) / double(objects) ); + result.appendNumber( "dataSize" , size / scale ); + result.appendNumber( "storageSize" , storageSize / scale); + result.appendNumber( "numExtents" , numExtents ); + result.appendNumber( "indexes" , indexes ); + result.appendNumber( "indexSize" , indexSize / scale ); + if( d ) { + result.appendNumber( "fileSize" , d->fileSize() / scale ); + result.appendNumber( "nsSizeMB", (int) d->namespaceIndex.fileLength() / 1024 / 1024 ); + } + + return true; + } + } cmdDBStats; + + /* convertToCapped seems to use this */ + class CmdCloneCollectionAsCapped : public Command { + public: + CmdCloneCollectionAsCapped() : Command( "cloneCollectionAsCapped" ) {} + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream &help ) const { + help << "{ cloneCollectionAsCapped:<fromName>, toCollection:<toName>, size:<sizeInBytes> }"; + } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + string from = jsobj.getStringField( "cloneCollectionAsCapped" ); + string to = jsobj.getStringField( "toCollection" ); + long long size = (long long)jsobj.getField( "size" ).number(); + + if ( from.empty() || to.empty() || size == 0 ) { + errmsg = "invalid command spec"; + return false; + } + + string fromNs = dbname + "." + from; + string toNs = dbname + "." + to; + NamespaceDetails *nsd = nsdetails( fromNs.c_str() ); + massert( 10301 , "source collection " + fromNs + " does not exist", nsd ); + long long excessSize = nsd->stats.datasize - size * 2; // datasize and extentSize can't be compared exactly, so add some padding to 'size' + DiskLoc extent = nsd->firstExtent; + for( ; excessSize > extent.ext()->length && extent != nsd->lastExtent; extent = extent.ext()->xnext ) { + excessSize -= extent.ext()->length; + log( 2 ) << "cloneCollectionAsCapped skipping extent of size " << extent.ext()->length << endl; + log( 6 ) << "excessSize: " << excessSize << endl; + } + DiskLoc startLoc = extent.ext()->firstRecord; + + CursorId id; + { + shared_ptr<Cursor> c = theDataFileMgr.findAll( fromNs.c_str(), startLoc ); + ClientCursor *cc = new ClientCursor(0, c, fromNs.c_str()); + id = cc->cursorid(); + } + + DBDirectClient client; + Client::Context ctx( toNs ); + BSONObjBuilder spec; + spec.appendBool( "capped", true ); + spec.append( "size", double( size ) ); + if ( !userCreateNS( toNs.c_str(), spec.done(), errmsg, true ) ) + return false; + + auto_ptr< DBClientCursor > c = client.getMore( fromNs, id ); + while( c->more() ) { + BSONObj obj = c->next(); + theDataFileMgr.insertAndLog( toNs.c_str(), obj, true ); + getDur().commitIfNeeded(); + } + + return true; + } + } cmdCloneCollectionAsCapped; + + /* jan2010: + Converts the given collection to a capped collection w/ the specified size. + This command is not highly used, and is not currently supported with sharded + environments. 
+ */ + class CmdConvertToCapped : public Command { + public: + CmdConvertToCapped() : Command( "convertToCapped" ) {} + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + virtual void help( stringstream &help ) const { + help << "{ convertToCapped:<fromCollectionName>, size:<sizeInBytes> }"; + } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + BackgroundOperation::assertNoBgOpInProgForDb(dbname.c_str()); + + string from = jsobj.getStringField( "convertToCapped" ); + long long size = (long long)jsobj.getField( "size" ).number(); + + if ( from.empty() || size == 0 ) { + errmsg = "invalid command spec"; + return false; + } + + string shortTmpName = str::stream() << ".tmp.convertToCapped." << from; + string longTmpName = str::stream() << dbname << "." << shortTmpName; + + DBDirectClient client; + client.dropCollection( longTmpName ); + + BSONObj info; + if ( !client.runCommand( dbname , + BSON( "cloneCollectionAsCapped" << from << "toCollection" << shortTmpName << "size" << double( size ) ), + info ) ) { + errmsg = "cloneCollectionAsCapped failed: " + info.toString(); + return false; + } + + if ( !client.dropCollection( dbname + "." + from ) ) { + errmsg = "failed to drop original collection"; + return false; + } + + if ( !client.runCommand( "admin", + BSON( "renameCollection" << longTmpName << + "to" << ( dbname + "." + from ) ), + info ) ) { + errmsg = "renameCollection failed: " + info.toString(); + return false; + } + + return true; + } + } cmdConvertToCapped; + + /* Returns client's uri */ + class CmdWhatsMyUri : public Command { + public: + CmdWhatsMyUri() : Command("whatsmyuri") { } + virtual bool slaveOk() const { + return true; + } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "{whatsmyuri:1}"; + } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + BSONObj info = cc().curop()->infoNoauth(); + result << "you" << info[ "client" ]; + return true; + } + } cmdWhatsMyUri; + + /* For testing only, not for general use */ + class GodInsert : public Command { + public: + GodInsert() : Command( "godinsert" ) { } + virtual bool adminOnly() const { return false; } + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool requiresAuth() { return true; } + virtual void help( stringstream &help ) const { + help << "internal. for testing only."; + } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + if ( ! ai->isLocalHost ) { + errmsg = "godinsert only works locally"; + return false; + } + + string coll = cmdObj[ "godinsert" ].valuestrsafe(); + log() << "test only command godinsert invoked coll:" << coll << endl; + uassert( 13049, "godinsert must specify a collection", !coll.empty() ); + string ns = dbname + "." 
+ coll; + BSONObj obj = cmdObj[ "obj" ].embeddedObjectUserCheck(); + { + dblock lk; + Client::Context ctx( ns ); + theDataFileMgr.insertWithObjMod( ns.c_str(), obj, true ); + } + return true; + } + } cmdGodInsert; + + class DBHashCmd : public Command { + public: + DBHashCmd() : Command( "dbHash", false, "dbhash" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return READ; } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + list<string> colls; + Database* db = cc().database(); + if ( db ) + db->namespaceIndex.getNamespaces( colls ); + colls.sort(); + + result.appendNumber( "numCollections" , (long long)colls.size() ); + result.append( "host" , prettyHostName() ); + + md5_state_t globalState; + md5_init(&globalState); + + BSONObjBuilder bb( result.subobjStart( "collections" ) ); + for ( list<string>::iterator i=colls.begin(); i != colls.end(); i++ ) { + string c = *i; + if ( c.find( ".system.profil" ) != string::npos ) + continue; + + shared_ptr<Cursor> cursor; + + NamespaceDetails * nsd = nsdetails( c.c_str() ); + + // debug SERVER-761 + NamespaceDetails::IndexIterator ii = nsd->ii(); + while( ii.more() ) { + const IndexDetails &idx = ii.next(); + if ( !idx.head.isValid() || !idx.info.isValid() ) { + log() << "invalid index for ns: " << c << " " << idx.head << " " << idx.info; + if ( idx.info.isValid() ) + log() << " " << idx.info.obj(); + log() << endl; + } + } + + int idNum = nsd->findIdIndex(); + if ( idNum >= 0 ) { + cursor.reset( BtreeCursor::make( nsd , idNum , nsd->idx( idNum ) , BSONObj() , BSONObj() , false , 1 ) ); + } + else if ( c.find( ".system." ) != string::npos ) { + continue; + } + else if ( nsd->capped ) { + cursor = findTableScan( c.c_str() , BSONObj() ); + } + else { + log() << "can't find _id index for: " << c << endl; + continue; + } + + md5_state_t st; + md5_init(&st); + + long long n = 0; + while ( cursor->ok() ) { + BSONObj c = cursor->current(); + md5_append( &st , (const md5_byte_t*)c.objdata() , c.objsize() ); + n++; + cursor->advance(); + } + md5digest d; + md5_finish(&st, d); + string hash = digestToString( d ); + + bb.append( c.c_str() + ( dbname.size() + 1 ) , hash ); + + md5_append( &globalState , (const md5_byte_t*)hash.c_str() , hash.size() ); + } + bb.done(); + + md5digest d; + md5_finish(&globalState, d); + string hash = digestToString( d ); + + result.append( "md5" , hash ); + + return 1; + } + + } dbhashCmd; + + /* for diagnostic / testing purposes. */ + class CmdSleep : public Command { + public: + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return true; } + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { return true; } + virtual void help( stringstream& help ) const { + help << "internal testing command. Makes db block (in a read lock) for 100 seconds\n"; + help << "w:true write lock. 
secs:<seconds>"; + } + CmdSleep() : Command("sleep") { } + bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + log() << "test only command sleep invoked" << endl; + int secs = 100; + if ( cmdObj["secs"].isNumber() ) + secs = cmdObj["secs"].numberInt(); + if( cmdObj.getBoolField("w") ) { + writelock lk(""); + sleepsecs(secs); + } + else { + readlock lk(""); + sleepsecs(secs); + } + return true; + } + } cmdSleep; + + // just for testing + class CapTrunc : public Command { + public: + CapTrunc() : Command( "captrunc" ) {} + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + virtual bool requiresAuth() { return true; } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + string coll = cmdObj[ "captrunc" ].valuestrsafe(); + uassert( 13416, "captrunc must specify a collection", !coll.empty() ); + string ns = dbname + "." + coll; + int n = cmdObj.getIntField( "n" ); + + // inclusive range? + bool inc = cmdObj.getBoolField( "inc" ); + NamespaceDetails *nsd = nsdetails( ns.c_str() ); + ReverseCappedCursor c( nsd ); + massert( 13417, "captrunc collection not found or empty", c.ok() ); + for( int i = 0; i < n; ++i ) { + massert( 13418, "captrunc invalid n", c.advance() ); + } + DiskLoc end = c.currLoc(); + nsd->cappedTruncateAfter( ns.c_str(), end, inc ); + return true; + } + } capTruncCmd; + + // just for testing + class EmptyCapped : public Command { + public: + EmptyCapped() : Command( "emptycapped" ) {} + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + virtual bool requiresAuth() { return true; } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + string coll = cmdObj[ "emptycapped" ].valuestrsafe(); + uassert( 13428, "emptycapped must specify a collection", !coll.empty() ); + string ns = dbname + "." + coll; + NamespaceDetails *nsd = nsdetails( ns.c_str() ); + massert( 13429, "emptycapped no such collection", nsd ); + nsd->emptyCappedCollection( ns.c_str() ); + return true; + } + } emptyCappedCmd; + + bool _execCommand(Command *c, const string& dbname, BSONObj& cmdObj, int queryOptions, BSONObjBuilder& result, bool fromRepl) { + + try { + string errmsg; + if ( ! 
c->run(dbname, cmdObj, queryOptions, errmsg, result, fromRepl ) ) { + result.append( "errmsg" , errmsg ); + return false; + } + } + catch ( SendStaleConfigException& e ){ + log(1) << "command failed because of stale config, can retry" << causedBy( e ) << endl; + throw; + } + catch ( DBException& e ) { + + // TODO: Rethrown errors have issues here, should divorce SendStaleConfigException from the DBException tree + + stringstream ss; + ss << "exception: " << e.what(); + result.append( "errmsg" , ss.str() ); + result.append( "code" , e.getCode() ); + return false; + } + + return true; + } + + /** + * this handles + - auth + - maintenance mode + - locking + - context + then calls run() + */ + bool execCommand( Command * c , + Client& client , int queryOptions , + const char *cmdns, BSONObj& cmdObj , + BSONObjBuilder& result, + bool fromRepl ) { + + string dbname = nsToDatabase( cmdns ); + + AuthenticationInfo *ai = client.getAuthenticationInfo(); + + if( c->adminOnly() && c->localHostOnlyIfNoAuth( cmdObj ) && noauth && !ai->isLocalHost ) { + result.append( "errmsg" , + "unauthorized: this command must run from localhost when running db without auth" ); + log() << "command denied: " << cmdObj.toString() << endl; + return false; + } + + if ( c->adminOnly() && ! fromRepl && dbname != "admin" ) { + result.append( "errmsg" , "access denied; use admin db" ); + log() << "command denied: " << cmdObj.toString() << endl; + return false; + } + + if ( cmdObj["help"].trueValue() ) { + client.curop()->ensureStarted(); + stringstream ss; + ss << "help for: " << c->name << " "; + c->help( ss ); + result.append( "help" , ss.str() ); + result.append( "lockType" , c->locktype() ); + return true; + } + + bool canRunHere = + isMaster( dbname.c_str() ) || + c->slaveOk() || + ( c->slaveOverrideOk() && ( queryOptions & QueryOption_SlaveOk ) ) || + fromRepl; + + if ( ! canRunHere ) { + result.append( "errmsg" , "not master" ); + result.append( "note" , "from execCommand" ); + return false; + } + + if ( ! c->maintenanceOk() && theReplSet && ! isMaster( dbname.c_str() ) && ! theReplSet->isSecondary() ) { + result.append( "errmsg" , "node is recovering" ); + result.append( "note" , "from execCommand" ); + return false; + } + + if ( c->adminOnly() ) + log( 2 ) << "command: " << cmdObj << endl; + + if (c->maintenanceMode() && theReplSet && theReplSet->isSecondary()) { + theReplSet->setMaintenanceMode(true); + } + + bool retval = false; + if ( c->locktype() == Command::NONE ) { + // we also trust that this won't crash + retval = true; + + if ( c->requiresAuth() ) { + // test that the user has at least read permissions + if ( ! client.getAuthenticationInfo()->isAuthorizedReads( dbname ) ) { + result.append( "errmsg" , "need to login" ); + retval = false; + } + } + + if (retval) { + client.curop()->ensureStarted(); + retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl ); + } + } + else if( c->locktype() != Command::WRITE ) { + // read lock + assert( ! c->logTheOp() ); + string ns = c->parseNs(dbname, cmdObj); + Client::ReadContext ctx( ns , dbpath, c->requiresAuth() ); // read locks + client.curop()->ensureStarted(); + retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl ); + } + else { + dassert( c->locktype() == Command::WRITE ); + writelock lk; + client.curop()->ensureStarted(); + Client::Context ctx( dbname , dbpath , c->requiresAuth() ); + retval = _execCommand(c, dbname , cmdObj , queryOptions, result , fromRepl ); + if ( retval && c->logTheOp() && ! 
fromRepl ) { + logOp("c", cmdns, cmdObj); + } + } + + if (c->maintenanceMode() && theReplSet) { + theReplSet->setMaintenanceMode(false); + } + + return retval; + } + + + /* TODO make these all command objects -- legacy stuff here + + usage: + abc.$cmd.findOne( { ismaster:1 } ); + + returns true if ran a cmd + */ + bool _runCommands(const char *ns, BSONObj& _cmdobj, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { + string dbname = nsToDatabase( ns ); + + if( logLevel >= 1 ) + log() << "run command " << ns << ' ' << _cmdobj << endl; + + const char *p = strchr(ns, '.'); + if ( !p ) return false; + if ( strcmp(p, ".$cmd") != 0 ) return false; + + BSONObj jsobj; + { + BSONElement e = _cmdobj.firstElement(); + if ( e.type() == Object && (e.fieldName()[0] == '$' + ? str::equals("query", e.fieldName()+1) + : str::equals("query", e.fieldName()))) + { + jsobj = e.embeddedObject(); + } + else { + jsobj = _cmdobj; + } + } + + Client& client = cc(); + bool ok = false; + + BSONElement e = jsobj.firstElement(); + + Command * c = e.type() ? Command::findCommand( e.fieldName() ) : 0; + + if ( c ) { + ok = execCommand( c , client , queryOptions , ns , jsobj , anObjBuilder , fromRepl ); + } + else { + anObjBuilder.append("errmsg", str::stream() << "no such cmd: " << e.fieldName() ); + anObjBuilder.append("bad cmd" , _cmdobj ); + } + + // switch to bool, but wait a bit longer before switching? + // anObjBuilder.append("ok", ok); + anObjBuilder.append("ok", ok?1.0:0.0); + BSONObj x = anObjBuilder.done(); + b.appendBuf((void*) x.objdata(), x.objsize()); + + return true; + } + +} // namespace mongo diff --git a/src/mongo/db/dbcommands_admin.cpp b/src/mongo/db/dbcommands_admin.cpp new file mode 100644 index 00000000000..ffcc3f261fe --- /dev/null +++ b/src/mongo/db/dbcommands_admin.cpp @@ -0,0 +1,550 @@ +// dbcommands_admin.cpp + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + this file has dbcommands that are for dba type administration + mostly around dbs and collections + NOT system stuff +*/ + + +#include "pch.h" +#include "jsobj.h" +#include "pdfile.h" +#include "namespace-inl.h" +#include "commands.h" +#include "cmdline.h" +#include "btree.h" +#include "curop-inl.h" +#include "../util/background.h" +#include "../util/logfile.h" +#include "../util/alignedbuilder.h" +#include "../util/paths.h" +#include "../scripting/engine.h" +#include "../util/timer.h" + +namespace mongo { + + class CleanCmd : public Command { + public: + CleanCmd() : Command( "clean" ) {} + + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return WRITE; } + + virtual void help(stringstream& h) const { h << "internal"; } + + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + string dropns = dbname + "." 
+ cmdObj.firstElement().valuestrsafe(); + + if ( !cmdLine.quiet ) + tlog() << "CMD: clean " << dropns << endl; + + NamespaceDetails *d = nsdetails(dropns.c_str()); + + if ( ! d ) { + errmsg = "ns not found"; + return 0; + } + + for ( int i = 0; i < Buckets; i++ ) + d->deletedList[i].Null(); + + result.append("ns", dropns.c_str()); + return 1; + } + + } cleanCmd; + + namespace dur { + boost::filesystem::path getJournalDir(); + } + + class JournalLatencyTestCmd : public Command { + public: + JournalLatencyTestCmd() : Command( "journalLatencyTest" ) {} + + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return true; } + virtual void help(stringstream& h) const { h << "test how long to write and fsync to a test file in the journal/ directory"; } + + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + boost::filesystem::path p = dur::getJournalDir(); + p /= "journalLatencyTest"; + + // remove file if already present + try { + remove(p); + } + catch(...) { } + + BSONObjBuilder bb[2]; + for( int pass = 0; pass < 2; pass++ ) { + LogFile f(p.string()); + AlignedBuilder b(1024 * 1024); + { + Timer t; + for( int i = 0 ; i < 100; i++ ) { + f.synchronousAppend(b.buf(), 8192); + } + bb[pass].append("8KB", t.millis() / 100.0); + } + { + const int N = 50; + Timer t2; + long long x = 0; + for( int i = 0 ; i < N; i++ ) { + Timer t; + f.synchronousAppend(b.buf(), 8192); + x += t.micros(); + sleepmillis(4); + } + long long y = t2.micros() - 4*N*1000; + // not really trusting the timer granularity on all platforms so whichever is higher of x and y + bb[pass].append("8KBWithPauses", max(x,y) / (N*1000.0)); + } + { + Timer t; + for( int i = 0 ; i < 20; i++ ) { + f.synchronousAppend(b.buf(), 1024 * 1024); + } + bb[pass].append("1MB", t.millis() / 20.0); + } + // second time around, we are prealloced. + } + result.append("timeMillis", bb[0].obj()); + result.append("timeMillisWithPrealloc", bb[1].obj()); + + try { + remove(p); + } + catch(...) { } + + try { + result.append("onSamePartition", onSamePartition(dur::getJournalDir().string(), dbpath)); + } + catch(...) { } + + return 1; + } + } journalLatencyTestCmd; + + class ValidateCmd : public Command { + public: + ValidateCmd() : Command( "validate" ) {} + + virtual bool slaveOk() const { + return true; + } + + virtual void help(stringstream& h) const { h << "Validate contents of a namespace by scanning its data structures for correctness. Slow.\n" + "Add full:true option to do a more thorough check"; } + + virtual LockType locktype() const { return READ; } + //{ validate: "collectionnamewithoutthedbpart" [, scandata: <bool>] [, full: <bool> } */ + + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + string ns = dbname + "." + cmdObj.firstElement().valuestrsafe(); + NamespaceDetails * d = nsdetails( ns.c_str() ); + if ( !cmdLine.quiet ) + tlog() << "CMD: validate " << ns << endl; + + if ( ! 
d ) { + errmsg = "ns not found"; + return 0; + } + + result.append( "ns", ns ); + validateNS( ns.c_str() , d, cmdObj, result); + return 1; + } + + private: + void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) { + const bool full = cmdObj["full"].trueValue(); + const bool scanData = full || cmdObj["scandata"].trueValue(); + + bool valid = true; + BSONArrayBuilder errors; // explanation(s) for why valid = false + if ( d->capped ){ + result.append("capped", d->capped); + result.append("max", d->max); + } + + result.append("firstExtent", str::stream() << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()); + result.append( "lastExtent", str::stream() << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString()); + + BSONArrayBuilder extentData; + + try { + d->firstExtent.ext()->assertOk(); + d->lastExtent.ext()->assertOk(); + + DiskLoc el = d->firstExtent; + int ne = 0; + while( !el.isNull() ) { + Extent *e = el.ext(); + e->assertOk(); + el = e->xnext; + ne++; + if ( full ) + extentData << e->dump(); + + killCurrentOp.checkForInterrupt(); + } + result.append("extentCount", ne); + } + catch (...) { + valid=false; + errors << "extent asserted"; + } + + if ( full ) + result.appendArray( "extents" , extentData.arr() ); + + + result.appendNumber("datasize", d->stats.datasize); + result.appendNumber("nrecords", d->stats.nrecords); + result.appendNumber("lastExtentSize", d->lastExtentSize); + result.appendNumber("padding", d->paddingFactor); + + + try { + + try { + result.append("firstExtentDetails", d->firstExtent.ext()->dump()); + + valid = valid && d->firstExtent.ext()->validates() && + d->firstExtent.ext()->xprev.isNull(); + } + catch (...) { + errors << "exception firstextent"; + valid = false; + } + + set<DiskLoc> recs; + if( scanData ) { + shared_ptr<Cursor> c = theDataFileMgr.findAll(ns); + int n = 0; + int nInvalid = 0; + long long len = 0; + long long nlen = 0; + int outOfOrder = 0; + DiskLoc cl_last; + while ( c->ok() ) { + n++; + + DiskLoc cl = c->currLoc(); + if ( n < 1000000 ) + recs.insert(cl); + if ( d->capped ) { + if ( cl < cl_last ) + outOfOrder++; + cl_last = cl; + } + + Record *r = c->_current(); + len += r->lengthWithHeaders; + nlen += r->netLength(); + + if (full){ + BSONObj obj(r); + if (!obj.isValid() || !obj.valid()){ // both fast and deep checks + valid = false; + if (nInvalid == 0) // only log once; + errors << "invalid bson object detected (see logs for more info)"; + + nInvalid++; + if (strcmp("_id", obj.firstElementFieldName()) == 0){ + try { + obj.firstElement().validate(); // throws on error + log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl; + } + catch(...){ + log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl; + } + } + else { + log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl; + } + } + } + + c->advance(); + } + if ( d->capped && !d->capLooped() ) { + result.append("cappedOutOfOrder", outOfOrder); + if ( outOfOrder > 1 ) { + valid = false; + errors << "too many out of order records"; + } + } + result.append("objectsFound", n); + + if (full) { + result.append("invalidObjects", nInvalid); + } + + result.appendNumber("bytesWithHeaders", len); + result.appendNumber("bytesWithoutHeaders", nlen); + } + + BSONArrayBuilder deletedListArray; + for ( int i = 0; i < Buckets; i++ ) { + deletedListArray << d->deletedList[i].isNull(); + } + + int ndel = 0; + long 
long delSize = 0; + int incorrect = 0; + for ( int i = 0; i < Buckets; i++ ) { + DiskLoc loc = d->deletedList[i]; + try { + int k = 0; + while ( !loc.isNull() ) { + if ( recs.count(loc) ) + incorrect++; + ndel++; + + if ( loc.questionable() ) { + if( d->capped && !loc.isValid() && i == 1 ) { + /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid + see comments in namespace.h + */ + break; + } + + if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) { + string err (str::stream() << "bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k); + errors << err; + + valid = false; + break; + } + } + + DeletedRecord *d = loc.drec(); + delSize += d->lengthWithHeaders; + loc = d->nextDeleted; + k++; + killCurrentOp.checkForInterrupt(); + } + } + catch (...) { + errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i)); + valid = false; + } + } + result.appendNumber("deletedCount", ndel); + result.appendNumber("deletedSize", delSize); + + if ( incorrect ) { + errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list"); + valid = false; + } + + int idxn = 0; + try { + result.append("nIndexes", d->nIndexes); + BSONObjBuilder indexes; // not using subObjStart to be exception safe + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) { + IndexDetails& id = i.next(); + long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern()); + indexes.appendNumber(id.indexNamespace(), keys); + } + result.append("keysPerIndex", indexes.done()); + } + catch (...) { + errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn)); + valid=false; + } + + } + catch (AssertionException) { + errors << "exception during validate"; + valid = false; + } + + result.appendBool("valid", valid); + result.append("errors", errors.arr()); + + if ( !full ){ + result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan."); + } + + if ( !valid ) { + result.append("advice", "ns corrupt, requires repair"); + } + + } + } validateCmd; + + bool lockedForWriting = false; // read from db/instance.cpp + static bool unlockRequested = false; + static mongo::mutex fsyncLockMutex("fsyncLock"); + static boost::condition fsyncLockCondition; + static OID fsyncLockID; // identifies the current lock job + + /* + class UnlockCommand : public Command { + public: + UnlockCommand() : Command( "unlock" ) { } + virtual bool readOnly() { return true; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( lockedForWriting ) { + log() << "command: unlock requested" << endl; + errmsg = "unlock requested"; + unlockRequested = true; + } + else { + errmsg = "not locked, so cannot unlock"; + return 0; + } + return 1; + } + + } unlockCommand; + */ + /* see unlockFsync() for unlocking: + db.$cmd.sys.unlock.findOne() + */ + class FSyncCommand : public Command { + static const char* url() { return "http://www.mongodb.org/display/DOCS/fsync+Command"; } + class LockDBJob : public BackgroundJob { + protected: + virtual string name() const { return "lockdbjob"; } + void run() { + Client::initThread("fsyncjob"); + Client& c = cc(); + { + scoped_lock lk(fsyncLockMutex); + while (lockedForWriting){ // there is a small window for two LockDBJob's to be active. This prevents it. 
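+ // wait() atomically releases fsyncLockMutex while this job blocks; the + // prior job's unlock path clears lockedForWriting and calls notify_all().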
+ fsyncLockCondition.wait(lk.boost()); + } + lockedForWriting = true; + fsyncLockID.init(); + } + readlock lk(""); + MemoryMappedFile::flushAll(true); + log() << "db is now locked for snapshotting, no writes allowed. db.fsyncUnlock() to unlock" << endl; + log() << " For more info see " << FSyncCommand::url() << endl; + _ready = true; + { + scoped_lock lk(fsyncLockMutex); + while( !unlockRequested ) { + fsyncLockCondition.wait(lk.boost()); + } + unlockRequested = false; + lockedForWriting = false; + fsyncLockCondition.notify_all(); + } + c.shutdown(); + } + public: + bool& _ready; + LockDBJob(bool& ready) : BackgroundJob( true /* delete self */ ), _ready(ready) { + _ready = false; + } + }; + public: + FSyncCommand() : Command( "fsync" ) {} + virtual LockType locktype() const { return WRITE; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + /*virtual bool localHostOnlyIfNoAuth(const BSONObj& cmdObj) { + string x = cmdObj["exec"].valuestrsafe(); + return !x.empty(); + }*/ + virtual void help(stringstream& h) const { h << url(); } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + bool sync = !cmdObj["async"].trueValue(); // async means do an fsync, but return immediately + bool lock = cmdObj["lock"].trueValue(); + log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl; + + if( lock ) { + // fsync and lock variation + + uassert(12034, "fsync: can't lock while an unlock is pending", !unlockRequested); + uassert(12032, "fsync: sync option must be true when using lock", sync); + /* With releaseEarly(), we must be extremely careful we don't do anything + where we would have assumed we were locked. profiling is one of those things. + Perhaps at profile time we could check if we released early -- however, + we need to be careful to keep that code very fast it's a very common code path when on. + */ + uassert(12033, "fsync: profiling must be off to enter locked mode", cc().database()->profile == 0); + + // todo future: Perhaps we could do this in the background thread. As is now, writes may interleave between + // the releaseEarly below and the acquisition of the readlock in the background thread. + // However the real problem is that it seems complex to unlock here and then have a window for + // writes before the bg job -- can be done correctly but harder to reason about correctness. + // If this command ran within a read lock in the first place, would it work, and then that + // would be quite easy? + // Or, could we downgrade the write lock to a read lock, wait for ready, then release? + getDur().syncDataAndTruncateJournal(); + + bool ready = false; + LockDBJob *l = new LockDBJob(ready); + + d.dbMutex.releaseEarly(); + + // There is a narrow window for another lock request to come in + // here before the LockDBJob grabs the readlock. LockDBJob will + // ensure that the requests are serialized and never running + // concurrently + + l->go(); + // don't return until background thread has acquired the read lock + while( !ready ) { + sleepmillis(10); + } + result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock"); + result.append("seeAlso", url()); + } + else { + // the simple fsync command case + + if (sync) + getDur().commitNow(); + result.append( "numFiles" , MemoryMappedFile::flushAll( sync ) ); + } + return 1; + } + + } fsyncCmd; + + // Note that this will only unlock the current lock. 
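
In practice this handshake is driven from backup tooling. A minimal sketch of a client taking a snapshot under fsync+lock; the host name and the snapshot step are hypothetical, while the command document and the $cmd.sys.unlock pseudo-collection are the ones referenced in the code above:

    #include "client/dbclient.h"

    void snapshotUnderFsyncLock() {
        mongo::DBClientConnection c;
        c.connect( "backuphost:27017" );   // hypothetical host

        mongo::BSONObj info;
        // flush to disk and block writes: { fsync : 1, lock : 1 }
        c.runCommand( "admin" , BSON( "fsync" << 1 << "lock" << 1 ) , info );

        // ... take the filesystem snapshot here ...

        // unlock via the pseudo-collection, i.e. db.$cmd.sys.unlock.findOne()
        mongo::BSONObj res = c.findOne( "admin.$cmd.sys.unlock" , mongo::Query() );
    }
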
+    // If another thread relocks before we return, we still consider the unlocking
+    // successful. This is important because if two scripts are trying to fsync-lock, each
+    // one must be assured that between the fsync return and the call to unlock
+    // the database is fully locked.
+    void unlockFsyncAndWait(){
+        scoped_lock lk(fsyncLockMutex);
+        if (lockedForWriting) { // could have handled another unlock before we grabbed the lock
+            OID curOp = fsyncLockID;
+            unlockRequested = true;
+            fsyncLockCondition.notify_all();
+            while (lockedForWriting && fsyncLockID == curOp){
+                fsyncLockCondition.wait( lk.boost() );
+            }
+        }
+    }
+}
+
diff --git a/src/mongo/db/dbcommands_generic.cpp b/src/mongo/db/dbcommands_generic.cpp
new file mode 100644
index 00000000000..cfd833aa72d
--- /dev/null
+++ b/src/mongo/db/dbcommands_generic.cpp
@@ -0,0 +1,432 @@
+/** @file dbcommands_generic.cpp commands suited for any mongo server (both mongod, mongos) */
+
+/**
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "ops/query.h"
+#include "pdfile.h"
+#include "jsobj.h"
+#include "../bson/util/builder.h"
+#include <time.h>
+#include "introspect.h"
+#include "btree.h"
+#include "../util/lruishmap.h"
+#include "../util/md5.hpp"
+#include "../util/processinfo.h"
+#include "json.h"
+#include "repl.h"
+#include "repl_block.h"
+#include "replutil.h"
+#include "commands.h"
+#include "db.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "../scripting/engine.h"
+#include "stats/counters.h"
+#include "background.h"
+#include "../util/version.h"
+#include "../util/ramlog.h"
+#include "repl/multicmd.h"
+#include "server.h"
+
+namespace mongo {
+
+#if 0
+    namespace cloud {
+        SimpleMutex mtx("cloud");
+        Guarded< vector<string>, mtx > ips;
+        bool startedThread = false;
+
+        void thread() {
+            bson::bo cmd;
+            while( 1 ) {
+                list<Target> L;
+                {
+                    SimpleMutex::scoped_lock lk(mtx);
+                    if( ips.ref(lk).empty() )
+                        continue;
+                    for( unsigned i = 0; i < ips.ref(lk).size(); i++ ) {
+                        L.push_back( Target(ips.ref(lk)[i]) );
+                    }
+                }
+
+
+                /** repoll as machines might be down on the first lookup (only if not found previously) */
+                sleepsecs(6);
+            }
+        }
+    }
+
+    class CmdCloud : public Command {
+    public:
+        CmdCloud() : Command( "cloud" ) { }
+        virtual bool slaveOk() const { return true; }
+        virtual bool adminOnly() const { return true; }
+        virtual LockType locktype() const { return NONE; }
+        virtual void help( stringstream &help ) const {
+            help << "internal command facilitating running in certain cloud computing environments";
+        }
+        bool run(const string& dbname, BSONObj& obj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
+            if( obj.hasElement("servers") ) {
+                vector<string> ips;
+                obj["servers"].Obj().Vals(ips);
+                {
+                    SimpleMutex::scoped_lock lk(cloud::mtx);
+                    cloud::ips.ref(lk).swap(ips);
+                    if( !cloud::startedThread ) {
+                        cloud::startedThread = true;
+                        boost::thread thr(cloud::thread);
+                    }
+                }
+            }
+            return true;
} + } cmdCloud; +#endif + + class CmdBuildInfo : public Command { + public: + CmdBuildInfo() : Command( "buildInfo", true, "buildinfo" ) {} + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return false; } + virtual bool requiresAuth() { return false; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "get version #, etc.\n"; + help << "{ buildinfo:1 }"; + } + bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + result << "version" << versionString << "gitVersion" << gitVersion() << "sysInfo" << sysInfo(); + result << "versionArray" << versionArray; + result << "bits" << ( sizeof( int* ) == 4 ? 32 : 64 ); + result.appendBool( "debug" , debug ); + result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); + return true; + } + } cmdBuildInfo; + + /** experimental. either remove or add support in repl sets also. in a repl set, getting this setting from the + repl set config could make sense. + */ + unsigned replApplyBatchSize = 1; + + class CmdGet : public Command { + public: + CmdGet() : Command( "getParameter" ) { } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "get administrative option(s)\nexample:\n"; + help << "{ getParameter:1, notablescan:1 }\n"; + help << "supported so far:\n"; + help << " quiet\n"; + help << " notablescan\n"; + help << " logLevel\n"; + help << " syncdelay\n"; + help << "{ getParameter:'*' } to get everything\n"; + } + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + bool all = *cmdObj.firstElement().valuestrsafe() == '*'; + + int before = result.len(); + + if( all || cmdObj.hasElement("quiet") ) { + result.append("quiet", cmdLine.quiet ); + } + if( all || cmdObj.hasElement("notablescan") ) { + result.append("notablescan", cmdLine.noTableScan); + } + if( all || cmdObj.hasElement("logLevel") ) { + result.append("logLevel", logLevel); + } + if( all || cmdObj.hasElement("syncdelay") ) { + result.append("syncdelay", cmdLine.syncdelay); + } + if( all || cmdObj.hasElement("replApplyBatchSize") ) { + result.append("replApplyBatchSize", replApplyBatchSize); + } + + if ( before == result.len() ) { + errmsg = "no option found to get"; + return false; + } + return true; + } + } cmdGet; + + // tempish + bool setParmsMongodSpecific(const string& dbname, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl ); + + class CmdSet : public Command { + public: + CmdSet() : Command( "setParameter" ) { } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual void help( stringstream &help ) const { + help << "set administrative option(s)\n"; + help << "{ setParameter:1, <param>:<value> }\n"; + help << "supported so far:\n"; + help << " journalCommitInterval\n"; + help << " logLevel\n"; + help << " notablescan\n"; + help << " quiet\n"; + help << " syncdelay\n"; + } + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) { + int s = 0; + bool found = setParmsMongodSpecific(dbname, cmdObj, errmsg, result, fromRepl); + if( cmdObj.hasElement("journalCommitInterval") ) { + if( !cmdLine.dur ) { + errmsg = "journaling is off"; + return 
false; + } + int x = (int) cmdObj["journalCommitInterval"].Number(); + assert( x > 1 && x < 500 ); + cmdLine.journalCommitInterval = x; + log() << "setParameter journalCommitInterval=" << x << endl; + s++; + } + if( cmdObj.hasElement("notablescan") ) { + assert( !cmdLine.isMongos() ); + if( s == 0 ) + result.append("was", cmdLine.noTableScan); + cmdLine.noTableScan = cmdObj["notablescan"].Bool(); + s++; + } + if( cmdObj.hasElement("quiet") ) { + if( s == 0 ) + result.append("was", cmdLine.quiet ); + cmdLine.quiet = cmdObj["quiet"].Bool(); + s++; + } + if( cmdObj.hasElement("syncdelay") ) { + assert( !cmdLine.isMongos() ); + if( s == 0 ) + result.append("was", cmdLine.syncdelay ); + cmdLine.syncdelay = cmdObj["syncdelay"].Number(); + s++; + } + if( cmdObj.hasElement( "logLevel" ) ) { + if( s == 0 ) + result.append("was", logLevel ); + logLevel = cmdObj["logLevel"].numberInt(); + s++; + } + if( cmdObj.hasElement( "replApplyBatchSize" ) ) { + if( s == 0 ) + result.append("was", replApplyBatchSize ); + BSONElement e = cmdObj["replApplyBatchSize"]; + ParameterValidator * v = ParameterValidator::get( e.fieldName() ); + assert( v ); + if ( ! v->isValid( e , errmsg ) ) + return false; + replApplyBatchSize = e.numberInt(); + s++; + } + if( cmdObj.hasElement( "traceExceptions" ) ) { + if( s == 0 ) result.append( "was", DBException::traceExceptions ); + DBException::traceExceptions = cmdObj["traceExceptions"].Bool(); + s++; + } + + if( s == 0 && !found ) { + errmsg = "no option found to set, use help:true to see options "; + return false; + } + + return true; + } + } cmdSet; + + class PingCommand : public Command { + public: + PingCommand() : Command( "ping" ) {} + virtual bool slaveOk() const { return true; } + virtual void help( stringstream &help ) const { help << "a way to check that the server is alive. 
responds immediately even if server is in a db lock."; } + virtual LockType locktype() const { return NONE; } + virtual bool requiresAuth() { return false; } + virtual bool run(const string& badns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + // IMPORTANT: Don't put anything in here that might lock db - including authentication + return true; + } + } pingCmd; + + class FeaturesCmd : public Command { + public: + FeaturesCmd() : Command( "features", true ) {} + void help(stringstream& h) const { h << "return build level feature settings"; } + virtual bool slaveOk() const { return true; } + virtual bool readOnly() { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( globalScriptEngine ) { + BSONObjBuilder bb( result.subobjStart( "js" ) ); + result.append( "utf8" , globalScriptEngine->utf8Ok() ); + bb.done(); + } + if ( cmdObj["oidReset"].trueValue() ) { + result.append( "oidMachineOld" , OID::getMachineId() ); + OID::regenMachineId(); + } + result.append( "oidMachine" , OID::getMachineId() ); + return true; + } + + } featuresCmd; + + class LogRotateCmd : public Command { + public: + LogRotateCmd() : Command( "logRotate" ) {} + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return true; } + virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + rotateLogs(); + return 1; + } + + } logRotateCmd; + + class ListCommandsCmd : public Command { + public: + virtual void help( stringstream &help ) const { help << "get a list of all db commands"; } + ListCommandsCmd() : Command( "listCommands", false ) {} + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual bool adminOnly() const { return false; } + virtual bool run(const string& ns, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + BSONObjBuilder b( result.subobjStart( "commands" ) ); + for ( map<string,Command*>::iterator i=_commands->begin(); i!=_commands->end(); ++i ) { + Command * c = i->second; + + // don't show oldnames + if (i->first != c->name) + continue; + + BSONObjBuilder temp( b.subobjStart( c->name ) ); + + { + stringstream help; + c->help( help ); + temp.append( "help" , help.str() ); + } + temp.append( "lockType" , c->locktype() ); + temp.append( "slaveOk" , c->slaveOk() ); + temp.append( "adminOnly" , c->adminOnly() ); + //optionally indicates that the command can be forced to run on a slave/secondary + if ( c->slaveOverrideOk() ) temp.append( "slaveOverrideOk" , c->slaveOverrideOk() ); + temp.done(); + } + b.done(); + + return 1; + } + + } listCommandsCmd; + + bool CmdShutdown::shutdownHelper() { + Client * c = currentClient.get(); + if ( c ) { + c->shutdown(); + } + + log() << "terminating, shutdown command received" << endl; + + dbexit( EXIT_CLEAN , "shutdown called" , true ); // this never returns + assert(0); + return true; + } + + /* for testing purposes only */ + class CmdForceError : public Command { + public: + virtual void help( stringstream& help ) const { + help << "for testing purposes only. 
forces a user assertion exception"; + } + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return true; + } + virtual LockType locktype() const { return NONE; } + CmdForceError() : Command("forceerror") {} + bool run(const string& dbnamne, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + uassert( 10038 , "forced error", false); + return true; + } + } cmdForceError; + + class AvailableQueryOptions : public Command { + public: + AvailableQueryOptions() : Command( "availableQueryOptions" , false , "availablequeryoptions" ) {} + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + result << "options" << QueryOption_AllSupported; + return true; + } + } availableQueryOptionsCmd; + + class GetLogCmd : public Command { + public: + GetLogCmd() : Command( "getLog" ){} + + virtual bool slaveOk() const { return true; } + virtual LockType locktype() const { return NONE; } + virtual bool adminOnly() const { return true; } + + virtual void help( stringstream& help ) const { + help << "{ getLog : '*' } OR { getLog : 'global' }"; + } + + virtual bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) { + string p = cmdObj.firstElement().String(); + if ( p == "*" ) { + vector<string> names; + RamLog::getNames( names ); + + BSONArrayBuilder arr; + for ( unsigned i=0; i<names.size(); i++ ) { + arr.append( names[i] ); + } + + result.appendArray( "names" , arr.arr() ); + } + else { + RamLog* rl = RamLog::get( p ); + if ( ! rl ) { + errmsg = str::stream() << "no RamLog named: " << p; + return false; + } + + vector<const char*> lines; + rl->get( lines ); + + BSONArrayBuilder arr( result.subarrayStart( "log" ) ); + for ( unsigned i=0; i<lines.size(); i++ ) + arr.append( lines[i] ); + arr.done(); + } + return true; + } + + } getLogCmd; + +} diff --git a/src/mongo/db/dbeval.cpp b/src/mongo/db/dbeval.cpp new file mode 100644 index 00000000000..9e77d8c8097 --- /dev/null +++ b/src/mongo/db/dbeval.cpp @@ -0,0 +1,136 @@ +/* commands.cpp + db "commands" (sent via db.$cmd.findOne(...)) + */ + +/** +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "ops/query.h" +#include "pdfile.h" +#include "jsobj.h" +#include "../bson/util/builder.h" +#include <time.h> +#include "introspect.h" +#include "btree.h" +#include "../util/lruishmap.h" +#include "json.h" +#include "repl.h" +#include "commands.h" +#include "cmdline.h" + +#include "../scripting/engine.h" + +namespace mongo { + + const int edebug=0; + + bool dbEval(const string& dbName, BSONObj& cmd, BSONObjBuilder& result, string& errmsg) { + BSONElement e = cmd.firstElement(); + uassert( 10046 , "eval needs Code" , e.type() == Code || e.type() == CodeWScope || e.type() == String ); + + const char *code = 0; + switch ( e.type() ) { + case String: + case Code: + code = e.valuestr(); + break; + case CodeWScope: + code = e.codeWScopeCode(); + break; + default: + assert(0); + } + assert( code ); + + if ( ! globalScriptEngine ) { + errmsg = "db side execution is disabled"; + return false; + } + + auto_ptr<Scope> s = globalScriptEngine->getPooledScope( dbName ); + ScriptingFunction f = s->createFunction(code); + if ( f == 0 ) { + errmsg = (string)"compile failed: " + s->getError(); + return false; + } + + if ( e.type() == CodeWScope ) + s->init( e.codeWScopeScopeData() ); + s->localConnect( dbName.c_str() ); + + BSONObj args; + { + BSONElement argsElement = cmd.getField("args"); + if ( argsElement.type() == Array ) { + args = argsElement.embeddedObject(); + if ( edebug ) { + out() << "args:" << args.toString() << endl; + out() << "code:\n" << code << endl; + } + } + } + + int res; + { + Timer t; + res = s->invoke(f, &args, 0, cmdLine.quota ? 10 * 60 * 1000 : 0 ); + int m = t.millis(); + if ( m > cmdLine.slowMS ) { + out() << "dbeval slow, time: " << dec << m << "ms " << dbName << endl; + if ( m >= 1000 ) log() << code << endl; + else OCCASIONALLY log() << code << endl; + } + } + if ( res ) { + result.append("errno", (double) res); + errmsg = "invoke failed: "; + errmsg += s->getError(); + return false; + } + + s->append( result , "retval" , "return" ); + + return true; + } + + class CmdEval : public Command { + public: + virtual bool slaveOk() const { + return false; + } + virtual void help( stringstream &help ) const { + help << "Evaluate javascript at the server.\n" "http://www.mongodb.org/display/DOCS/Server-side+Code+Execution"; + } + virtual LockType locktype() const { return NONE; } + CmdEval() : Command("eval", false, "$eval") { } + bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + uassert( 12598 , "$eval reads unauthorized", ai->isAuthorizedReads(dbname.c_str()) ); + + if ( cmdObj["nolock"].trueValue() ) { + return dbEval(dbname, cmdObj, result, errmsg); + } + + // write security will be enforced in DBDirectClient + mongolock lk( ai->isAuthorized( dbname.c_str() ) ); + Client::Context ctx( dbname ); + + return dbEval(dbname, cmdObj, result, errmsg); + } + } cmdeval; + +} // namespace mongo diff --git a/src/mongo/db/dbhelpers.cpp b/src/mongo/db/dbhelpers.cpp new file mode 100644 index 00000000000..39540c9ce89 --- /dev/null +++ b/src/mongo/db/dbhelpers.cpp @@ -0,0 +1,353 @@ +// dbhelpers.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "db.h" +#include "dbhelpers.h" +#include "json.h" +#include "queryoptimizer.h" +#include "btree.h" +#include "pdfile.h" +#include "oplog.h" +#include "ops/update.h" +#include "ops/delete.h" + +namespace mongo { + + void Helpers::ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name) { + NamespaceDetails *d = nsdetails(ns); + if( d == 0 ) + return; + + { + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) { + if( i.next().keyPattern().woCompare(keyPattern) == 0 ) + return; + } + } + + if( d->nIndexes >= NamespaceDetails::NIndexesMax ) { + problem() << "Helper::ensureIndex fails, MaxIndexes exceeded " << ns << '\n'; + return; + } + + string system_indexes = cc().database()->name + ".system.indexes"; + + BSONObjBuilder b; + b.append("name", name); + b.append("ns", ns); + b.append("key", keyPattern); + b.appendBool("unique", unique); + BSONObj o = b.done(); + + theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize()); + } + + /* fetch a single object from collection ns that matches query + set your db SavedContext first + */ + bool Helpers::findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex) { + DiskLoc loc = findOne( ns, query, requireIndex ); + if ( loc.isNull() ) + return false; + result = loc.obj(); + return true; + } + + /* fetch a single object from collection ns that matches query + set your db SavedContext first + */ + DiskLoc Helpers::findOne(const char *ns, const BSONObj &query, bool requireIndex) { + shared_ptr<Cursor> c = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), requireIndex ); + while( c->ok() ) { + if ( c->currentMatches() && !c->getsetdup( c->currLoc() ) ) { + return c->currLoc(); + } + c->advance(); + } + return DiskLoc(); + } + + bool Helpers::findById(Client& c, const char *ns, BSONObj query, BSONObj& result , + bool * nsFound , bool * indexFound ) { + d.dbMutex.assertAtLeastReadLocked(); + Database *database = c.database(); + assert( database ); + NamespaceDetails *d = database->namespaceIndex.details(ns); + if ( ! d ) + return false; + if ( nsFound ) + *nsFound = 1; + + int idxNo = d->findIdIndex(); + if ( idxNo < 0 ) + return false; + if ( indexFound ) + *indexFound = 1; + + IndexDetails& i = d->idx( idxNo ); + + BSONObj key = i.getKeyFromQuery( query ); + + DiskLoc loc = i.idxInterface().findSingle(i , i.head , key); + if ( loc.isNull() ) + return false; + result = loc.obj(); + return true; + } + + DiskLoc Helpers::findById(NamespaceDetails *d, BSONObj idquery) { + assert(d); + int idxNo = d->findIdIndex(); + uassert(13430, "no _id index", idxNo>=0); + IndexDetails& i = d->idx( idxNo ); + BSONObj key = i.getKeyFromQuery( idquery ); + return i.idxInterface().findSingle(i , i.head , key); + } + + bool Helpers::isEmpty(const char *ns, bool doAuth) { + Client::Context context(ns, dbpath, doAuth); + shared_ptr<Cursor> c = DataFileMgr::findAll(ns); + return !c->ok(); + } + + /* Get the first object from a collection. Generally only useful if the collection + only ever has a single object -- which is a "singleton collection. 
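
Replica set and sharding metadata are read exactly this way. A short sketch of caller code, assuming a lock is already held as all these helpers require; "local.me" is a real singleton-style namespace but the surrounding logic is illustrative only:

    mongo::BSONObj me;
    if ( !mongo::Helpers::getSingleton( "local.me" , me ) ) {
        // first run on this host: seed the singleton document
        mongo::Helpers::putSingleton( "local.me" , BSON( "_id" << mongo::OID::gen() ) );
    }
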
+ + Returns: true if object exists. + */ + bool Helpers::getSingleton(const char *ns, BSONObj& result) { + Client::Context context(ns); + + shared_ptr<Cursor> c = DataFileMgr::findAll(ns); + if ( !c->ok() ) { + context.getClient()->curop()->done(); + return false; + } + + result = c->current(); + context.getClient()->curop()->done(); + return true; + } + + bool Helpers::getLast(const char *ns, BSONObj& result) { + Client::Context ctx(ns); + shared_ptr<Cursor> c = findTableScan(ns, reverseNaturalObj); + if( !c->ok() ) + return false; + result = c->current(); + return true; + } + + void Helpers::upsert( const string& ns , const BSONObj& o ) { + BSONElement e = o["_id"]; + assert( e.type() ); + BSONObj id = e.wrap(); + + OpDebug debug; + Client::Context context(ns); + updateObjects(ns.c_str(), o, /*pattern=*/id, /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug ); + } + + void Helpers::putSingleton(const char *ns, BSONObj obj) { + OpDebug debug; + Client::Context context(ns); + updateObjects(ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , /*logtheop=*/true , debug ); + context.getClient()->curop()->done(); + } + + void Helpers::putSingletonGod(const char *ns, BSONObj obj, bool logTheOp) { + OpDebug debug; + Client::Context context(ns); + _updateObjects(/*god=*/true, ns, obj, /*pattern=*/BSONObj(), /*upsert=*/true, /*multi=*/false , logTheOp , debug ); + context.getClient()->curop()->done(); + } + + BSONObj Helpers::toKeyFormat( const BSONObj& o , BSONObj& key ) { + BSONObjBuilder me; + BSONObjBuilder k; + + BSONObjIterator i( o ); + while ( i.more() ) { + BSONElement e = i.next(); + k.append( e.fieldName() , 1 ); + me.appendAs( e , "" ); + } + key = k.obj(); + return me.obj(); + } + + long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) { + BSONObj keya , keyb; + BSONObj minClean = toKeyFormat( min , keya ); + BSONObj maxClean = toKeyFormat( max , keyb ); + assert( keya == keyb ); + + Client::Context ctx(ns); + NamespaceDetails* nsd = nsdetails( ns.c_str() ); + if ( ! nsd ) + return 0; + + int ii = nsd->findIndexByKeyPattern( keya ); + assert( ii >= 0 ); + + long long num = 0; + + IndexDetails& i = nsd->idx( ii ); + + shared_ptr<Cursor> c( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) ); + auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) ); + cc->setDoingDeletes( true ); + + while ( c->ok() ) { + + if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) { + // cursor got finished by someone else, so we're done + cc.release(); // if the collection/db is dropped, cc may be deleted + break; + } + + if ( ! c->ok() ) + break; + + DiskLoc rloc = c->currLoc(); + + if ( callback ) + callback->goingToDelete( c->current() ); + + c->advance(); + c->noteLocation(); + + logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() ); + theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc); + num++; + + c->checkLocation(); + + getDur().commitIfNeeded(); + + + } + + return num; + } + + void Helpers::emptyCollection(const char *ns) { + Client::Context context(ns); + deleteObjects(ns, BSONObj(), false); + } + + DbSet::~DbSet() { + if ( name_.empty() ) + return; + try { + Client::Context c( name_.c_str() ); + if ( nsdetails( name_.c_str() ) ) { + string errmsg; + BSONObjBuilder result; + dropCollection( name_, errmsg, result ); + } + } + catch ( ... 
) { + problem() << "exception cleaning up DbSet" << endl; + } + } + + void DbSet::reset( const string &name, const BSONObj &key ) { + if ( !name.empty() ) + name_ = name; + if ( !key.isEmpty() ) + key_ = key.getOwned(); + Client::Context c( name_.c_str() ); + if ( nsdetails( name_.c_str() ) ) { + Helpers::emptyCollection( name_.c_str() ); + } + else { + string err; + massert( 10303 , err, userCreateNS( name_.c_str(), fromjson( "{autoIndexId:false}" ), err, false ) ); + } + Helpers::ensureIndex( name_.c_str(), key_, true, "setIdx" ); + } + + bool DbSet::get( const BSONObj &obj ) const { + Client::Context c( name_.c_str() ); + BSONObj temp; + return Helpers::findOne( name_.c_str(), obj, temp, true ); + } + + void DbSet::set( const BSONObj &obj, bool val ) { + Client::Context c( name_.c_str() ); + if ( val ) { + try { + BSONObj k = obj; + theDataFileMgr.insertWithObjMod( name_.c_str(), k, false ); + } + catch ( DBException& ) { + // dup key - already in set + } + } + else { + deleteObjects( name_.c_str(), obj, true, false, false ); + } + } + + RemoveSaver::RemoveSaver( const string& a , const string& b , const string& why) : _out(0) { + static int NUM = 0; + + _root = dbpath; + if ( a.size() ) + _root /= a; + if ( b.size() ) + _root /= b; + assert( a.size() || b.size() ); + + _file = _root; + + stringstream ss; + ss << why << "." << terseCurrentTime(false) << "." << NUM++ << ".bson"; + _file /= ss.str(); + + } + + RemoveSaver::~RemoveSaver() { + if ( _out ) { + _out->close(); + delete _out; + _out = 0; + } + } + + void RemoveSaver::goingToDelete( const BSONObj& o ) { + if ( ! _out ) { + create_directories( _root ); + _out = new ofstream(); + _out->open( _file.string().c_str() , ios_base::out | ios_base::binary ); + if ( ! _out->good() ) { + log( LL_WARNING ) << "couldn't create file: " << _file.string() << " for remove saving" << endl; + delete _out; + _out = 0; + return; + } + + } + _out->write( o.objdata() , o.objsize() ); + } + + +} // namespace mongo diff --git a/src/mongo/db/dbhelpers.h b/src/mongo/db/dbhelpers.h new file mode 100644 index 00000000000..99d401fa1f8 --- /dev/null +++ b/src/mongo/db/dbhelpers.h @@ -0,0 +1,159 @@ +/* @file dbhelpers.h + + db helpers are helper functions and classes that let us easily manipulate the local + database instance in-proc. +*/ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../pch.h" +#include "client.h" +#include "db.h" + +namespace mongo { + + const BSONObj reverseNaturalObj = BSON( "$natural" << -1 ); + + class Cursor; + class CoveredIndexMatcher; + + /** + all helpers assume locking is handled above them + */ + struct Helpers { + + /* ensure the specified index exists. 
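
For instance (namespace and key illustrative only):

    Helpers::ensureIndex( "test.events" , BSON( "ts" << 1 ) , /*unique=*/false , "ts_1" );
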
+
+           @param keyPattern key pattern, e.g., { ts : 1 }
+           @param name index name, e.g., "name_1"
+
+           This method can be a little (not much) cpu-slow, so you may wish to use
+               OCCASIONALLY ensureIndex(...);
+
+           Note: use ensureHaveIdIndex() for the _id index: it is faster.
+           Note: does nothing if collection does not yet exist.
+        */
+        static void ensureIndex(const char *ns, BSONObj keyPattern, bool unique, const char *name);
+
+        /* fetch a single object from collection ns that matches query.
+           set your db SavedContext first.
+
+           @param query - the query to perform. note this is the low level portion of query so "orderby : ..."
+                          won't work.
+
+           @param requireIndex if true, assert if no index for the query. a way to guard against
+                          writing a slow query.
+
+           @return true if object found
+        */
+        static bool findOne(const char *ns, const BSONObj &query, BSONObj& result, bool requireIndex = false);
+        static DiskLoc findOne(const char *ns, const BSONObj &query, bool requireIndex);
+
+        /**
+         * @param foundIndex if passed in will be set to 1 if ns and index found
+         * @return true if object found
+         */
+        static bool findById(Client&, const char *ns, BSONObj query, BSONObj& result ,
+                             bool * nsFound = 0 , bool * indexFound = 0 );
+
+        /* uasserts if no _id index.
+           @return null loc if not found */
+        static DiskLoc findById(NamespaceDetails *d, BSONObj query);
+
+        /** Get/put the first (or last) object from a collection. Generally only useful if the collection
+            only ever has a single object -- which is a "singleton collection".
+
+            You do not need to set the database (Context) before calling.
+
+            @return true if object exists.
+        */
+        static bool getSingleton(const char *ns, BSONObj& result);
+        static void putSingleton(const char *ns, BSONObj obj);
+        static void putSingletonGod(const char *ns, BSONObj obj, bool logTheOp);
+        static bool getFirst(const char *ns, BSONObj& result) { return getSingleton(ns, result); }
+        static bool getLast(const char *ns, BSONObj& result); // get last object in the collection; e.g. {$natural : -1}
+
+        /**
+         * you have to lock
+         * you do not have to have Context set
+         * o has to have an _id field or will assert
+         */
+        static void upsert( const string& ns , const BSONObj& o );
+
+        /** You do not need to set the database before calling.
+            @return true if collection is empty.
+        */
+        static bool isEmpty(const char *ns, bool doAuth=true);
+
+        // TODO: this should be somewhere else probably
+        static BSONObj toKeyFormat( const BSONObj& o , BSONObj& key );
+
+        class RemoveCallback {
+        public:
+            virtual ~RemoveCallback() {}
+            virtual void goingToDelete( const BSONObj& o ) = 0;
+        };
+        /* removeRange: operation is oplog'd */
+        static long long removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield = false , bool maxInclusive = false , RemoveCallback * callback = 0 );
+
+        /* Remove all objects from a collection.
+           You do not need to set the database before calling.
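
Chunk migration cleanup is the main consumer of removeRange. A sketch of a caller deleting one key range, with a callback that could archive each document first; the namespace, the bounds, and the callback body are hypothetical:

    class ArchiveCallback : public Helpers::RemoveCallback {
    public:
        virtual void goingToDelete( const BSONObj& o ) {
            // e.g. hand the doc to a RemoveSaver before it is deleted
        }
    };

    long long purgeChunk() {
        ArchiveCallback cb;
        return Helpers::removeRange( "test.foo" ,
                                     BSON( "x" << 0 ) ,    // min, in key format
                                     BSON( "x" << 100 ) ,  // max
                                     /*yield=*/true , /*maxInclusive=*/false , &cb );
    }
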
+ */ + static void emptyCollection(const char *ns); + + }; + + class Database; + + // manage a set using collection backed storage + class DbSet { + public: + DbSet( const string &name = "", const BSONObj &key = BSONObj() ) : + name_( name ), + key_( key.getOwned() ) { + } + ~DbSet(); + void reset( const string &name = "", const BSONObj &key = BSONObj() ); + bool get( const BSONObj &obj ) const; + void set( const BSONObj &obj, bool val ); + private: + string name_; + BSONObj key_; + }; + + + /** + * user for saving deleted bson objects to a flat file + */ + class RemoveSaver : public Helpers::RemoveCallback , boost::noncopyable { + public: + RemoveSaver( const string& type , const string& ns , const string& why); + ~RemoveSaver(); + + void goingToDelete( const BSONObj& o ); + + private: + path _root; + path _file; + ofstream* _out; + + }; + + +} // namespace mongo diff --git a/src/mongo/db/dbmessage.cpp b/src/mongo/db/dbmessage.cpp new file mode 100644 index 00000000000..c86b5a05240 --- /dev/null +++ b/src/mongo/db/dbmessage.cpp @@ -0,0 +1,108 @@ +// dbmessage.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "dbmessage.h" +#include "../client/dbclient.h" + +namespace mongo { + + string Message::toString() const { + stringstream ss; + ss << "op: " << opToString( operation() ) << " len: " << size(); + if ( operation() >= 2000 && operation() < 2100 ) { + DbMessage d(*this); + ss << " ns: " << d.getns(); + switch ( operation() ) { + case dbUpdate: { + int flags = d.pullInt(); + BSONObj q = d.nextJsObj(); + BSONObj o = d.nextJsObj(); + ss << " flags: " << flags << " query: " << q << " update: " << o; + break; + } + case dbInsert: + ss << d.nextJsObj(); + break; + case dbDelete: { + int flags = d.pullInt(); + BSONObj q = d.nextJsObj(); + ss << " flags: " << flags << " query: " << q; + break; + } + default: + ss << " CANNOT HANDLE YET"; + } + + + } + return ss.str(); + } + + + void replyToQuery(int queryResultFlags, + AbstractMessagingPort* p, Message& requestMsg, + void *data, int size, + int nReturned, int startingFrom, + long long cursorId + ) { + BufBuilder b(32768); + b.skip(sizeof(QueryResult)); + b.appendBuf(data, size); + QueryResult *qr = (QueryResult *) b.buf(); + qr->_resultFlags() = queryResultFlags; + qr->len = b.len(); + qr->setOperation(opReply); + qr->cursorId = cursorId; + qr->startingFrom = startingFrom; + qr->nReturned = nReturned; + b.decouple(); + Message resp(qr, true); + p->reply(requestMsg, resp, requestMsg.header()->id); + } + + void replyToQuery(int queryResultFlags, + AbstractMessagingPort* p, Message& requestMsg, + BSONObj& responseObj) { + replyToQuery(queryResultFlags, + p, requestMsg, + (void *) responseObj.objdata(), responseObj.objsize(), 1); + } + + void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) { + BufBuilder b; + b.skip(sizeof(QueryResult)); + b.appendBuf((void*) obj.objdata(), obj.objsize()); + 
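+        // b.skip(sizeof(QueryResult)) above reserved room for the header; overlay it onto the buffer now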
QueryResult* msgdata = (QueryResult *) b.buf(); + b.decouple(); + QueryResult *qr = msgdata; + qr->_resultFlags() = queryResultFlags; + qr->len = b.len(); + qr->setOperation(opReply); + qr->cursorId = 0; + qr->startingFrom = 0; + qr->nReturned = 1; + Message *resp = new Message(); + resp->setData(msgdata, true); // transport will free + dbresponse.response = resp; + dbresponse.responseTo = m.header()->id; + } + + + +} diff --git a/src/mongo/db/dbmessage.h b/src/mongo/db/dbmessage.h new file mode 100644 index 00000000000..a789bff849c --- /dev/null +++ b/src/mongo/db/dbmessage.h @@ -0,0 +1,282 @@ +// dbmessage.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "diskloc.h" +#include "jsobj.h" +#include "namespace-inl.h" +#include "../util/net/message.h" +#include "../client/constants.h" +#include "instance.h" + +namespace mongo { + + /* db response format + + Query or GetMore: // see struct QueryResult + int resultFlags; + int64 cursorID; + int startingFrom; + int nReturned; + list of marshalled JSObjects; + */ + +/* db request message format + + unsigned opid; // arbitary; will be echoed back + byte operation; + int options; + + then for: + + dbInsert: + string collection; + a series of JSObjects + dbDelete: + string collection; + int flags=0; // 1=DeleteSingle + JSObject query; + dbUpdate: + string collection; + int flags; // 1=upsert + JSObject query; + JSObject objectToUpdate; + objectToUpdate may include { $inc: <field> } or { $set: ... }, see struct Mod. + dbQuery: + string collection; + int nToSkip; + int nToReturn; // how many you want back as the beginning of the cursor data (0=no limit) + // greater than zero is simply a hint on how many objects to send back per "cursor batch". + // a negative number indicates a hard limit. + JSObject query; + [JSObject fieldsToReturn] + dbGetMore: + string collection; // redundant, might use for security. + int nToReturn; + int64 cursorID; + dbKillCursors=2007: + int n; + int64 cursorIDs[n]; + + Note that on Update, there is only one object, which is different + from insert where you can pass a list of objects to insert in the db. + Note that the update field layout is very similar layout to Query. +*/ + + +#pragma pack(1) + struct QueryResult : public MsgData { + long long cursorId; + int startingFrom; + int nReturned; + const char *data() { + return (char *) (((int *)&nReturned)+1); + } + int resultFlags() { + return dataAsInt(); + } + int& _resultFlags() { + return dataAsInt(); + } + void setResultFlagsToOk() { + _resultFlags() = ResultFlag_AwaitCapable; + } + void initializeResultFlags() { + _resultFlags() = 0; + } + }; + +#pragma pack() + + /* For the database/server protocol, these objects and functions encapsulate + the various messages transmitted over the connection. 
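
Typical consumer code in the request path looks roughly like this (a sketch; the Message would arrive from the network layer and the result-building step is elided):

    void handleQuery( Message& m , DbResponse& dbresponse ) {
        DbMessage d( m );
        QueryMessage q( d );   // pulls ns, ntoskip, ntoreturn, query, fields
        // ... build a result BSONObj 'res' for q.query against q.ns ...
        BSONObj res;
        replyToQuery( 0 , m , dbresponse , res );
    }
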
+
+       See http://www.mongodb.org/display/DOCS/Mongo+Wire+Protocol
+    */
+    class DbMessage {
+    public:
+        DbMessage(const Message& _m) : m(_m) , mark(0) {
+            // for received messages, Message has only one buffer
+            theEnd = _m.singleData()->_data + _m.header()->dataLen();
+            char *r = _m.singleData()->_data;
+            reserved = (int *) r;
+            data = r + 4;
+            nextjsobj = data;
+        }
+
+        /** the 32 bit field before the ns
+         * track all bit usage here as its cross op
+         * 0: InsertOption_ContinueOnError
+         * 1: fromWriteback
+         */
+        int& reservedField() { return *reserved; }
+
+        const char * getns() const {
+            return data;
+        }
+        void getns(Namespace& ns) const {
+            ns = data;
+        }
+
+        const char * afterNS() const {
+            return data + strlen( data ) + 1;
+        }
+
+        int getInt( int num ) const {
+            const int * foo = (const int*)afterNS();
+            return foo[num];
+        }
+
+        int getQueryNToReturn() const {
+            return getInt( 1 );
+        }
+
+        /**
+         * get an int64 at specified offsetBytes after ns
+         */
+        long long getInt64( int offsetBytes ) const {
+            const char * x = afterNS();
+            x += offsetBytes;
+            const long long * ll = (const long long*)x;
+            return ll[0];
+        }
+
+        void resetPull() { nextjsobj = data; }
+        int pullInt() const { return const_cast<DbMessage*>(this)->pullInt(); }
+        int& pullInt() {
+            if ( nextjsobj == data )
+                nextjsobj += strlen(data) + 1; // skip namespace
+            int& i = *((int *)nextjsobj);
+            nextjsobj += 4;
+            return i;
+        }
+        long long pullInt64() const {
+            return const_cast<DbMessage*>(this)->pullInt64();
+        }
+        long long &pullInt64() {
+            if ( nextjsobj == data )
+                nextjsobj += strlen(data) + 1; // skip namespace
+            long long &i = *((long long *)nextjsobj);
+            nextjsobj += 8;
+            return i;
+        }
+
+        OID* getOID() const {
+            return (OID *) (data + strlen(data) + 1); // skip namespace
+        }
+
+        void getQueryStuff(const char *&query, int& ntoreturn) {
+            int *i = (int *) (data + strlen(data) + 1);
+            ntoreturn = *i;
+            i++;
+            query = (const char *) i;
+        }
+
+        /* for insert and update msgs */
+        bool moreJSObjs() const {
+            return nextjsobj != 0;
+        }
+        BSONObj nextJsObj() {
+            if ( nextjsobj == data ) {
+                nextjsobj += strlen(data) + 1; // skip namespace
+                massert( 13066 , "Message contains no documents", theEnd > nextjsobj );
+            }
+            massert( 10304 , "Client Error: Remaining data too small for BSON object", theEnd - nextjsobj > 3 );
+            BSONObj js(nextjsobj);
+            massert( 10305 , "Client Error: Invalid object size", js.objsize() > 3 );
+            massert( 10306 , "Client Error: Next object larger than space left in message",
+                     js.objsize() < ( theEnd - data ) );
+            if ( cmdLine.objcheck && !js.valid() ) {
+                massert( 10307 , "Client Error: bad object in message", false);
+            }
+            nextjsobj += js.objsize();
+            if ( nextjsobj >= theEnd )
+                nextjsobj = 0;
+            return js;
+        }
+
+        const Message& msg() const { return m; }
+
+        void markSet() {
+            mark = nextjsobj;
+        }
+
+        void markReset() {
+            assert( mark );
+            nextjsobj = mark;
+        }
+
+    private:
+        const Message& m;
+        int* reserved;
+        const char *data;
+        const char *nextjsobj;
+        const char *theEnd;
+
+        const char * mark;
+
+    public:
+        enum ReservedOptions {
+            Reserved_InsertOption_ContinueOnError = 1 << 0 ,
+            Reserved_FromWriteback = 1 << 1
+        };
+    };
+
+
+    /* a request to run a query, as received by the database */
+    class QueryMessage {
+    public:
+        const char *ns;
+        int ntoskip;
+        int ntoreturn;
+        int queryOptions;
+        BSONObj query;
+        BSONObj fields;
+
+        /* parses the message into the above fields */
+        QueryMessage(DbMessage& d) {
+            ns = d.getns();
+            ntoskip = d.pullInt();
+            ntoreturn = d.pullInt();
+            query = d.nextJsObj();
+            if ( d.moreJSObjs() ) {
+                fields = d.nextJsObj();
+            }
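+            // the OP_QUERY flag bits (slaveOk, tailable, ...) are the first int32 of the message body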
queryOptions = d.msg().header()->dataAsInt(); + } + }; + + void replyToQuery(int queryResultFlags, + AbstractMessagingPort* p, Message& requestMsg, + void *data, int size, + int nReturned, int startingFrom = 0, + long long cursorId = 0 + ); + + + /* object reply helper. */ + void replyToQuery(int queryResultFlags, + AbstractMessagingPort* p, Message& requestMsg, + BSONObj& responseObj); + + /* helper to do a reply using a DbResponse object */ + void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj); + + +} // namespace mongo diff --git a/src/mongo/db/dbwebserver.cpp b/src/mongo/db/dbwebserver.cpp new file mode 100644 index 00000000000..eb19ba3be6c --- /dev/null +++ b/src/mongo/db/dbwebserver.cpp @@ -0,0 +1,539 @@ +/* dbwebserver.cpp + + This is the administrative web page displayed on port 28017. +*/ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../util/net/miniwebserver.h" +#include "../util/mongoutils/html.h" +#include "../util/md5.hpp" +#include "db.h" +#include "instance.h" +#include "security.h" +#include "stats/snapshots.h" +#include "background.h" +#include "commands.h" +#include "../util/version.h" +#include "../util/ramlog.h" +#include "pcrecpp.h" +#include "../util/admin_access.h" +#include "dbwebserver.h" +#include <boost/date_time/posix_time/posix_time.hpp> +#undef assert +#define assert MONGO_assert + +namespace mongo { + + using namespace mongoutils::html; + using namespace bson; + + time_t started = time(0); + + struct Timing { + Timing() { + start = timeLocked = 0; + } + unsigned long long start, timeLocked; + }; + + bool execCommand( Command * c , + Client& client , int queryOptions , + const char *ns, BSONObj& cmdObj , + BSONObjBuilder& result, + bool fromRepl ); + + class DbWebServer : public MiniWebServer { + public: + DbWebServer(const string& ip, int port, const AdminAccess* webUsers) + : MiniWebServer("admin web console", ip, port), _webUsers(webUsers) { + WebStatusPlugin::initAll(); + } + + private: + const AdminAccess* _webUsers; // not owned here + + void doUnlockedStuff(stringstream& ss) { + /* this is in the header already ss << "port: " << port << '\n'; */ + ss << "<pre>"; + ss << mongodVersion() << '\n'; + ss << "git hash: " << gitVersion() << '\n'; + ss << "sys info: " << sysInfo() << '\n'; + ss << "uptime: " << time(0)-started << " seconds\n"; + ss << "</pre>"; + } + + bool allowed( const char * rq , vector<string>& headers, const SockAddr &from ) { + if ( from.isLocalHost() || !_webUsers->haveAdminUsers() ) { + cmdAuthenticate.authenticate( "admin", "RestUser", false ); + return true; + } + + string auth = getHeader( rq , "Authorization" ); + + if ( auth.size() > 0 && auth.find( "Digest " ) == 0 ) { + auth = auth.substr( 7 ) + ", "; + + map<string,string> parms; + pcrecpp::StringPiece input( auth ); + + string name, val; + pcrecpp::RE re("(\\w+)=\"?(.*?)\"?, "); + while ( re.Consume( &input, &name, 
&val) ) { + parms[name] = val; + } + + BSONObj user = _webUsers->getAdminUser( parms["username"] ); + if ( ! user.isEmpty() ) { + string ha1 = user["pwd"].str(); + string ha2 = md5simpledigest( (string)"GET" + ":" + parms["uri"] ); + + stringstream r; + r << ha1 << ':' << parms["nonce"]; + if ( parms["nc"].size() && parms["cnonce"].size() && parms["qop"].size() ) { + r << ':'; + r << parms["nc"]; + r << ':'; + r << parms["cnonce"]; + r << ':'; + r << parms["qop"]; + } + r << ':'; + r << ha2; + string r1 = md5simpledigest( r.str() ); + + if ( r1 == parms["response"] ) { + cmdAuthenticate.authenticate( "admin", user["user"].str(), user[ "readOnly" ].isBoolean() && user[ "readOnly" ].boolean() ); + return true; + } + } + } + + stringstream authHeader; + authHeader + << "WWW-Authenticate: " + << "Digest realm=\"mongo\", " + << "nonce=\"abc\", " + << "algorithm=MD5, qop=\"auth\" " + ; + + headers.push_back( authHeader.str() ); + return 0; + } + + virtual void doRequest( + const char *rq, // the full request + string url, + // set these and return them: + string& responseMsg, + int& responseCode, + vector<string>& headers, // if completely empty, content-type: text/html will be added + const SockAddr &from + ) { + if ( url.size() > 1 ) { + + if ( ! allowed( rq , headers, from ) ) { + responseCode = 401; + headers.push_back( "Content-Type: text/plain;charset=utf-8" ); + responseMsg = "not allowed\n"; + return; + } + + { + BSONObj params; + const size_t pos = url.find( "?" ); + if ( pos != string::npos ) { + MiniWebServer::parseParams( params , url.substr( pos + 1 ) ); + url = url.substr(0, pos); + } + + DbWebHandler * handler = DbWebHandler::findHandler( url ); + if ( handler ) { + if ( handler->requiresREST( url ) && ! cmdLine.rest ) { + _rejectREST( responseMsg , responseCode , headers ); + } + else { + string callback = params.getStringField("jsonp"); + uassert(13453, "server not started with --jsonp", callback.empty() || cmdLine.jsonp); + + handler->handle( rq , url , params , responseMsg , responseCode , headers , from ); + + if (responseCode == 200 && !callback.empty()) { + responseMsg = callback + '(' + responseMsg + ')'; + } + } + return; + } + } + + + if ( ! cmdLine.rest ) { + _rejectREST( responseMsg , responseCode , headers ); + return; + } + + responseCode = 404; + headers.push_back( "Content-Type: text/html;charset=utf-8" ); + responseMsg = "<html><body>unknown url</body></html>\n"; + return; + } + + // generate home page + + if ( ! allowed( rq , headers, from ) ) { + responseCode = 401; + headers.push_back( "Content-Type: text/plain;charset=utf-8" ); + responseMsg = "not allowed\n"; + return; + } + + responseCode = 200; + stringstream ss; + string dbname; + { + stringstream z; + z << cmdLine.binaryName << ' ' << prettyHostName(); + dbname = z.str(); + } + ss << start(dbname) << h2(dbname); + ss << "<p><a href=\"/_commands\">List all commands</a> | \n"; + ss << "<a href=\"/_replSet\">Replica set status</a></p>\n"; + + //ss << "<a href=\"/_status\">_status</a>"; + { + const map<string, Command*> *m = Command::webCommands(); + if( m ) { + ss << + a("", + "These read-only context-less commands can be executed from the web interface. 
" + "Results are json format, unless ?text=1 is appended in which case the result is output as text " + "for easier human viewing", + "Commands") + << ": "; + for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) { + stringstream h; + i->second->help(h); + string help = h.str(); + ss << "<a href=\"/" << i->first << "?text=1\""; + if( help != "no help defined" ) + ss << " title=\"" << help << '"'; + ss << ">" << i->first << "</a> "; + } + ss << '\n'; + } + } + ss << '\n'; + /* + ss << "HTTP <a " + "title=\"click for documentation on this http interface\"" + "href=\"http://www.mongodb.org/display/DOCS/Http+Interface\">admin port</a>:" << _port << "<p>\n"; + */ + + doUnlockedStuff(ss); + + WebStatusPlugin::runAll( ss ); + + ss << "</body></html>\n"; + responseMsg = ss.str(); + headers.push_back( "Content-Type: text/html;charset=utf-8" ); + } + + void _rejectREST( string& responseMsg , int& responseCode, vector<string>& headers ) { + responseCode = 403; + stringstream ss; + ss << "REST is not enabled. use --rest to turn on.\n"; + ss << "check that port " << _port << " is secured for the network too.\n"; + responseMsg = ss.str(); + headers.push_back( "Content-Type: text/plain;charset=utf-8" ); + } + + }; + // --- + + bool prisort( const Prioritizable * a , const Prioritizable * b ) { + return a->priority() < b->priority(); + } + + // -- status framework --- + WebStatusPlugin::WebStatusPlugin( const string& secionName , double priority , const string& subheader ) + : Prioritizable(priority), _name( secionName ) , _subHeading( subheader ) { + if ( ! _plugins ) + _plugins = new vector<WebStatusPlugin*>(); + _plugins->push_back( this ); + } + + void WebStatusPlugin::initAll() { + if ( ! _plugins ) + return; + + sort( _plugins->begin(), _plugins->end() , prisort ); + + for ( unsigned i=0; i<_plugins->size(); i++ ) + (*_plugins)[i]->init(); + } + + void WebStatusPlugin::runAll( stringstream& ss ) { + if ( ! _plugins ) + return; + + for ( unsigned i=0; i<_plugins->size(); i++ ) { + WebStatusPlugin * p = (*_plugins)[i]; + ss << "<hr>\n" + << "<b>" << p->_name << "</b>"; + + ss << " " << p->_subHeading; + + ss << "<br>\n"; + + p->run(ss); + } + + } + + vector<WebStatusPlugin*> * WebStatusPlugin::_plugins = 0; + + // -- basic statuc plugins -- + + class LogPlugin : public WebStatusPlugin { + public: + LogPlugin() : WebStatusPlugin( "Log" , 100 ), _log(0) { + } + + virtual void init() { + _log = RamLog::get( "global" ); + if ( ! _log ) { + _log = new RamLog("global"); + Logstream::get().addGlobalTee( _log ); + } + } + + virtual void run( stringstream& ss ) { + _log->toHTML( ss ); + } + RamLog * _log; + }; + + LogPlugin * logPlugin = new LogPlugin(); + + // -- handler framework --- + + DbWebHandler::DbWebHandler( const string& name , double priority , bool requiresREST ) + : Prioritizable(priority), _name(name) , _requiresREST(requiresREST) { + + { + // setup strings + _defaultUrl = "/"; + _defaultUrl += name; + + stringstream ss; + ss << name << " priority: " << priority << " rest: " << requiresREST; + _toString = ss.str(); + } + + { + // add to handler list + if ( ! _handlers ) + _handlers = new vector<DbWebHandler*>(); + _handlers->push_back( this ); + sort( _handlers->begin() , _handlers->end() , prisort ); + } + } + + DbWebHandler * DbWebHandler::findHandler( const string& url ) { + if ( ! 
_handlers ) + return 0; + + for ( unsigned i=0; i<_handlers->size(); i++ ) { + DbWebHandler * h = (*_handlers)[i]; + if ( h->handles( url ) ) + return h; + } + + return 0; + } + + vector<DbWebHandler*> * DbWebHandler::_handlers = 0; + + // --- basic handlers --- + + class FavIconHandler : public DbWebHandler { + public: + FavIconHandler() : DbWebHandler( "favicon.ico" , 0 , false ) {} + + virtual void handle( const char *rq, string url, BSONObj params, + string& responseMsg, int& responseCode, + vector<string>& headers, const SockAddr &from ) { + responseCode = 404; + headers.push_back( "Content-Type: text/plain;charset=utf-8" ); + responseMsg = "no favicon\n"; + } + + } faviconHandler; + + class StatusHandler : public DbWebHandler { + public: + StatusHandler() : DbWebHandler( "_status" , 1 , false ) {} + + virtual void handle( const char *rq, string url, BSONObj params, + string& responseMsg, int& responseCode, + vector<string>& headers, const SockAddr &from ) { + headers.push_back( "Content-Type: application/json;charset=utf-8" ); + responseCode = 200; + + static vector<string> commands; + if ( commands.size() == 0 ) { + commands.push_back( "serverStatus" ); + commands.push_back( "buildinfo" ); + } + + BSONObjBuilder buf(1024); + + for ( unsigned i=0; i<commands.size(); i++ ) { + string cmd = commands[i]; + + Command * c = Command::findCommand( cmd ); + assert( c ); + assert( c->locktype() == 0 ); + + BSONObj co; + { + BSONObjBuilder b; + b.append( cmd , 1 ); + + if ( cmd == "serverStatus" && params["repl"].type() ) { + b.append( "repl" , atoi( params["repl"].valuestr() ) ); + } + + co = b.obj(); + } + + string errmsg; + + BSONObjBuilder sub; + if ( ! c->run( "admin.$cmd" , co , 0, errmsg , sub , false ) ) + buf.append( cmd , errmsg ); + else + buf.append( cmd , sub.obj() ); + } + + responseMsg = buf.obj().jsonString(); + + } + + } statusHandler; + + class CommandListHandler : public DbWebHandler { + public: + CommandListHandler() : DbWebHandler( "_commands" , 1 , true ) {} + + virtual void handle( const char *rq, string url, BSONObj params, + string& responseMsg, int& responseCode, + vector<string>& headers, const SockAddr &from ) { + headers.push_back( "Content-Type: text/html;charset=utf-8" ); + responseCode = 200; + + stringstream ss; + ss << start("Commands List"); + ss << p( a("/", "back", "Home") ); + ss << p( "<b>MongoDB List of <a href=\"http://www.mongodb.org/display/DOCS/Commands\">Commands</a></b>\n" ); + const map<string, Command*> *m = Command::commandsByBestName(); + ss << "S:slave-ok R:read-lock W:write-lock A:admin-only<br>\n"; + ss << table(); + ss << "<tr><th>Command</th><th>Attributes</th><th>Help</th></tr>\n"; + for( map<string, Command*>::const_iterator i = m->begin(); i != m->end(); i++ ) + i->second->htmlHelp(ss); + ss << _table() << _end(); + + responseMsg = ss.str(); + } + } commandListHandler; + + class CommandsHandler : public DbWebHandler { + public: + CommandsHandler() : DbWebHandler( "DUMMY COMMANDS" , 2 , true ) {} + + bool _cmd( const string& url , string& cmd , bool& text, bo params ) const { + cmd = str::after(url, '/'); + text = params["text"].boolean(); + return true; + } + + Command * _cmd( const string& cmd ) const { + const map<string,Command*> *m = Command::webCommands(); + if( ! m ) + return 0; + + map<string,Command*>::const_iterator i = m->find(cmd); + if ( i == m->end() ) + return 0; + + return i->second; + } + + virtual bool handles( const string& url ) const { + string cmd; + bool text; + if ( ! 
_cmd( url , cmd , text, bo() ) ) + return false; + return _cmd(cmd) != 0; + } + + virtual void handle( const char *rq, string url, BSONObj params, + string& responseMsg, int& responseCode, + vector<string>& headers, const SockAddr &from ) { + string cmd; + bool text = false; + assert( _cmd( url , cmd , text, params ) ); + Command * c = _cmd( cmd ); + assert( c ); + + BSONObj cmdObj = BSON( cmd << 1 ); + Client& client = cc(); + + BSONObjBuilder result; + execCommand(c, client, 0, "admin.", cmdObj , result, false); + + responseCode = 200; + + string j = result.done().jsonString(Strict, text ); + responseMsg = j; + + if( text ) { + headers.push_back( "Content-Type: text/plain;charset=utf-8" ); + responseMsg += '\n'; + } + else { + headers.push_back( "Content-Type: application/json;charset=utf-8" ); + } + + } + + } commandsHandler; + + // --- external ---- + + void webServerThread(const AdminAccess* adminAccess) { + boost::scoped_ptr<const AdminAccess> adminAccessPtr(adminAccess); // adminAccess is owned here + Client::initThread("websvr"); + const int p = cmdLine.port + 1000; + DbWebServer mini(cmdLine.bind_ip, p, adminAccessPtr.get()); + mini.initAndListen(); + cc().shutdown(); + } + +} // namespace mongo diff --git a/src/mongo/db/dbwebserver.h b/src/mongo/db/dbwebserver.h new file mode 100644 index 00000000000..bdbcba2c07d --- /dev/null +++ b/src/mongo/db/dbwebserver.h @@ -0,0 +1,85 @@ +/** @file dbwebserver.h + */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + + #include "../util/admin_access.h" + + namespace mongo { + + class Prioritizable { + public: + Prioritizable( double p ) : _priority(p) {} + double priority() const { return _priority; } + private: + double _priority; + }; + + class DbWebHandler : public Prioritizable { + public: + DbWebHandler( const string& name , double priority , bool requiresREST ); + virtual ~DbWebHandler() {} + + virtual bool handles( const string& url ) const { return url == _defaultUrl; } + + virtual bool requiresREST( const string& url ) const { return _requiresREST; } + + virtual void handle( const char *rq, // the full request + string url, + BSONObj params, + // set these and return them: + string& responseMsg, + int& responseCode, + vector<string>& headers, // if completely empty, content-type: text/html will be added + const SockAddr &from + ) = 0; + + string toString() const { return _toString; } + static DbWebHandler * findHandler( const string& url ); + + private: + string _name; + bool _requiresREST; + + string _defaultUrl; + string _toString; + + static vector<DbWebHandler*> * _handlers; + }; + + class WebStatusPlugin : public Prioritizable { + public: + WebStatusPlugin( const string& sectionName , double priority , const string& subheader = "" ); + virtual ~WebStatusPlugin() {} + + virtual void run( stringstream& ss ) = 0; + /** called when web server starts up */ + virtual void init() = 0; + + static void initAll(); + static void runAll( stringstream& ss ); + private: + string _name; + string _subHeading; + static vector<WebStatusPlugin*> * _plugins; + + }; + + void webServerThread( const AdminAccess* admins ); + string prettyHostName(); + +}; diff --git a/src/mongo/db/diskloc.h b/src/mongo/db/diskloc.h new file mode 100644 index 00000000000..5295df3e260 --- /dev/null +++ b/src/mongo/db/diskloc.h @@ -0,0 +1,160 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* @file diskloc.h + + Storage subsystem management. + Lays out our datafiles on disk, manages disk space. +*/ + +#pragma once + +#include "jsobj.h" + +namespace mongo { + + class Record; + class DeletedRecord; + class Extent; + class MongoDataFile; + class DiskLoc; + + template< class Version > class BtreeBucket; + +#pragma pack(1) + /** represents a disk location/offset on disk in a database. 64 bits. + it is assumed these will be passed around by value a lot so don't do anything to make them large + (such as adding a virtual function) + */ + class DiskLoc { + int _a; // this will be volume, file #, etc. but is a logical value and could be anything depending on storage engine + int ofs; + + public: + + enum SentinelValues { + /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is.
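+ an editorial illustration, not part of the original header: Null() below sets
+ _a = -1 and ofs = 0, so DiskLoc().isNull() is true, while NullOfs (-1) is a
+ sentinel for bare offset values used elsewhere in the code. likewise
+ DiskLoc(4, 0x1800).toString() yields "4:1800", since toString below streams
+ both fields in hex.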
*/ + NullOfs = -1, + MaxFiles=16000 // thus a limit of about 32TB of data per db + }; + + DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { } + DiskLoc() { Null(); } + DiskLoc(const DiskLoc& l) { + _a=l._a; + ofs=l.ofs; + } + + bool questionable() const { + return ofs < -1 || + _a < -1 || + _a > 524288; + } + + bool isNull() const { return _a == -1; } + void Null() { + _a = -1; + ofs = 0; /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */ + } + void assertOk() { assert(!isNull()); } + void setInvalid() { + _a = -2; + ofs = 0; + } + bool isValid() const { return _a != -2; } + + string toString() const { + if ( isNull() ) + return "null"; + stringstream ss; + ss << hex << _a << ':' << ofs; + return ss.str(); + } + + BSONObj toBSONObj() const { return BSON( "file" << _a << "offset" << ofs ); } + + int a() const { return _a; } + + int& GETOFS() { return ofs; } + int getOfs() const { return ofs; } + void set(int a, int b) { + _a=a; + ofs=b; + } + + void inc(int amt) { + assert( !isNull() ); + ofs += amt; + } + + bool sameFile(DiskLoc b) { + return _a== b._a; + } + + bool operator==(const DiskLoc& b) const { + return _a==b._a&& ofs == b.ofs; + } + bool operator!=(const DiskLoc& b) const { + return !(*this==b); + } + const DiskLoc& operator=(const DiskLoc& b) { + _a=b._a; + ofs = b.ofs; + //assert(ofs!=0); + return *this; + } + int compare(const DiskLoc& b) const { + int x = _a - b._a; + if ( x ) + return x; + return ofs - b.ofs; + } + bool operator<(const DiskLoc& b) const { + return compare(b) < 0; + } + + /** + * Marks this disk loc for writing + * @returns a non const reference to this disk loc + * This function explicitly signals we are writing and casts away const + */ + DiskLoc& writing() const; // see dur.h + + /* Get the "thing" associated with this disk location. + it is assumed the object is what you say it is -- you must assure that + (think of this as an unchecked type cast) + Note: set your Context first so that the database to which the diskloc applies is known. + */ + BSONObj obj() const; + Record* rec() const; + DeletedRecord* drec() const; + Extent* ext() const; + + template< class V > + const BtreeBucket<V> * btree() const; + + // Explicitly signals we are writing and casts away const + template< class V > + BtreeBucket<V> * btreemod() const; + + /*MongoDataFile& pdf() const;*/ + }; +#pragma pack() + + const DiskLoc minDiskLoc(0, 1); + const DiskLoc maxDiskLoc(0x7fffffff, 0x7fffffff); + +} // namespace mongo diff --git a/src/mongo/db/driverHelpers.cpp b/src/mongo/db/driverHelpers.cpp new file mode 100644 index 00000000000..12aa01886c4 --- /dev/null +++ b/src/mongo/db/driverHelpers.cpp @@ -0,0 +1,62 @@ +// driverHelpers.cpp + +/** + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +/** + this file has dbcommands that are for drivers + mostly helpers +*/ + + +#include "pch.h" +#include "jsobj.h" +#include "pdfile.h" +#include "namespace-inl.h" +#include "commands.h" +#include "cmdline.h" +#include "btree.h" +#include "curop-inl.h" +#include "../util/background.h" +#include "../scripting/engine.h" + +namespace mongo { + + class BasicDriverHelper : public Command { + public: + BasicDriverHelper( const char * name ) : Command( name ) {} + + virtual LockType locktype() const { return NONE; } + virtual bool slaveOk() const { return true; } + virtual bool slaveOverrideOk() { return true; } + }; + + class ObjectIdTest : public BasicDriverHelper { + public: + ObjectIdTest() : BasicDriverHelper( "driverOIDTest" ) {} + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( cmdObj.firstElement().type() != jstOID ) { + errmsg = "not oid"; + return false; + } + + const OID& oid = cmdObj.firstElement().__oid(); + result.append( "oid" , oid ); + result.append( "str" , oid.str() ); + + return true; + } + } driverObjectIdTest; +} diff --git a/src/mongo/db/dur.cpp b/src/mongo/db/dur.cpp new file mode 100644 index 00000000000..822fa5232c0 --- /dev/null +++ b/src/mongo/db/dur.cpp @@ -0,0 +1,840 @@ +// @file dur.cpp durability in the storage engine (crash-safeness / journaling) + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* + phases: + + PREPLOGBUFFER + we will build an output buffer ourself and then use O_DIRECT + we could be in read lock for this + for very large objects write directly to redo log in situ? + WRITETOJOURNAL + we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity + have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that). + for now (1.7.5/1.8.0) we are in read lock which is not ideal. + WRITETODATAFILES + apply the writes back to the non-private MMF after they are for certain in redo log + REMAPPRIVATEVIEW + we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real + remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want + to be too frequent. + there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will + be required. so doing these remaps fractionally is helpful. 
+ + mutexes: + + READLOCK dbMutex + LOCK groupCommitMutex + PREPLOGBUFFER() + READLOCK mmmutex + commitJob.reset() + UNLOCK dbMutex // now other threads can write + WRITETOJOURNAL() + WRITETODATAFILES() + UNLOCK mmmutex + UNLOCK groupCommitMutex + + on the next write lock acquisition for dbMutex: // see MongoMutex::_acquiredWriteLock() + REMAPPRIVATEVIEW() + + @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc +*/ + +#include "pch.h" +#include "cmdline.h" +#include "client.h" +#include "dur.h" +#include "dur_journal.h" +#include "dur_commitjob.h" +#include "dur_recover.h" +#include "dur_stats.h" +#include "../util/concurrency/race.h" +#include "../util/mongoutils/hash.h" +#include "../util/mongoutils/str.h" +#include "../util/timer.h" + +using namespace mongoutils; + +namespace mongo { + + namespace dur { + + void PREPLOGBUFFER(JSectHeader& outParm); + void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed); + void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed); + + /** declared later in this file + only used in this file -- use DurableInterface::commitNow() outside + */ + static void groupCommit(); + + CommitJob& commitJob = *(new CommitJob()); // don't destroy + + Stats stats; + + void Stats::S::reset() { + memset(this, 0, sizeof(*this)); + } + + Stats::Stats() { + _a.reset(); + _b.reset(); + curr = &_a; + _intervalMicros = 3000000; + } + + Stats::S * Stats::other() { + return curr == &_a ? &_b : &_a; + } + string _CSVHeader(); + + string Stats::S::_CSVHeader() { + return "cmts jrnMB\twrDFMB\tcIWLk\tearly\tprpLgB wrToJ\twrToDF\trmpPrVw"; + } + + string Stats::S::_asCSV() { + stringstream ss; + ss << + setprecision(2) << + _commits << '\t' << fixed << + _journaledBytes / 1000000.0 << '\t' << + _writeToDataFilesBytes / 1000000.0 << '\t' << + _commitsInWriteLock << '\t' << + _earlyCommits << '\t' << + (unsigned) (_prepLogBufferMicros/1000) << '\t' << + (unsigned) (_writeToJournalMicros/1000) << '\t' << + (unsigned) (_writeToDataFilesMicros/1000) << '\t' << + (unsigned) (_remapPrivateViewMicros/1000); + return ss.str(); + } + + //int getAgeOutJournalFiles(); + BSONObj Stats::S::_asObj() { + BSONObjBuilder b; + b << + "commits" << _commits << + "journaledMB" << _journaledBytes / 1000000.0 << + "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 << + "compression" << _journaledBytes / (_uncompressedBytes+1.0) << + "commitsInWriteLock" << _commitsInWriteLock << + "earlyCommits" << _earlyCommits << + "timeMs" << + BSON( "dt" << _dtMillis << + "prepLogBuffer" << (unsigned) (_prepLogBufferMicros/1000) << + "writeToJournal" << (unsigned) (_writeToJournalMicros/1000) << + "writeToDataFiles" << (unsigned) (_writeToDataFilesMicros/1000) << + "remapPrivateView" << (unsigned) (_remapPrivateViewMicros/1000) + ); + /*int r = getAgeOutJournalFiles(); + if( r == -1 ) + b << "ageOutJournalFiles" << "mutex timeout"; + if( r == 0 ) + b << "ageOutJournalFiles" << false;*/ + if( cmdLine.journalCommitInterval != 0 ) + b << "journalCommitIntervalMs" << cmdLine.journalCommitInterval; + return b.obj(); + } + + BSONObj Stats::asObj() { + return other()->_asObj(); + } + + void Stats::rotate() { + unsigned long long now = curTimeMicros64(); + unsigned long long dt = now - _lastRotate; + if( dt >= _intervalMicros && _intervalMicros ) { + // rotate + curr->_dtMillis = (unsigned) (dt/1000); + _lastRotate = now; + curr = other(); + curr->reset(); + } + } + + void NonDurableImpl::setNoJournal(void *dst, void *src, unsigned len) { + memcpy(dst, 
src, len); + } + + void DurableImpl::setNoJournal(void *dst, void *src, unsigned len) { + // we are at least read locked, so we need not worry about REMAPPRIVATEVIEW herein. + DEV d.dbMutex.assertAtLeastReadLocked(); + + MemoryMappedFile::makeWritable(dst, len); + + // we enter the RecoveryJob mutex here, so that if WRITETODATAFILES is happening we do not + // conflict with it + scoped_lock lk1( RecoveryJob::get()._mx ); + + // we stay in this mutex for everything to work with DurParanoid/validateSingleMapMatches + // + // either of these mutexes also makes setNoJournal threadsafe, which is good as we call it from a read + // (not a write) lock in class SlaveTracking + // + scoped_lock lk( privateViews._mutex() ); + + size_t ofs; + MongoMMF *f = privateViews.find_inlock(dst, ofs); + assert(f); + void *w = (((char *)f->view_write())+ofs); + // first write it to the writable (file) view + memcpy(w, src, len); + if( memcmp(w, dst, len) ) { + // if we get here, a copy-on-write had previously occurred. so write it to the private view too + // to keep them in sync. we do this as we do not want to cause a copy on write unnecessarily. + memcpy(dst, src, len); + } + } + + /** base declare write intent function that all the helpers call. */ + void DurableImpl::declareWriteIntent(void *p, unsigned len) { + commitJob.note(p, len); + } + + static DurableImpl* durableImpl = new DurableImpl(); + static NonDurableImpl* nonDurableImpl = new NonDurableImpl(); + DurableInterface* DurableInterface::_impl = nonDurableImpl; + + void DurableInterface::enableDurability() { + assert(_impl == nonDurableImpl); + _impl = durableImpl; + } + + void DurableInterface::disableDurability() { + assert(_impl == durableImpl); + massert(13616, "can't disable durability with pending writes", !commitJob.hasWritten()); + _impl = nonDurableImpl; + } + + bool DurableImpl::commitNow() { + stats.curr->_earlyCommits++; + groupCommit(); + return true; + } + + bool DurableImpl::awaitCommit() { + commitJob._notify.awaitBeyondNow(); + return true; + } + + /** Declare that a file has been created + Normally writes are applied only after journaling, for safety. But here the file + is created first, and the journal will just replay the creation if the create didn't + happen because of crashing. + */ + void DurableImpl::createdFile(string filename, unsigned long long len) { + shared_ptr<DurOp> op( new FileCreatedOp(filename, len) ); + commitJob.noteOp(op); + } + + void* DurableImpl::writingPtr(void *x, unsigned len) { + void *p = x; + declareWriteIntent(p, len); + return p; + } + + /** declare intent to write + @param ofs offset within buf at which we will write + @param len the length at ofs we will write + @return new buffer pointer. 
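+ an editorial usage sketch (rec and newVal are hypothetical, not from this
+ file): a caller journaling an 8 byte update at offset 16 of a mapped record
+ could do
+ char *p = (char *) getDur().writingAtOffset(rec, 16, 8);
+ memcpy(p + 16, &newVal, 8);
+ the pointer returned equals buf in this implementation, but callers should
+ still use it so other implementations can substitute a different view.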
+ */ + void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) { + char *p = (char *) buf; + declareWriteIntent(p+ofs, len); + return p; + } + + void* DurableImpl::writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) { + char *p = (char *) buf; + for( vector< pair< long long, unsigned > >::const_iterator i = ranges.begin(); + i != ranges.end(); ++i ) { + declareWriteIntent( p + i->first, i->second ); + } + return p; + } + + bool DurableImpl::aCommitIsNeeded() const { + DEV commitJob._nSinceCommitIfNeededCall = 0; + return commitJob.bytes() > UncommittedBytesLimit; + } + + bool DurableImpl::commitIfNeeded() { + if ( !d.dbMutex.isWriteLocked() ) + return false; + + DEV commitJob._nSinceCommitIfNeededCall = 0; + if (commitJob.bytes() > UncommittedBytesLimit) { // should this also fire if CmdLine::DurAlwaysCommit? + stats.curr->_earlyCommits++; + groupCommit(); + return true; + } + return false; + } + + /** Used in _DEBUG builds to check that we didn't overwrite the last intent + that was declared. called just before writelock release. we check a few + bytes after the declared region to see if they changed. + + @see MongoMutex::_releasedWriteLock + + SLOW + */ +#if 0 + void DurableImpl::debugCheckLastDeclaredWrite() { + static int n; + ++n; + + assert(debug && cmdLine.dur); + if (commitJob.writes().empty()) + return; + const WriteIntent &i = commitJob.lastWrite(); + size_t ofs; + MongoMMF *mmf = privateViews.find(i.start(), ofs); + if( mmf == 0 ) + return; + size_t past = ofs + i.length(); + if( mmf->length() < past + 8 ) + return; // too close to end of view + char *priv = (char *) mmf->getView(); + char *writ = (char *) mmf->view_write(); + unsigned long long *a = (unsigned long long *) (priv+past); + unsigned long long *b = (unsigned long long *) (writ+past); + if( *a != *b ) { + for( set<WriteIntent>::iterator it(commitJob.writes().begin()), end((commitJob.writes().begin())); it != end; ++it ) { + const WriteIntent& wi = *it; + char *r1 = (char*) wi.start(); + char *r2 = (char*) wi.end(); + if( r1 <= (((char*)a)+8) && r2 > (char*)a ) { + //log() << "it's ok " << wi.p << ' ' << wi.len << endl; + return; + } + } + log() << "journal data after write area " << i.start() << " does not agree" << endl; + log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl; + log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl; + log() << " n: " << n << endl; + log() << endl; + } + } +#endif + + // Functor to be called over all MongoFiles + + class validateSingleMapMatches { + public: + validateSingleMapMatches(unsigned long long& bytes) :_bytes(bytes) {} + void operator () (MongoFile *mf) { + if( mf->isMongoMMF() ) { + MongoMMF *mmf = (MongoMMF*) mf; + const unsigned char *p = (const unsigned char *) mmf->getView(); + const unsigned char *w = (const unsigned char *) mmf->view_write(); + + if (!p || !w) return; // File not fully opened yet + + _bytes += mmf->length(); + + assert( mmf->length() == (unsigned) mmf->length() ); + { + scoped_lock lk( privateViews._mutex() ); // see setNoJournal + if (memcmp(p, w, (unsigned) mmf->length()) == 0) + return; // next file + } + + unsigned low = 0xffffffff; + unsigned high = 0; + log() << "DurParanoid mismatch in " << mmf->filename() << endl; + int logged = 0; + unsigned lastMismatch = 0xffffffff; + for( unsigned i = 0; i < mmf->length(); i++ ) { + if( p[i] != w[i] ) { + if( lastMismatch != 0xffffffff && lastMismatch+1 != i ) + log() << endl; // separate blocks of mismatches + 
lastMismatch= i; + if( ++logged < 60 ) { + if( logged == 1 ) + log() << "ofs % 628 = 0x" << hex << (i%628) << endl; // for .ns files to find offset in record + stringstream ss; + ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned) w[i] << "\tprivmap:" << setw(2) << (unsigned) p[i]; + if( p[i] > 32 && p[i] <= 126 ) + ss << '\t' << p[i]; + log() << ss.str() << endl; + } + if( logged == 60 ) + log() << "..." << endl; + if( i < low ) low = i; + if( i > high ) high = i; + } + } + if( low != 0xffffffff ) { + std::stringstream ss; + ss << "journal error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1; + log() << ss.str() << endl; + log() << "priv loc: " << (void*)(p+low) << ' ' << endl; + set<WriteIntent>& b = commitJob.writes(); + (void)b; // mark as unused. Useful for inspection in debugger + + // should we abort() here so this isn't unnoticed in some circumstances? + massert(13599, "Written data does not match in-memory view. Missing WriteIntent?", false); + } + } + } + private: + unsigned long long& _bytes; + }; + + /** (SLOW) diagnostic to check that the private view and the non-private view are in sync. + */ + void debugValidateAllMapsMatch() { + if( ! (cmdLine.durOptions & CmdLine::DurParanoid) ) + return; + + unsigned long long bytes = 0; + Timer t; + MongoFile::forEach(validateSingleMapMatches(bytes)); + OCCASIONALLY log() << "DurParanoid map check " << t.millis() << "ms for " << (bytes / (1024*1024)) << "MB" << endl; + } + + extern size_t privateMapBytes; + + static void _REMAPPRIVATEVIEW() { + // todo: Consider using ProcessInfo herein and watching for getResidentSize to drop. that could be a way + // to assure very good behavior here. + + static unsigned startAt; + static unsigned long long lastRemap; + + LOG(4) << "journal REMAPPRIVATEVIEW" << endl; + + d.dbMutex.assertWriteLocked(); + d.dbMutex._remapPrivateViewRequested = false; + assert( !commitJob.hasWritten() ); + + // we want to remap all private views about every 2 seconds. there could be ~1000 views so + // we do a little each pass; beyond the remap time, more significantly, there will be copy on write + // faults after remapping, so doing a little bit at a time will avoid big load spikes on + // remapping. 
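+ // an editorial worked example (not in the original source): with the 2
+ // second target below, a pass running 500ms after the previous one gets
+ // fraction = 0.25, so with sz == 1000 mapped files ntodo == 250 are
+ // remapped, and startAt advances so the next pass resumes where this one
+ // stopped.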
+ unsigned long long now = curTimeMicros64(); + double fraction = (now-lastRemap)/2000000.0; + if( cmdLine.durOptions & CmdLine::DurAlwaysRemap ) + fraction = 1; + lastRemap = now; + + LockMongoFilesShared lk; + set<MongoFile*>& files = MongoFile::getAllFiles(); + unsigned sz = files.size(); + if( sz == 0 ) + return; + + { + // be careful not to use too much memory if the write rate is + // extremely high + double f = privateMapBytes / ((double)UncommittedBytesLimit); + if( f > fraction ) { + fraction = f; + } + privateMapBytes = 0; + } + + unsigned ntodo = (unsigned) (sz * fraction); + if( ntodo < 1 ) ntodo = 1; + if( ntodo > sz ) ntodo = sz; + + const set<MongoFile*>::iterator b = files.begin(); + const set<MongoFile*>::iterator e = files.end(); + set<MongoFile*>::iterator i = b; + // skip to our starting position + for( unsigned x = 0; x < startAt; x++ ) { + i++; + if( i == e ) i = b; + } + unsigned startedAt = startAt; + startAt = (startAt + ntodo) % sz; // mark where to start next time + + Timer t; + for( unsigned x = 0; x < ntodo; x++ ) { + dassert( i != e ); + if( (*i)->isMongoMMF() ) { + MongoMMF *mmf = (MongoMMF*) *i; + assert(mmf); + if( mmf->willNeedRemap() ) { + mmf->willNeedRemap() = false; + mmf->remapThePrivateView(); + } + i++; + if( i == e ) i = b; + } + } + LOG(2) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' << t.millis() << "ms" << endl; + } + + /** We need to remap the private views periodically. otherwise they would become very large. + Call within write lock. See top of file for more commentary. + */ + void REMAPPRIVATEVIEW() { + Timer t; + _REMAPPRIVATEVIEW(); + stats.curr->_remapPrivateViewMicros += t.micros(); + } + + // lock order: dbMutex first, then this + mutex groupCommitMutex("groupCommit"); + + bool _groupCommitWithLimitedLocks() { + + int p = 0; + LOG(4) << "groupcommitll " << p++ << endl; + + scoped_ptr<ExcludeAllWrites> lk1( new ExcludeAllWrites() ); + + LOG(4) << "groupcommitll " << p++ << endl; + + scoped_lock lk2(groupCommitMutex); + + LOG(4) << "groupcommitll " << p++ << endl; + + commitJob.beginCommit(); + + if( !commitJob.hasWritten() ) { + // getlasterror request could have come after the data was already committed + commitJob.notifyCommitted(); + return true; + } + + LOG(4) << "groupcommitll " << p++ << endl; + + JSectHeader h; + PREPLOGBUFFER(h); // need to be in readlock (writes excluded) for this + + LOG(4) << "groupcommitll " << p++ << endl; + + LockMongoFilesShared lk3; + + LOG(4) << "groupcommitll " << p++ << endl; + + unsigned abLen = commitJob._ab.len(); + commitJob.reset(); // must be reset before allowing anyone to write + DEV assert( !commitJob.hasWritten() ); + + LOG(4) << "groupcommitll " << p++ << endl; + + // release the readlock -- allowing others to now write while we are writing to the journal (etc.) + lk1.reset(); + + LOG(4) << "groupcommitll " << p++ << endl; + + // ****** now other threads can do writes ****** + + WRITETOJOURNAL(h, commitJob._ab); + assert( abLen == commitJob._ab.len() ); // a check that no one touched the builder while we were doing work. if so, our locking is wrong. + + LOG(4) << "groupcommitll " << p++ << endl; + + // data is now in the journal, which is sufficient for acknowledging getLastError.
+ // (ok to crash after that) + commitJob.notifyCommitted(); + + LOG(4) << "groupcommitll " << p++ << " WRITETODATAFILES()" << endl; + + WRITETODATAFILES(h, commitJob._ab); + assert( abLen == commitJob._ab.len() ); // check again wasn't modded + commitJob._ab.reset(); + + LOG(4) << "groupcommitll " << p++ << endl; + + // can't : d.dbMutex._remapPrivateViewRequested = true; + + return true; + } + + /** @return true if committed; false if lock acquisition timed out (we only try for a read lock herein and only wait for a certain duration). */ + bool groupCommitWithLimitedLocks() { + try { + return _groupCommitWithLimitedLocks(); + } + catch(DBException& e ) { + log() << "dbexception in groupCommitLL causing immediate shutdown: " << e.toString() << endl; + mongoAbort("dur1"); + } + catch(std::ios_base::failure& e) { + log() << "ios_base exception in groupCommitLL causing immediate shutdown: " << e.what() << endl; + mongoAbort("dur2"); + } + catch(std::bad_alloc& e) { + log() << "bad_alloc exception in groupCommitLL causing immediate shutdown: " << e.what() << endl; + mongoAbort("dur3"); + } + catch(std::exception& e) { + log() << "exception in dur::groupCommitLL causing immediate shutdown: " << e.what() << endl; + mongoAbort("dur4"); + } + return false; + } + + static void _groupCommit() { + + LOG(4) << "_groupCommit " << endl; + + // we need to be at least read locked on the dbMutex so that we know the write intent data + // structures are not changing while we work + d.dbMutex.assertAtLeastReadLocked(); + + commitJob.beginCommit(); + + if( !commitJob.hasWritten() ) { + // getlasterror request could have come after the data was already committed + commitJob.notifyCommitted(); + return; + } + + // we need to make sure two group commits aren't running at the same time + // (and we are only read locked in the dbMutex, so it could happen) + scoped_lock lk(groupCommitMutex); + + JSectHeader h; + PREPLOGBUFFER(h); + + // todo : write to the journal outside locks, as this write can be slow. + // however, be careful then about remapprivateview as that cannot be done + // if new writes are then pending in the private maps. + WRITETOJOURNAL(h, commitJob._ab); + + // data is now in the journal, which is sufficient for acknowledging getLastError. + // (ok to crash after that) + commitJob.notifyCommitted(); + + WRITETODATAFILES(h, commitJob._ab); + debugValidateAllMapsMatch(); + + commitJob.reset(); + commitJob._ab.reset(); + + // REMAPPRIVATEVIEW + // + // remapping private views must occur after WRITETODATAFILES otherwise + // we wouldn't see newly written data on reads. + // + DEV assert( !commitJob.hasWritten() ); + if( !d.dbMutex.isWriteLocked() ) { + // this needs to be done in a write lock (as there is a short window during remapping when each view + // might not exist) thus we do it on the next acquisition of that instead of here (there is no + // rush if you aren't writing anyway -- but it must happen, if it is done, before any uncommitted + // writes occur). If desired, perhaps this can be eliminated on posix as it may be that the remap + // is race-free there. + // + d.dbMutex._remapPrivateViewRequested = true; + } + else { + stats.curr->_commitsInWriteLock++; + // however, if we are already write locked, we must do it now -- up the call tree someone + // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls + // this method when a file (and its views) is about to go away.
+ // + REMAPPRIVATEVIEW(); + } + } + + /** locking: in read lock when called + or, for early commits (commitIfNeeded), in write lock + @see MongoMMF::close() + */ + static void groupCommit() { + try { + _groupCommit(); + } + catch(DBException& e ) { + log() << "dbexception in groupCommit causing immediate shutdown: " << e.toString() << endl; + mongoAbort("gc1"); + } + catch(std::ios_base::failure& e) { + log() << "ios_base exception in groupCommit causing immediate shutdown: " << e.what() << endl; + mongoAbort("gc2"); + } + catch(std::bad_alloc& e) { + log() << "bad_alloc exception in groupCommit causing immediate shutdown: " << e.what() << endl; + mongoAbort("gc3"); + } + catch(std::exception& e) { + log() << "exception in dur::groupCommit causing immediate shutdown: " << e.what() << endl; + mongoAbort("gc4"); + } + LOG(4) << "groupCommit end" << endl; + } + + static void go() { + const int N = 10; + static int n; + if( privateMapBytes < UncommittedBytesLimit && ++n % N && (cmdLine.durOptions&CmdLine::DurAlwaysRemap)==0 ) { + // limited locks version doesn't do any remapprivateview at all, so only try this if privateMapBytes + // is in an acceptable range. also every Nth commit, we do everything so we can do some remapping; + // remapping a lot all at once could cause jitter from a large amount of copy-on-writes all at once. + if( groupCommitWithLimitedLocks() ) + return; + } + else { + readlocktry lk("", 1000); + if( lk.got() ) { + groupCommit(); + return; + } + } + + // starvation on read locks could occur. so if read lock acquisition is slow, try to get a + // write lock instead. otherwise journaling could be delayed too long (too much data will + // not accumulate though, as commitIfNeeded logic will have executed in the meantime if there + // has been writes) + writelock lk; + groupCommit(); + } + + /** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its + views disappear + */ + void closingFileNotification() { + if (!cmdLine.dur) + return; + + if( d.dbMutex.atLeastReadLocked() ) { + groupCommit(); + } + else { + assert( inShutdown() ); + if( commitJob.hasWritten() ) { + log() << "journal warning files are closing outside locks with writes pending" << endl; + } + } + } + + extern int groupCommitIntervalMs; + boost::filesystem::path getJournalDir(); + + void durThread() { + Client::initThread("journal"); + + bool samePartition = true; + try { + const string dbpathDir = boost::filesystem::path(dbpath).native_directory_string(); + samePartition = onSamePartition(getJournalDir().string(), dbpathDir); + } + catch(...) { + } + + while( !inShutdown() ) { + RACECHECK + + unsigned ms = cmdLine.journalCommitInterval; + if( ms == 0 ) { + // use default + ms = samePartition ? 100 : 30; + } + + unsigned oneThird = (ms / 3) + 1; // +1 so never zero + + try { + stats.rotate(); + + // we do this in a couple blocks (the invoke()), which makes it a tiny bit faster (only a little) on throughput, + // but is likely also less spiky on our cpu usage, which is good. 
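+ // an editorial illustration (not in the original source): with the
+ // defaulted journalCommitInterval of 100ms (same partition case above),
+ // oneThird is 34ms, so the loop below sleeps up to three times for 34ms
+ // each, draining deferred write intents between sleeps, before go() runs
+ // the group commit.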
+ + // commit sooner if one or more getLastError j:true is pending + sleepmillis(oneThird); + for( unsigned i = 1; i <= 2; i++ ) { + if( commitJob._notify.nWaiting() ) + break; + commitJob.wi()._deferred.invoke(); + sleepmillis(oneThird); + } + + go(); + } + catch(std::exception& e) { + log() << "exception in durThread causing immediate shutdown: " << e.what() << endl; + mongoAbort("exception in durThread"); + } + } + cc().shutdown(); + } + + void recover(); + + unsigned notesThisLock = 0; + + void releasingWriteLock() { + DEV notesThisLock = 0; + // implicit commitIfNeeded check on each write unlock + DEV commitJob._nSinceCommitIfNeededCall = 0; // implicit commit if needed + if( commitJob.bytes() > UncommittedBytesLimit || cmdLine.durOptions & CmdLine::DurAlwaysCommit ) { + stats.curr->_earlyCommits++; + groupCommit(); + } + } + + void preallocateFiles(); + + /** at startup, recover, and then start the journal threads */ + void startup() { + if( !cmdLine.dur ) + return; + +#if defined(_DURABLEDEFAULTON) + DEV { + if( time(0) & 1 ) { + cmdLine.durOptions |= CmdLine::DurAlwaysCommit; + log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysCommit mode for this run" << endl; + } + if( time(0) & 2 ) { + cmdLine.durOptions |= CmdLine::DurAlwaysRemap; + log() << "_DEBUG _DURABLEDEFAULTON : forcing DurAlwaysRemap mode for this run" << endl; + } + } +#endif + + DurableInterface::enableDurability(); + + journalMakeDir(); + try { + recover(); + } + catch(...) { + log() << "exception during recovery" << endl; + throw; + } + + preallocateFiles(); + + boost::thread t(durThread); + } + + void DurableImpl::syncDataAndTruncateJournal() { + d.dbMutex.assertWriteLocked(); + + // a commit from the commit thread won't begin while we are in the write lock, + // but it may already be in progress and the end of that work is done outside + // (dbMutex) locks. This line waits for that to complete if already underway. + { + scoped_lock lk(groupCommitMutex); + } + + groupCommit(); + MongoFile::flushAll(true); + journalCleanup(); + + assert(!haveJournalFiles()); // Double check post-conditions + } + + } // namespace dur + +} // namespace mongo diff --git a/src/mongo/db/dur.h b/src/mongo/db/dur.h new file mode 100644 index 00000000000..f06ff500195 --- /dev/null +++ b/src/mongo/db/dur.h @@ -0,0 +1,209 @@ +// @file dur.h durability support + +#pragma once + +#include "diskloc.h" +#include "mongommf.h" + +namespace mongo { + + class NamespaceDetails; + + void mongoAbort(const char *msg); + void abort(); // not defined -- use mongoAbort() instead + + namespace dur { + + // a smaller limit is likely better on 32 bit +#if defined(__i386__) || defined(_M_IX86) + const unsigned UncommittedBytesLimit = 50 * 1024 * 1024; +#else + const unsigned UncommittedBytesLimit = 100 * 1024 * 1024; +#endif + + /** Call during startup so durability module can initialize + Throws if fatal error + Does nothing if cmdLine.dur is false + */ + void startup(); + + class DurableInterface : boost::noncopyable { + public: + virtual ~DurableInterface() { log() << "ERROR warning ~DurableInterface not intended to be called" << endl; } + + /** Declare that a file has been created + Normally writes are applied only after journaling, for safety. But here the file + is created first, and the journal will just replay the creation if the create didn't + happen because of crashing. + */ + virtual void createdFile(string filename, unsigned long long len) = 0; + + /** Declarations of write intent. 
+ + Use these methods to declare "i'm about to write to x and it should be logged for redo." + + Failure to call writing...() is checked in _DEBUG mode by using a read only mapped view + (i.e., you'll segfault if the code is covered in that situation). The _DEBUG check doesn't + verify that your length is correct though. + */ + + /** declare intent to write to x for up to len + @return pointer where to write. this is modified when testIntent is true. + */ + virtual void* writingPtr(void *x, unsigned len) = 0; + + /** declare write intent; should already be in the write view to work correctly when testIntent is true. + if you aren't, use writingPtr() instead. + */ + virtual void declareWriteIntent(void *x, unsigned len) = 0; + + /** declare intent to write + @param ofs offset within buf at which we will write + @param len the length at ofs we will write + @return new buffer pointer. this is modified when testIntent is true. + */ + virtual void* writingAtOffset(void *buf, unsigned ofs, unsigned len) = 0; + + /** declare intent to write + @param ranges vector of pairs representing ranges. Each pair + comprises an offset from buf where a range begins, then the + range length. + @return new buffer pointer. this is modified when testIntent is true. + */ + virtual void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges ) = 0; + + /** Wait for acknowledgement of the next group commit. + @return true if --dur is on. There will be delay. + @return false if --dur is off. + */ + virtual bool awaitCommit() = 0; + + /** Commit immediately. + + Generally, you do not want to do this often, as highly granular committing may affect + performance. + + Does not return until the commit is complete. + + You must be at least read locked when you call this. Ideally, you are not write locked + and then read operations can occur concurrently. + + @return true if --dur is on. + @return false if --dur is off. (in which case there is no action) + */ + virtual bool commitNow() = 0; + + /** Commit if enough bytes have been modified. Current threshold is UncommittedBytesLimit (50MB on 32 bit, otherwise 100MB) + + The idea is that long running write operations that don't yield + (like creating an index or update with $atomic) can call this + whenever the db is in a sane state and it will prevent commits + from growing too large. + @return true if committed + */ + virtual bool commitIfNeeded() = 0; + + /** @return true if time to commit but does NOT do a commit */ + virtual bool aCommitIsNeeded() const = 0; + + /** Declare write intent for a DiskLoc. @see DiskLoc::writing() */ + inline DiskLoc& writingDiskLoc(DiskLoc& d) { return *((DiskLoc*) writingPtr(&d, sizeof(d))); } + + /** Declare write intent for an int */ + inline int& writingInt(const int& d) { return *((int*) writingPtr((int*) &d, sizeof(d))); } + + /** "assume i've already indicated write intent, let me write" + redeclaration is fine too, but this is faster. + */ + template <typename T> + inline + T* alreadyDeclared(T *x) { +#if defined(_TESTINTENT) + return (T*) MongoMMF::switchToPrivateView(x); +#else + return x; +#endif + } + + /** declare intent to write to x for sizeof(*x) */ + template <typename T> + inline + T* writing(T *x) { + return (T*) writingPtr(x, sizeof(T)); + } + + /** write something that doesn't have to be journaled, as this write is "unimportant". + a good example is paddingFactor. + can be thought of as memcpy(dst,src,len) + the dur implementation acquires a mutex in this method, so do not assume it is faster + without measuring!
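+
+ a hedged usage sketch (nsd and its field are illustrative, not from this
+ header):
+ double pf = 1.5;
+ getDur().setNoJournal(&nsd->paddingFactor, &pf, sizeof(pf));
+ the bytes reach both memory views but never the journal, so a crash may
+ silently lose them -- acceptable only for hint style data.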
+ */ + virtual void setNoJournal(void *dst, void *src, unsigned len) = 0; + + /** Commits pending changes, flushes all changes to main data + files, then removes the journal. + + This is useful as a "barrier" to ensure that writes before this + call will never go through recovery and be applied to files + that have had changes made after this call applied. + */ + virtual void syncDataAndTruncateJournal() = 0; + + static DurableInterface& getDur() { return *_impl; } + + private: + /** Intentionally unimplemented method. + It's very easy to manipulate Record::data open ended. Thus a call to writing(Record*) is suspect. + This will override the templated version and yield an unresolved external. + */ + Record* writing(Record* r); + /** Intentionally unimplemented method. BtreeBuckets are allocated in buffers larger than sizeof( BtreeBucket ). */ +// BtreeBucket* writing( BtreeBucket* ); + /** Intentionally unimplemented method. NamespaceDetails may be based on references to 'Extra' objects. */ + NamespaceDetails* writing( NamespaceDetails* ); + + static DurableInterface* _impl; // NonDurableImpl at startup() + static void enableDurability(); // makes _impl a DurableImpl + static void disableDurability(); // makes _impl a NonDurableImpl + + // these need to be able to enable/disable Durability + friend void startup(); + friend class TempDisableDurability; + }; // class DurableInterface + + class NonDurableImpl : public DurableInterface { + void* writingPtr(void *x, unsigned len) { return x; } + void* writingAtOffset(void *buf, unsigned ofs, unsigned len) { return buf; } + void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges) { return buf; } + void declareWriteIntent(void *, unsigned) { } + void createdFile(string filename, unsigned long long len) { } + bool awaitCommit() { return false; } + bool commitNow() { return false; } + bool commitIfNeeded() { return false; } + bool aCommitIsNeeded() const { return false; } + void setNoJournal(void *dst, void *src, unsigned len); + void syncDataAndTruncateJournal() {} + }; + + class DurableImpl : public DurableInterface { + void* writingPtr(void *x, unsigned len); + void* writingAtOffset(void *buf, unsigned ofs, unsigned len); + void* writingRangesAtOffsets(void *buf, const vector< pair< long long, unsigned > > &ranges); + void declareWriteIntent(void *, unsigned); + void createdFile(string filename, unsigned long long len); + bool awaitCommit(); + bool commitNow(); + bool aCommitIsNeeded() const; + bool commitIfNeeded(); + void setNoJournal(void *dst, void *src, unsigned len); + void syncDataAndTruncateJournal(); + }; + + } // namespace dur + + inline dur::DurableInterface& getDur() { return dur::DurableInterface::getDur(); } + + /** declare that we are modifying a diskloc and this is a datafile write. */ + inline DiskLoc& DiskLoc::writing() const { return getDur().writingDiskLoc(*const_cast< DiskLoc * >( this )); } + +} diff --git a/src/mongo/db/dur_commitjob.cpp b/src/mongo/db/dur_commitjob.cpp new file mode 100644 index 00000000000..5a9e9cb5679 --- /dev/null +++ b/src/mongo/db/dur_commitjob.cpp @@ -0,0 +1,240 @@ +/* @file dur_commitjob.cpp */ + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "dur_commitjob.h" +#include "dur_stats.h" +#include "taskqueue.h" +#include "client.h" + +namespace mongo { + + namespace dur { + + BOOST_STATIC_ASSERT( UncommittedBytesLimit > BSONObjMaxInternalSize * 3 ); + BOOST_STATIC_ASSERT( sizeof(void*)==4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6 ); + + void Writes::D::go(const Writes::D& d) { + commitJob.wi()._insertWriteIntent(d.p, d.len); + } + + void WriteIntent::absorb(const WriteIntent& other) { + dassert(overlaps(other)); + + void* newStart = min(start(), other.start()); + p = max(p, other.p); + len = (char*)p - (char*)newStart; + + dassert(contains(other)); + } + + void Writes::clear() { + d.dbMutex.assertAtLeastReadLocked(); + + _alreadyNoted.clear(); + _writes.clear(); + _ops.clear(); + _drained = false; +#if defined(DEBUG_WRITE_INTENT) + cout << "_debug clear\n"; + _debug.clear(); +#endif + } + +#if defined(DEBUG_WRITE_INTENT) + void assertAlreadyDeclared(void *p, int len) { + if( commitJob.wi()._debug[p] >= len ) + return; + log() << "assertAlreadyDeclared fails " << (void*)p << " len:" << len << ' ' << commitJob.wi()._debug[p] << endl; + printStackTrace(); + abort(); + } +#endif + + void Writes::_insertWriteIntent(void* p, int len) { + WriteIntent wi(p, len); + + if (_writes.empty()) { + _writes.insert(wi); + return; + } + + typedef set<WriteIntent>::const_iterator iterator; // shorter + + iterator closest = _writes.lower_bound(wi); + // closest.end() >= wi.end() + + if ((closest != _writes.end() && closest->overlaps(wi)) || // high end + (closest != _writes.begin() && (--closest)->overlaps(wi))) { // low end + if (closest->contains(wi)) + return; // nothing to do + + // find overlapping range and merge into wi + iterator end(closest); + iterator begin(closest); + while ( end->overlaps(wi)) { wi.absorb(*end); ++end; if (end == _writes.end()) break; } // look forwards + while (begin->overlaps(wi)) { wi.absorb(*begin); if (begin == _writes.begin()) break; --begin; } // look backwards + if (!begin->overlaps(wi)) ++begin; // make inclusive + + DEV { // ensure we're not deleting anything we shouldn't + for (iterator it(begin); it != end; ++it) { + assert(wi.contains(*it)); + } + } + + _writes.erase(begin, end); + _writes.insert(wi); + + DEV { // ensure there are no overlaps + // this can be very slow - n^2 - so make it RARELY + RARELY { + for (iterator it(_writes.begin()), end(boost::prior(_writes.end())); it != end; ++it) { + assert(!it->overlaps(*boost::next(it))); + } + } + } + } + else { // no entries overlapping wi + _writes.insert(closest, wi); + } + } + + /** note an operation other than a "basic write" */ + void CommitJob::noteOp(shared_ptr<DurOp> p) { + d.dbMutex.assertWriteLocked(); + dassert( cmdLine.dur ); + cc()._hasWrittenThisPass = true; + if( !_hasWritten ) { + assert( !d.dbMutex._remapPrivateViewRequested ); + _hasWritten = true; + } + _wi._ops.push_back(p); + } + + size_t privateMapBytes = 0; // used by _REMAPPRIVATEVIEW to track how much / how fast to remap + + void CommitJob::beginCommit() { + DEV d.dbMutex.assertAtLeastReadLocked(); + _commitNumber = _notify.now(); + 
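// editorial note (added commentary): _commitNumber is the NotifyAll token
+ // that notifyCommitted() later hands to _notify.notifyAll() (see
+ // dur_commitjob.h); awaitCommit() blocks on the same _notify, which is how
+ // getLastError j:true waiters learn their writes reached the journal. +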
stats.curr->_commits++; + } + + void CommitJob::reset() { + _hasWritten = false; + _wi.clear(); + privateMapBytes += _bytes; + _bytes = 0; + _nSinceCommitIfNeededCall = 0; + } + + CommitJob::CommitJob() : _ab(4 * 1024 * 1024) , _hasWritten(false), + _bytes(0), _nSinceCommitIfNeededCall(0) { + _commitNumber = 0; + } + + extern unsigned notesThisLock; + + void CommitJob::note(void* p, int len) { + // from the point of view of the dur module, it would be fine (i think) to only + // be read locked here. but must be at least read locked to avoid race with + // remapprivateview + DEV notesThisLock++; + DEV d.dbMutex.assertWriteLocked(); + dassert( cmdLine.dur ); + cc()._hasWrittenThisPass = true; + if( !_wi._alreadyNoted.checkAndSet(p, len) ) { + MemoryMappedFile::makeWritable(p, len); + + if( !_hasWritten ) { + // you can't be writing if one of these is pending, so this is a verification. + assert( !d.dbMutex._remapPrivateViewRequested ); // safe to assert here since it must be the first write in a write lock + + // we don't bother doing a group commit when nothing is written, so we have a var to track that + _hasWritten = true; + } + + /** tips for debugging: + if you have an incorrect diff between data files in different folders + (see jstests/dur/quick.js for example), + turn this on and see what is logged. if you have a copy of its output from before the + regression, a simple diff of these lines would tell you a lot likely. + */ +#if 0 && defined(_DEBUG) + { + static int n; + if( ++n < 10000 ) { + size_t ofs; + MongoMMF *mmf = privateViews._find(w.p, ofs); + if( mmf ) { + log() << "DEBUG note write intent " << w.p << ' ' << mmf->filename() << " ofs:" << hex << ofs << " len:" << w.len << endl; + } + else { + log() << "DEBUG note write intent " << w.p << ' ' << w.len << " NOT FOUND IN privateViews" << endl; + } + } + else if( n == 10000 ) { + log() << "DEBUG stopping write intent logging, too much to log" << endl; + } + } +#endif + + // remember intent. 
we will journal it in a bit + _wi.insertWriteIntent(p, len); + wassert( _wi._writes.size() < 2000000 ); + //assert( _wi._writes.size() < 20000000 ); + + { + // a bit over conservative in counting pagebytes used + static size_t lastPos; // note this doesn't reset with each commit, but that is ok we aren't being that precise + size_t x = ((size_t) p) & ~0xfff; // round off to page address (4KB) + if( x != lastPos ) { + lastPos = x; + unsigned b = (len+4095) & ~0xfff; + _bytes += b; +#if defined(_DEBUG) + _nSinceCommitIfNeededCall++; + if( _nSinceCommitIfNeededCall >= 80 ) { + if( _nSinceCommitIfNeededCall % 40 == 0 ) { + log() << "debug nsincecommitifneeded:" << _nSinceCommitIfNeededCall << " bytes:" << _bytes << endl; + if( _nSinceCommitIfNeededCall == 120 || _nSinceCommitIfNeededCall == 1200 ) { + log() << "_DEBUG printing stack given high nsinccommitifneeded number" << endl; + printStackTrace(); + } + } + } +#endif + if (_bytes > UncommittedBytesLimit * 3) { + static time_t lastComplain; + static unsigned nComplains; + // throttle logging + if( ++nComplains < 100 || time(0) - lastComplain >= 60 ) { + lastComplain = time(0); + warning() << "DR102 too much data written uncommitted " << _bytes/1000000.0 << "MB" << endl; + if( nComplains < 10 || nComplains % 10 == 0 ) { + // wassert makes getLastError show an error, so we just print stack trace + printStackTrace(); + } + } + } + } + } + } + } + + } +} diff --git a/src/mongo/db/dur_commitjob.h b/src/mongo/db/dur_commitjob.h new file mode 100644 index 00000000000..bfc5e3c268f --- /dev/null +++ b/src/mongo/db/dur_commitjob.h @@ -0,0 +1,220 @@ +/* @file dur_commitjob.h used by dur.cpp +*/ + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../util/alignedbuilder.h" +#include "../util/mongoutils/hash.h" +#include "../util/concurrency/synchronization.h" +#include "cmdline.h" +#include "durop.h" +#include "dur.h" +#include "taskqueue.h" + +//#define DEBUG_WRITE_INTENT 1 + +namespace mongo { + namespace dur { + + /** declaration of an intent to write to a region of a memory mapped view + * + * We store the end rather than the start pointer to make operator< faster + * since that is heavily used in set lookup. + */ + struct WriteIntent { /* copyable */ + WriteIntent() : /*w_ptr(0), */ p(0) { } + WriteIntent(void *a, unsigned b) : /*w_ptr(0), */ p((char*)a+b), len(b) { } + + void* start() const { return (char*)p - len; } + void* end() const { return p; } + unsigned length() const { return len; } + + bool operator < (const WriteIntent& rhs) const { return end() < rhs.end(); } + + // can they be merged? + bool overlaps(const WriteIntent& rhs) const { + return (start() <= rhs.end() && end() >= rhs.start()); + } + + // is merging necessary? 
+ bool contains(const WriteIntent& rhs) const { + return (start() <= rhs.start() && end() >= rhs.end()); + } + + // merge into me + void absorb(const WriteIntent& other); + + friend ostream& operator << (ostream& out, const WriteIntent& wi) { + return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len); + } + + //mutable void *w_ptr; // writable mapping of p. + // mutable because set::iterator is const but this isn't used in op< +#if defined(_EXPERIMENTAL) + mutable unsigned ofsInJournalBuffer; +#endif + private: + void *p; // intent to write up to p + unsigned len; // up to this len + }; + + /** try to remember things we have already marked for journaling. false negatives are ok if infrequent - + we will just log them twice. + */ + template<int Prime> + class Already : boost::noncopyable { + public: + Already() { clear(); } + void clear() { memset(this, 0, sizeof(*this)); } + + /* see if we have Already recorded/indicated our write intent for this region of memory. + automatically upgrades the length if the length was shorter previously. + @return true if already indicated. + */ + bool checkAndSet(void* p, int len) { + unsigned x = mongoutils::hashPointer(p); + pair<void*, int>& nd = nodes[x % N]; + if( nd.first == p ) { + if( nd.second < len ) { + nd.second = len; + return false; // haven't indicated this len yet + } + return true; // already indicated + } + nd.first = p; + nd.second = len; + return false; // a new set + } + + private: + enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily + pair<void*,int> nodes[N]; + }; + + /** our record of pending/uncommitted write intents */ + class Writes : boost::noncopyable { + struct D { + void *p; + unsigned len; + static void go(const D& d); + }; + public: + TaskQueue<D> _deferred; + Already<127> _alreadyNoted; + set<WriteIntent> _writes; + vector< shared_ptr<DurOp> > _ops; // all the ops other than basic writes + bool _drained; // _deferred is drained? for asserting/testing + + /** reset the Writes structure (empties all the above) */ + void clear(); + + /** merges into set (ie non-deferred version) */ + void _insertWriteIntent(void* p, int len); + + void insertWriteIntent(void* p, int len) { +#if defined(DEBUG_WRITE_INTENT) + if( _debug[p] < len ) + _debug[p] = len; +#endif + D d; + d.p = p; + d.len = len; + _deferred.defer(d); + } + +#ifdef _DEBUG + WriteIntent _last; +#endif +#if defined(DEBUG_WRITE_INTENT) + map<void*,int> _debug; +#endif + }; + +#if defined(DEBUG_WRITE_INTENT) + void assertAlreadyDeclared(void *, int len); +#else + inline void assertAlreadyDeclared(void *, int len) { } +#endif + + /** A commit job object for a group commit. Currently there is one instance of this object. + + concurrency: assumption is caller is appropriately locking. + for example note() invocations are from the write lock. + other uses are in a read lock from a single thread (durThread) + */ + class CommitJob : boost::noncopyable { + public: + AlignedBuilder _ab; // for direct i/o writes to journal + + CommitJob(); + + ~CommitJob(){ assert(!"shouldn't destroy CommitJob!"); } + + /** record/note an intent to write */ + void note(void* p, int len); + + /** note an operation other than a "basic write" */ + void noteOp(shared_ptr<DurOp> p); + + set<WriteIntent>& writes() { + if( !_wi._drained ) { + // generally, you don't want to use the set until it is prepared (after deferred ops are applied) + // thus this assert here. 
+ assert(false); + } + return _wi._writes; + } + + vector< shared_ptr<DurOp> >& ops() { return _wi._ops; } + + /** this method is safe to call outside of locks. when haswritten is false we don't do any group commit and avoid even + trying to acquire a lock, which might be helpful at times. + */ + bool hasWritten() const { return _hasWritten; } + + /** we use the commitjob object over and over, calling reset() rather than reconstructing */ + void reset(); + + void beginCommit(); + + /** the commit code calls this when data reaches the journal (on disk) */ + void notifyCommitted() { _notify.notifyAll(_commitNumber); } + + /** we check how much written and if it is getting to be a lot, we commit sooner. */ + size_t bytes() const { return _bytes; } + +#if defined(_DEBUG) + const WriteIntent& lastWrite() const { return _wi._last; } +#endif + + Writes& wi() { return _wi; } + private: + NotifyAll::When _commitNumber; + bool _hasWritten; + Writes _wi; // todo: fix name + size_t _bytes; + public: + NotifyAll _notify; // for getlasterror fsync:true acknowledgements + unsigned _nSinceCommitIfNeededCall; + }; + + extern CommitJob& commitJob; + + } +} diff --git a/src/mongo/db/dur_journal.cpp b/src/mongo/db/dur_journal.cpp new file mode 100644 index 00000000000..6a6609f55ee --- /dev/null +++ b/src/mongo/db/dur_journal.cpp @@ -0,0 +1,748 @@ +// @file dur_journal.cpp writing to the writeahead logging journal + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "client.h" +#include "namespace.h" +#include "dur_journal.h" +#include "dur_journalformat.h" +#include "dur_stats.h" +#include "../util/logfile.h" +#include "../util/timer.h" +#include "../util/alignedbuilder.h" +#include "../util/net/listen.h" // getelapsedtimemillis +#include <boost/static_assert.hpp> +#include <boost/filesystem.hpp> +#undef assert +#define assert MONGO_assert +#include "../util/mongoutils/str.h" +#include "dur_journalimpl.h" +#include "../util/file.h" +#include "../util/checksum.h" +#include "../util/concurrency/race.h" +#include "../util/compress.h" +#include "../server.h" + +using namespace mongoutils; + +namespace mongo { + + class AlignedBuilder; + + unsigned goodRandomNumberSlow(); + + namespace dur { + // Rotate after reaching this data size in a journal (j._<n>) file + // We use a smaller size for 32 bit as the journal is mmapped during recovery (only) + // Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must + // work. (and should as-is) + // --smallfiles makes the limit small. + +#if defined(_DEBUG) + unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024; +#elif defined(__APPLE__) + // assuming a developer box if OS X + unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024; +#else + unsigned long long DataLimitPerJournalFile = (sizeof(void*)==4) ? 
256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024; +#endif + + BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 ); + BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 ); + BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 ); + BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 ); + BOOST_STATIC_ASSERT( sizeof(JEntry) == 12 ); + BOOST_STATIC_ASSERT( sizeof(LSNFile) == 88 ); + + bool usingPreallocate = false; + + void removeOldJournalFile(path p); + + boost::filesystem::path getJournalDir() { + boost::filesystem::path p(dbpath); + p /= "journal"; + return p; + } + + path lsnPath() { + return getJournalDir()/"lsn"; + } + + /** this should be called when something really bad happens so that we can flag appropriately + */ + void journalingFailure(const char *msg) { + /** todo: + (1) don't log too much + (2) make an indicator in the journal dir that something bad happened. + (2b) refuse to do a recovery startup if that is there without manual override. + */ + log() << "journaling failure/error: " << msg << endl; + assert(false); + } + + JSectFooter::JSectFooter() { + memset(this, 0, sizeof(*this)); + sentinel = JEntry::OpCode_Footer; + } + + JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash + sentinel = JEntry::OpCode_Footer; + reserved = 0; + magic[0] = magic[1] = magic[2] = magic[3] = '\n'; + + Checksum c; + c.gen(begin, (unsigned) len); + memcpy(hash, c.bytes, sizeof(hash)); + } + + bool JSectFooter::checkHash(const void* begin, int len) const { + if( !magicOk() ) { + log() << "journal footer not valid" << endl; + return false; + } + Checksum c; + c.gen(begin, len); + DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl; + if( memcmp(hash, c.bytes, sizeof(hash)) == 0 ) + return true; + log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl; + return false; + } + + JHeader::JHeader(string fname) { + magic[0] = 'j'; magic[1] = '\n'; + _version = CurrentVersion; + memset(ts, 0, sizeof(ts)); + time_t t = time(0); + strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts)-1); + memset(dbpath, 0, sizeof(dbpath)); + strncpy(dbpath, fname.c_str(), sizeof(dbpath)-1); + { + fileId = t&0xffffffff; + fileId |= ((unsigned long long)goodRandomNumberSlow()) << 32; + } + memset(reserved3, 0, sizeof(reserved3)); + txt2[0] = txt2[1] = '\n'; + n1 = n2 = n3 = n4 = '\n'; + } + + Journal j; + + const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0); + + Journal::Journal() : + _curLogFileMutex("JournalLfMutex") { + _ageOut = true; + _written = 0; + _nextFileNumber = 0; + _curLogFile = 0; + _curFileId = 0; + _preFlushTime = 0; + _lastFlushTime = 0; + _writeToLSNNeeded = false; + } + + path Journal::getFilePathFor(int filenumber) const { + boost::filesystem::path p(dir); + p /= string(str::stream() << "j._" << filenumber); + return p; + } + + /** never throws + @return true if journal dir is not empty + */ + bool haveJournalFiles() { + try { + for ( boost::filesystem::directory_iterator i( getJournalDir() ); + i != boost::filesystem::directory_iterator(); + ++i ) { + string fileName = boost::filesystem::path(*i).leaf(); + if( str::startsWith(fileName, "j._") ) + return true; + } + } + catch(...) 
{ } + return false; + } + + /** throws */ + void removeJournalFiles() { + log() << "removeJournalFiles" << endl; + try { + for ( boost::filesystem::directory_iterator i( getJournalDir() ); + i != boost::filesystem::directory_iterator(); + ++i ) { + string fileName = boost::filesystem::path(*i).leaf(); + if( str::startsWith(fileName, "j._") ) { + try { + removeOldJournalFile(*i); + } + catch(std::exception& e) { + log() << "couldn't remove " << fileName << ' ' << e.what() << endl; + throw; + } + } + } + try { + boost::filesystem::remove(lsnPath()); + } + catch(...) { + log() << "couldn't remove " << lsnPath().string() << endl; + throw; + } + } + catch( std::exception& e ) { + log() << "error removing journal files " << e.what() << endl; + throw; + } + assert(!haveJournalFiles()); + + flushMyDirectory(getJournalDir() / "file"); // flushes parent of argument (in this case journal dir) + + log(1) << "removeJournalFiles end" << endl; + } + + /** at clean shutdown */ + bool okToCleanUp = false; // successful recovery would set this to true + void Journal::cleanup(bool _log) { + if( !okToCleanUp ) + return; + + if( _log ) + log() << "journalCleanup..." << endl; + try { + SimpleMutex::scoped_lock lk(_curLogFileMutex); + closeCurrentJournalFile(); + removeJournalFiles(); + } + catch(std::exception& e) { + log() << "error couldn't remove journal file during shutdown " << e.what() << endl; + throw; + } + } + void journalCleanup(bool log) { j.cleanup(log); } + + bool _preallocateIsFaster() { + bool faster = false; + boost::filesystem::path p = getJournalDir() / "tempLatencyTest"; + try { remove(p); } catch(...) { } + try { + AlignedBuilder b(8192); + int millis[2]; + const int N = 50; + for( int pass = 0; pass < 2; pass++ ) { + LogFile f(p.string()); + Timer t; + for( int i = 0 ; i < N; i++ ) { + f.synchronousAppend(b.buf(), 8192); + } + millis[pass] = t.millis(); + // second time through, file exists and is prealloc case + } + int diff = millis[0] - millis[1]; + if( diff > 2 * N ) { + // at least 2ms faster for prealloc case? + faster = true; + log() << "preallocateIsFaster=true " << diff / (1.0*N) << endl; + } + } + catch(...) { + log() << "info preallocateIsFaster couldn't run; returning false" << endl; + } + try { remove(p); } catch(...) { } + return faster; + } + bool preallocateIsFaster() { + Timer t; + bool res = false; + if( _preallocateIsFaster() && _preallocateIsFaster() ) { + // maybe system is just super busy at the moment? sleep a second to let it calm down. 
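+ // (the latency test already won twice in a row; confirm once more after the pause)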
+ // deciding to prealloc is a medium big decision:
+ sleepsecs(1);
+ res = _preallocateIsFaster();
+ }
+ if( t.millis() > 3000 )
+ log() << "preallocateIsFaster check took " << t.millis()/1000.0 << " secs" << endl;
+ return res;
+ }
+
+ // throws
+ void preallocateFile(boost::filesystem::path p, unsigned long long len) {
+ if( exists(p) )
+ return;
+
+ log() << "preallocating a journal file " << p.string() << endl;
+
+ const unsigned BLKSZ = 1024 * 1024;
+ assert( len % BLKSZ == 0 );
+
+ AlignedBuilder b(BLKSZ);
+ memset((void*)b.buf(), 0, BLKSZ);
+
+ ProgressMeter m(len, 3/*secs*/, 10/*hits between time check (once every 6.4MB)*/);
+
+ File f;
+ f.open( p.string().c_str() , /*read-only*/false , /*direct-io*/false );
+ assert( f.is_open() );
+ fileofs loc = 0;
+ while ( loc < len ) {
+ f.write( loc , b.buf() , BLKSZ );
+ loc += BLKSZ;
+ m.hit(BLKSZ);
+ }
+ assert( loc == len );
+ f.fsync();
+ }
+
+ const int NUM_PREALLOC_FILES = 3;
+ inline boost::filesystem::path preallocPath(int n) {
+ assert(n >= 0);
+ assert(n < NUM_PREALLOC_FILES);
+ string fn = str::stream() << "prealloc." << n;
+ return getJournalDir() / fn;
+ }
+
+ // throws
+ void _preallocateFiles() {
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+
+ unsigned long long limit = DataLimitPerJournalFile;
+ if( debug && i == 1 ) {
+ // moving 32->64, the prealloc files would be short. that is "ok", but we want to exercise that
+ // case, so we force exercising here when _DEBUG is set by arbitrarily stopping prealloc at a low
+ // limit for a file. also we want to be able to change the constant in the future without a lot of
+ // work anyway.
+ limit = 16 * 1024 * 1024;
+ }
+ preallocateFile(filepath, limit);
+ }
+ }
+
+ void checkFreeSpace() {
+ unsigned long long spaceNeeded = static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom
+ unsigned long long freeSpace = File::freeSpace(getJournalDir().string());
+ unsigned long long prealloced = 0;
+ for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) {
+ boost::filesystem::path filepath = preallocPath(i);
+ if (exists(filepath))
+ prealloced += file_size(filepath);
+ }
+
+ if (freeSpace + prealloced < spaceNeeded) {
+ log() << endl;
+ error() << "Insufficient free space for journals." << endl;
+ log() << "Please make at least " << spaceNeeded/(1024*1024) << "MB available in " << getJournalDir().string() << endl;
+ log() << endl;
+ throw UserException(15926, "Insufficient free space for journals");
+ }
+ }
+
+ void preallocateFiles() {
+ if (! (cmdLine.durOptions & CmdLine::DurNoCheckSpace))
+ checkFreeSpace();
+
+ if( exists(preallocPath(0)) || // if enabled previously, keep using
+ exists(preallocPath(1)) ||
+ ( cmdLine.preallocj && preallocateIsFaster() ) ) {
+ usingPreallocate = true;
+ try {
+ _preallocateFiles();
+ }
+ catch(...)
{ + log() << "warning caught exception in preallocateFiles, continuing" << endl; + } + } + j.open(); + } + + void removeOldJournalFile(path p) { + if( usingPreallocate ) { + try { + for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) { + boost::filesystem::path filepath = preallocPath(i); + if( !boost::filesystem::exists(filepath) ) { + // we can recycle this file into this prealloc file location + boost::filesystem::path temppath = filepath.string() + ".temp"; + boost::filesystem::rename(p, temppath); + { + // zero the header + File f; + f.open(temppath.string().c_str(), false, false); + char buf[8192]; + memset(buf, 0, 8192); + f.write(0, buf, 8192); + f.truncate(DataLimitPerJournalFile); + f.fsync(); + } + boost::filesystem::rename(temppath, filepath); + return; + } + } + } catch(...) { + log() << "warning exception in dur::removeOldJournalFile " << p.string() << endl; + // fall through and try to delete the file + } + } + + // already have 3 prealloc files, so delete this file + try { + boost::filesystem::remove(p); + } + catch(...) { + log() << "warning exception removing " << p.string() << endl; + } + } + + // find a prealloc.<n> file, presumably to take and use + path findPrealloced() { + try { + for( int i = 0; i < NUM_PREALLOC_FILES; i++ ) { + boost::filesystem::path filepath = preallocPath(i); + if( boost::filesystem::exists(filepath) ) + return filepath; + } + } catch(...) { + log() << "warning exception in dur::findPrealloced()" << endl; + } + return path(); + } + + /** assure journal/ dir exists. throws. call during startup. */ + void journalMakeDir() { + j.init(); + + boost::filesystem::path p = getJournalDir(); + j.dir = p.string(); + log() << "journal dir=" << j.dir << endl; + if( !exists(j.dir) ) { + try { + create_directory(j.dir); + } + catch(std::exception& e) { + log() << "error creating directory " << j.dir << ' ' << e.what() << endl; + throw; + } + } + } + + void Journal::_open() { + _curFileId = 0; + assert( _curLogFile == 0 ); + path fname = getFilePathFor(_nextFileNumber); + + // if we have a prealloced file, use it + { + path p = findPrealloced(); + if( !p.empty() ) { + try { + { + // JHeader::fileId must be updated before renaming to be race-safe + LogFile f(p.string()); + JHeader h(p.string()); + AlignedBuilder b(8192); + b.appendStruct(h); + f.synchronousAppend(b.buf(), b.len()); + } + boost::filesystem::rename(p, fname); + } + catch(...) { + log() << "warning couldn't write to / rename file " << p.string() << endl; + } + } + } + + _curLogFile = new LogFile(fname.string()); + _nextFileNumber++; + { + JHeader h(fname.string()); + _curFileId = h.fileId; + assert(_curFileId); + AlignedBuilder b(8192); + b.appendStruct(h); + _curLogFile->synchronousAppend(b.buf(), b.len()); + } + } + + void Journal::init() { + assert( _curLogFile == 0 ); + MongoFile::notifyPreFlush = preFlush; + MongoFile::notifyPostFlush = postFlush; + } + + void Journal::open() { + assert( MongoFile::notifyPreFlush == preFlush ); + SimpleMutex::scoped_lock lk(_curLogFileMutex); + _open(); + } + + void LSNFile::set(unsigned long long x) { + memset(this, 0, sizeof(*this)); + lsn = x; + checkbytes = ~x; + } + + /** logs details of the situation, and returns 0, if anything surprising in the LSNFile + if something highly surprising, throws to abort + */ + unsigned long long LSNFile::get() { + uassert(13614, str::stream() << "unexpected version number of lsn file in journal/ directory got: " << ver , ver == 0); + if( ~lsn != checkbytes ) { + log() << "lsnfile not valid. recovery will be from log start. 
lsn: " << hex << lsn << " checkbytes: " << hex << checkbytes << endl; + return 0; + } + return lsn; + } + + /** called during recovery (the error message text below assumes that) + */ + unsigned long long journalReadLSN() { + if( !MemoryMappedFile::exists(lsnPath()) ) { + log() << "info no lsn file in journal/ directory" << endl; + return 0; + } + + try { + // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery. + // however, given we actually close the file when writing, that seems unlikely. + LSNFile L; + File f; + f.open(lsnPath().string().c_str()); + assert(f.is_open()); + if( f.len() == 0 ) { + // this could be 'normal' if we crashed at the right moment + log() << "info lsn file is zero bytes long" << endl; + return 0; + } + f.read(0,(char*)&L, sizeof(L)); + unsigned long long lsn = L.get(); + return lsn; + } + catch(std::exception& e) { + uasserted(13611, str::stream() << "can't read lsn file in journal directory : " << e.what()); + } + return 0; + } + + unsigned long long getLastDataFileFlushTime() { + return j.lastFlushTime(); + } + + /** remember "last sequence number" to speed recoveries + concurrency: called by durThread only. + */ + void Journal::updateLSNFile() { + RACECHECK + if( !_writeToLSNNeeded ) + return; + _writeToLSNNeeded = false; + try { + // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery. + // however, given we actually close the file, that seems unlikely. + File f; + f.open(lsnPath().string().c_str()); + if( !f.is_open() ) { + // can get 0 if an i/o error + log() << "warning: open of lsn file failed" << endl; + return; + } + LOG(1) << "lsn set " << _lastFlushTime << endl; + LSNFile lsnf; + lsnf.set(_lastFlushTime); + f.write(0, (char*)&lsnf, sizeof(lsnf)); + // do we want to fsync here? if we do it probably needs to be async so the durthread + // is not delayed. + } + catch(std::exception& e) { + log() << "warning: write to lsn file failed " << e.what() << endl; + // keep running (ignore the error). recovery will be slow. + } + } + + void Journal::preFlush() { + j._preFlushTime = Listener::getElapsedTimeMillis(); + } + + void Journal::postFlush() { + j._lastFlushTime = j._preFlushTime; + j._writeToLSNNeeded = true; + } + + // call from within _curLogFileMutex + void Journal::closeCurrentJournalFile() { + if (!_curLogFile) + return; + + JFile jf; + jf.filename = _curLogFile->_name; + jf.lastEventTimeMs = Listener::getElapsedTimeMillis(); + _oldJournalFiles.push_back(jf); + + delete _curLogFile; // close + _curLogFile = 0; + _written = 0; + } + + /** remove older journal files. + be in _curLogFileMutex but not dbMutex when calling + */ + void Journal::removeUnneededJournalFiles() { + while( !_oldJournalFiles.empty() ) { + JFile f = _oldJournalFiles.front(); + + if( f.lastEventTimeMs < _lastFlushTime + ExtraKeepTimeMs ) { + // eligible for deletion + path p( f.filename ); + log() << "old journal file will be removed: " << f.filename << endl; + removeOldJournalFile(p); + } + else { + break; + } + + _oldJournalFiles.pop_front(); + } + } + + /*int getAgeOutJournalFiles() { + mutex::try_lock lk(j._curLogFileMutex, 4000); + if( !lk.ok ) + return -1; + return j._ageOut ? 
1 : 0;
+ }*/
+ void setAgeOutJournalFiles(bool a) {
+ SimpleMutex::scoped_lock lk(j._curLogFileMutex);
+ j._ageOut = a;
+ }
+
+ void Journal::_rotate() {
+ if( d.dbMutex.atLeastReadLocked() ) {
+ LOGSOME << "info journal _rotate called inside dbMutex - ok but should be somewhat rare" << endl;
+ }
+
+ RACECHECK;
+
+ _curLogFileMutex.dassertLocked();
+
+ if ( inShutdown() || !_curLogFile )
+ return;
+
+ j.updateLSNFile();
+
+ if( _curLogFile && _written < DataLimitPerJournalFile )
+ return;
+
+ if( _curLogFile ) {
+ _curLogFile->truncate();
+ closeCurrentJournalFile();
+ removeUnneededJournalFiles();
+ }
+
+ try {
+ Timer t;
+ _open();
+ int ms = t.millis();
+ if( ms >= 200 ) {
+ log() << "DR101 latency warning on journal file open " << ms << "ms" << endl;
+ }
+ }
+ catch(std::exception& e) {
+ log() << "warning exception opening journal file " << e.what() << endl;
+ throw;
+ }
+ }
+
+ /** write (append) the buffer we have built to the journal and fsync it.
+ outside of dbMutex lock as this could be slow.
+ @param uncompressed - a buffer that will be written to the journal after compression
+ will not return until on disk
+ */
+ void WRITETOJOURNAL(JSectHeader h, AlignedBuilder& uncompressed) {
+ Timer t;
+ j.journal(h, uncompressed);
+ stats.curr->_writeToJournalMicros += t.micros();
+ }
+ void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) {
+ RACECHECK
+ static AlignedBuilder b(32*1024*1024);
+ /* buffer to journal will be
+ JSectHeader
+ compressed operations
+ JSectFooter
+ */
+ const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter);
+ const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize;
+ b.reset(max);
+
+ {
+ dassert( h.sectionLen() == (unsigned) 0xffffffff ); // we will backfill later
+ b.appendStruct(h);
+ }
+
+ size_t compressedLength = 0;
+ rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength);
+ assert( compressedLength < 0xffffffff );
+ assert( compressedLength < max );
+ b.skip(compressedLength);
+
+ // footer
+ unsigned L = 0xffffffff;
+ {
+ // pad to alignment, and set the total section length in the JSectHeader
+ assert( 0xffffe000 == (~(Alignment-1)) );
+ unsigned lenUnpadded = b.len() + sizeof(JSectFooter);
+ L = (lenUnpadded + Alignment-1) & (~(Alignment-1));
+ dassert( L >= lenUnpadded );
+
+ ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded);
+
+ JSectFooter f(b.buf(), b.len()); // computes checksum
+ b.appendStruct(f);
+ dassert( b.len() == lenUnpadded );
+
+ b.skip(L - lenUnpadded);
+ dassert( b.len() % Alignment == 0 );
+ }
+
+ try {
+ SimpleMutex::scoped_lock lk(_curLogFileMutex);
+
+ // must already be open -- so that _curFileId is correct for previous buffer building
+ assert( _curLogFile );
+
+ stats.curr->_uncompressedBytes += b.len();
+ unsigned w = b.len();
+ _written += w;
+ assert( w <= L );
+ stats.curr->_journaledBytes += L;
+ _curLogFile->synchronousAppend((const void *) b.buf(), L);
+ _rotate();
+ }
+ catch(std::exception& e) {
+ log() << "error exception in dur::journal " << e.what() << endl;
+ throw;
+ }
+ }
+
+ }
+}
+
+/* todo
+ test (and handle) disk full on journal append. best quick thing to do is to terminate.
+ if we roll back operations, there are nuances such as is ReplSetImpl::lastOpTimeWritten too new in ram then?
+*/
diff --git a/src/mongo/db/dur_journal.h b/src/mongo/db/dur_journal.h
new file mode 100644
index 00000000000..664f63942e0
--- /dev/null
+++ b/src/mongo/db/dur_journal.h
@@ -0,0 +1,68 @@
+// @file dur_journal.h
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+ class AlignedBuilder;
+
+ namespace dur {
+
+ /** true if ok to clean up journal files at termination. otherwise, journal files will be retained.
+ */
+ extern bool okToCleanUp;
+
+ /** at termination after db files closed & fsynced
+ also after recovery
+ closes and removes journal files
+ @param log report in log that we are cleaning up if we actually do any work
+ */
+ void journalCleanup(bool log = false);
+
+ /** assure journal/ dir exists. throws */
+ void journalMakeDir();
+
+ /** check if time to rotate files; assure a file is open.
+ done separately from the journal() call as we can do this part
+ outside of lock.
+ only called by durThread.
+ */
+ void journalRotate();
+
+ /** flag that something has gone wrong during writing to the journal
+ (not for recovery mode)
+ */
+ void journalingFailure(const char *msg);
+
+ /** read lsn from disk from the last run before doing recovery */
+ unsigned long long journalReadLSN();
+
+ unsigned long long getLastDataFileFlushTime();
+
+ /** never throws.
+ @return true if there are any journal files in the journal dir.
+ */
+ bool haveJournalFiles();
+
+ // in case disk controller buffers writes
+ const long long ExtraKeepTimeMs = 10000;
+
+ const unsigned JournalCommitIntervalDefault = 100;
+
+ }
+}
diff --git a/src/mongo/db/dur_journalformat.h b/src/mongo/db/dur_journalformat.h
new file mode 100644
index 00000000000..10ed8487b71
--- /dev/null
+++ b/src/mongo/db/dur_journalformat.h
@@ -0,0 +1,174 @@
+// @file dur_journalformat.h The format of our journal files.
+
+/**
+* Copyright (C) 2010 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+
+ namespace dur {
+
+ const unsigned Alignment = 8192;
+
+#pragma pack(1)
+ /** beginning header for a journal/j._<n> file
+ there is nothing important in this header at this time. except perhaps version #.
+ */
+ struct JHeader {
+ JHeader() { }
+ JHeader(string fname);
+
+ char magic[2]; // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or something...
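+ // full 8KB header layout (cf. the sizeof(JHeader) == 8192 static assert in dur_journal.cpp):
+ // magic[2] + _version(2) + n1(1) + ts[20] + n2(1) + dbpath[128] + n3,n4(2)
+ // + fileId(8) + reserved3[8026] + txt2[2] = 8192 bytes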
+ + // x4142 is asci--readable if you look at the file with head/less -- thus the starting values were near + // that. simply incrementing the version # is safe on a fwd basis. +#if defined(_NOCOMPRESS) + enum { CurrentVersion = 0x4148 }; +#else + enum { CurrentVersion = 0x4149 }; +#endif + unsigned short _version; + + // these are just for diagnostic ease (make header more useful as plain text) + char n1; // '\n' + char ts[20]; // ascii timestamp of file generation. for user reading, not used by code. + char n2; // '\n' + char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used by code. + char n3, n4; // '\n', '\n' + + unsigned long long fileId; // unique identifier that will be in each JSectHeader. important as we recycle prealloced files + + char reserved3[8026]; // 8KB total for the file header + char txt2[2]; // "\n\n" at the end + + bool versionOk() const { return _version == CurrentVersion; } + bool valid() const { return magic[0] == 'j' && txt2[1] == '\n' && fileId; } + }; + + /** "Section" header. A section corresponds to a group commit. + len is length of the entire section including header and footer. + header and footer are not compressed, just the stuff in between. + */ + struct JSectHeader { + private: + unsigned _sectionLen; // unpadded length in bytes of the whole section + public: + unsigned long long seqNumber; // sequence number that can be used on recovery to not do too much work + unsigned long long fileId; // matches JHeader::fileId + unsigned sectionLen() const { return _sectionLen; } + + // we store the unpadded length so we can use that when we uncompress. to + // get the true total size this must be rounded up to the Alignment. + void setSectionLen(unsigned lenUnpadded) { _sectionLen = lenUnpadded; } + + unsigned sectionLenWithPadding() const { + unsigned x = (sectionLen() + (Alignment-1)) & (~(Alignment-1)); + dassert( x % Alignment == 0 ); + return x; + } + }; + + /** an individual write operation within a group commit section. Either the entire section should + be applied, or nothing. (We check the md5 for the whole section before doing anything on recovery.) + */ + struct JEntry { + enum OpCodes { + OpCode_Footer = 0xffffffff, + OpCode_DbContext = 0xfffffffe, + OpCode_FileCreated = 0xfffffffd, + OpCode_DropDb = 0xfffffffc, + OpCode_Min = 0xfffff000 + }; + union { + unsigned len; // length in bytes of the data of the JEntry. does not include the JEntry header + OpCodes opcode; + }; + + unsigned ofs; // offset in file + + // sentinel and masks for _fileNo + enum { + DotNsSuffix = 0x7fffffff, // ".ns" file + LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext + }; + int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database + // char data[len] follows + + const char * srcData() const { + const int *i = &_fileNo; + return (const char *) (i+1); + } + + int getFileNo() const { return _fileNo & (~LocalDbBit); } + void setFileNo(int f) { _fileNo = f; } + bool isNsSuffix() const { return getFileNo() == DotNsSuffix; } + + void setLocalDbContextBit() { _fileNo |= LocalDbBit; } + bool isLocalDbContext() const { return _fileNo & LocalDbBit; } + void clearLocalDbContextBit() { _fileNo = getFileNo(); } + + static string suffix(int fileno) { + if( fileno == DotNsSuffix ) return "ns"; + stringstream ss; + ss << fileno; + return ss.str(); + } + }; + + /** group commit section footer. md5 is a key field. 
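+ on disk a full section reads: JSectHeader, the compressed entry stream, then this
+ footer, with the whole section padded up to Alignment -- see Journal::journal().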
*/ + struct JSectFooter { + JSectFooter(); + JSectFooter(const void* begin, int len); // needs buffer to compute hash + unsigned sentinel; + unsigned char hash[16]; + unsigned long long reserved; + char magic[4]; // "\n\n\n\n" + + /** used by recovery to see if buffer is valid + @param begin the buffer + @param len buffer len + @return true if buffer looks valid + */ + bool checkHash(const void* begin, int len) const; + + bool magicOk() const { return *((unsigned*)magic) == 0x0a0a0a0a; } + }; + + /** declares "the next entry(s) are for this database / file path prefix" */ + struct JDbContext { + JDbContext() : sentinel(JEntry::OpCode_DbContext) { } + const unsigned sentinel; // compare to JEntry::len -- zero is our sentinel + //char dbname[]; + }; + + /** "last sequence number" */ + struct LSNFile { + unsigned ver; + unsigned reserved2; + unsigned long long lsn; + unsigned long long checkbytes; + unsigned long long reserved[8]; + + void set(unsigned long long lsn); + unsigned long long get(); + }; + +#pragma pack() + + } + +} diff --git a/src/mongo/db/dur_journalimpl.h b/src/mongo/db/dur_journalimpl.h new file mode 100644 index 00000000000..8aad70b0e5c --- /dev/null +++ b/src/mongo/db/dur_journalimpl.h @@ -0,0 +1,103 @@ +// @file dur_journal.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "dur_journalformat.h" +#include "../util/logfile.h" + +namespace mongo { + namespace dur { + + /** the writeahead journal for durability */ + class Journal { + public: + string dir; // set by journalMakeDir() during initialization + + Journal(); + + /** call during startup by journalMakeDir() */ + void init(); + + /** check if time to rotate files. assure a file is open. + done separately from the journal() call as we can do this part + outside of lock. + thread: durThread() + */ + void rotate(); + + /** append to the journal file + */ + void journal(const JSectHeader& h, const AlignedBuilder& b); + + boost::filesystem::path getFilePathFor(int filenumber) const; + + unsigned long long lastFlushTime() const { return _lastFlushTime; } + void cleanup(bool log); // closes and removes journal files + + unsigned long long curFileId() const { return _curFileId; } + + void assureLogFileOpen() { + SimpleMutex::scoped_lock lk(_curLogFileMutex); + if( _curLogFile == 0 ) + _open(); + } + + /** open a journal file to journal operations to. */ + void open(); + + private: + /** check if time to rotate files. assure a file is open. 
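+ * (rotation triggers once _written for the current file exceeds DataLimitPerJournalFile)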
+ * internally called with every commit + */ + void _rotate(); + + void _open(); + void closeCurrentJournalFile(); + void removeUnneededJournalFiles(); + + unsigned long long _written; // bytes written so far to the current journal (log) file + unsigned _nextFileNumber; + public: + SimpleMutex _curLogFileMutex; + bool _ageOut; + private: + + LogFile *_curLogFile; // use _curLogFileMutex + unsigned long long _curFileId; // current file id see JHeader::fileId + + struct JFile { + string filename; + unsigned long long lastEventTimeMs; + }; + + // files which have been closed but not unlinked (rotated out) yet + // ordered oldest to newest + list<JFile> _oldJournalFiles; // use _curLogFileMutex + + // lsn related + static void preFlush(); + static void postFlush(); + unsigned long long _preFlushTime; + unsigned long long _lastFlushTime; // data < this time is fsynced in the datafiles (unless hard drive controller is caching) + bool _writeToLSNNeeded; + void updateLSNFile(); + }; + + } +} diff --git a/src/mongo/db/dur_preplogbuffer.cpp b/src/mongo/db/dur_preplogbuffer.cpp new file mode 100644 index 00000000000..10b63c0e549 --- /dev/null +++ b/src/mongo/db/dur_preplogbuffer.cpp @@ -0,0 +1,177 @@ +// @file dur_preplogbuffer.cpp + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* + PREPLOGBUFFER + we will build an output buffer ourself and then use O_DIRECT + we could be in read lock for this + for very large objects write directly to redo log in situ? + @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc +*/ + +#include "pch.h" +#include "cmdline.h" +#include "dur.h" +#include "dur_journal.h" +#include "dur_journalimpl.h" +#include "dur_commitjob.h" +#include "../util/mongoutils/hash.h" +#include "../util/mongoutils/str.h" +#include "../util/alignedbuilder.h" +#include "../util/timer.h" +#include "dur_stats.h" +#include "../server.h" + +using namespace mongoutils; + +namespace mongo { + namespace dur { + + extern Journal j; + + RelativePath local = RelativePath::fromRelativePath("local"); + + static MongoMMF* findMMF_inlock(void *ptr, size_t &ofs) { + MongoMMF *f = privateViews.find_inlock(ptr, ofs); + if( f == 0 ) { + error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl; + printStackTrace(); // we want a stack trace and the assert below didn't print a trace once in the real world - not sure why + stringstream ss; + ss << "view pointer cannot be resolved " << hex << (size_t) ptr; + journalingFailure(ss.str().c_str()); // asserts, which then abends + } + return f; + } + + /** put the basic write operation into the buffer (bb) to be journaled */ + static void prepBasicWrite_inlock(AlignedBuilder&bb, const WriteIntent *i, RelativePath& lastDbPath) { + size_t ofs = 1; + MongoMMF *mmf = findMMF_inlock(i->start(), /*out*/ofs); + + if( unlikely(!mmf->willNeedRemap()) ) { + // tag this mmf as needed a remap of its private view later. 
+ // usually it will already be dirty/already set, so we do the if above first
+ // to avoid possibility of cpu cache line contention
+ mmf->willNeedRemap() = true;
+ }
+
+ // since we have already looked up the mmf, we go ahead and remember the write view location
+ // so we don't have to find the MongoMMF again later in WRITETODATAFILES()
+ //
+ // this was for WRITETODATAFILES_Impl2 so commented out now
+ //
+ /*
+ dassert( i->w_ptr == 0 );
+ i->w_ptr = ((char*)mmf->view_write()) + ofs;
+ */
+
+ JEntry e;
+ e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past end of file
+ assert( ofs <= 0x80000000 );
+ e.ofs = (unsigned) ofs;
+ e.setFileNo( mmf->fileSuffixNo() );
+ if( mmf->relativePath() == local ) {
+ e.setLocalDbContextBit();
+ }
+ else if( mmf->relativePath() != lastDbPath ) {
+ lastDbPath = mmf->relativePath();
+ JDbContext c;
+ bb.appendStruct(c);
+ bb.appendStr(lastDbPath.toString());
+ }
+ bb.appendStruct(e);
+#if defined(_EXPERIMENTAL)
+ i->ofsInJournalBuffer = bb.len();
+#endif
+ bb.appendBuf(i->start(), e.len);
+
+ if (unlikely(e.len != (unsigned)i->length())) {
+ log() << "journal info splitting prepBasicWrite at boundary" << endl;
+
+ // This only happens if we write to the last byte in a file and
+ // the first byte in another file that is mapped adjacently. I
+ // think most OSs leave at least a one page gap between
+ // mappings, but better to be safe.
+
+ WriteIntent next ((char*)i->start() + e.len, i->length() - e.len);
+ prepBasicWrite_inlock(bb, &next, lastDbPath);
+ }
+ }
+
+ /** basic write ops / write intents. note there is no particular order to these: if we have
+ two writes to the same location during the group commit interval, it is likely
+ (although not assured) that it is journaled here once.
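+ (duplicate intents are mostly filtered out earlier by Writes::_alreadyNoted;
+ a false negative there just means a region is journaled twice, which is harmless)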
+ */ + static void prepBasicWrites(AlignedBuilder& bb) { + scoped_lock lk(privateViews._mutex()); + + // each time events switch to a different database we journal a JDbContext + RelativePath lastDbPath; + + for( set<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) { + prepBasicWrite_inlock(bb, &(*i), lastDbPath); + } + } + + static void resetLogBuffer(/*out*/JSectHeader& h, AlignedBuilder& bb) { + bb.reset(); + + h.setSectionLen(0xffffffff); // total length, will fill in later + h.seqNumber = getLastDataFileFlushTime(); + h.fileId = j.curFileId(); + } + + /** we will build an output buffer ourself and then use O_DIRECT + we could be in read lock for this + caller handles locking + @return partially populated sectheader and _ab set + */ + static void _PREPLOGBUFFER(JSectHeader& h) { + assert( cmdLine.dur ); + + { + // now that we are locked, fully drain deferred notes of write intents + DEV d.dbMutex.assertAtLeastReadLocked(); + Writes& writes = commitJob.wi(); + writes._deferred.invoke(); + writes._drained = true; + } + + AlignedBuilder& bb = commitJob._ab; + resetLogBuffer(h, bb); // adds JSectHeader + + // ops other than basic writes (DurOp's) + { + for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) { + (*i)->serialize(bb); + } + } + + prepBasicWrites(bb); + + return; + } + void PREPLOGBUFFER(/*out*/ JSectHeader& h) { + Timer t; + j.assureLogFileOpen(); // so fileId is set + _PREPLOGBUFFER(h); + stats.curr->_prepLogBufferMicros += t.micros(); + } + + } +} diff --git a/src/mongo/db/dur_recover.cpp b/src/mongo/db/dur_recover.cpp new file mode 100644 index 00000000000..a0a8843572c --- /dev/null +++ b/src/mongo/db/dur_recover.cpp @@ -0,0 +1,544 @@ +// @file dur_recover.cpp crash recovery via the journal + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" + +#include "dur.h" +#include "dur_stats.h" +#include "dur_recover.h" +#include "dur_journal.h" +#include "dur_journalformat.h" +#include "durop.h" +#include "namespace.h" +#include "../util/mongoutils/str.h" +#include "../util/bufreader.h" +#include "../util/concurrency/race.h" +#include "pdfile.h" +#include "database.h" +#include "db.h" +#include "../util/unittest.h" +#include "../util/checksum.h" +#include "cmdline.h" +#include "curop.h" +#include "mongommf.h" +#include "../util/compress.h" + +#include <sys/stat.h> +#include <fcntl.h> + +using namespace mongoutils; + +namespace mongo { + + namespace dur { + + struct ParsedJournalEntry { /*copyable*/ + ParsedJournalEntry() : e(0) { } + + // relative path of database for the operation. 
+ // might be a pointer into the mmapped journal file
+ const char *dbName;
+
+ // these are pointers into the memory mapped journal file
+ const JEntry *e; // local db sentinel is already parsed out here into dbName
+
+ // if not one of the two simple JEntry's above, this is the operation:
+ shared_ptr<DurOp> op;
+ };
+
+ void removeJournalFiles();
+ path getJournalDir();
+
+ /** get journal filenames, in order. throws if unexpected content found */
+ static void getFiles(path dir, vector<path>& files) {
+ map<unsigned,path> m;
+ for ( boost::filesystem::directory_iterator i( dir );
+ i != boost::filesystem::directory_iterator();
+ ++i ) {
+ boost::filesystem::path filepath = *i;
+ string fileName = boost::filesystem::path(*i).leaf();
+ if( str::startsWith(fileName, "j._") ) {
+ unsigned u = str::toUnsigned( str::after(fileName, '_') );
+ if( m.count(u) ) {
+ uasserted(13531, str::stream() << "unexpected files in journal directory " << dir.string() << " : " << fileName);
+ }
+ m.insert( pair<unsigned,path>(u,filepath) );
+ }
+ }
+ for( map<unsigned,path>::iterator i = m.begin(); i != m.end(); ++i ) {
+ if( i != m.begin() && m.count(i->first - 1) == 0 ) {
+ uasserted(13532,
+ str::stream() << "unexpected file in journal directory " << dir.string()
+ << " : " << boost::filesystem::path(i->second).leaf() << " : can't find its preceding file");
+ }
+ files.push_back(i->second);
+ }
+ }
+
+ /** read through the memory mapped data of a journal file (journal/j._<n> file)
+ throws
+ */
+ class JournalSectionIterator : boost::noncopyable {
+ auto_ptr<BufReader> _entries;
+ const JSectHeader _h;
+ const char *_lastDbName; // pointer into the mmapped journal file
+ const bool _doDurOps;
+ string _uncompressed;
+ public:
+ JournalSectionIterator(const JSectHeader& h, const void *compressed, unsigned compressedLen, bool doDurOpsRecovering) :
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(doDurOpsRecovering)
+ {
+ assert( doDurOpsRecovering );
+ bool ok = uncompress((const char *)compressed, compressedLen, &_uncompressed);
+ if( !ok ) {
+ // it should always be ok (i think?) as there is a previous check to see that the JSectFooter is ok
+ log() << "couldn't uncompress journal section" << endl;
+ msgasserted(15874, "couldn't uncompress journal section");
+ }
+ const char *p = _uncompressed.c_str();
+ assert( compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader) );
+ _entries = auto_ptr<BufReader>( new BufReader(p, _uncompressed.size()) );
+ }
+
+ // we work with the uncompressed buffer when doing a WRITETODATAFILES (for speed)
+ JournalSectionIterator(const JSectHeader &h, const void *p, unsigned len) :
+ _entries( new BufReader((const char *) p, len) ),
+ _h(h),
+ _lastDbName(0)
+ , _doDurOps(false)
+
+ { }
+
+ bool atEof() const { return _entries->atEof(); }
+
+ unsigned long long seqNumber() const { return _h.seqNumber; }
+
+ /** get the next entry from the log. this function parses and combines JDbContext and JEntry's.
+ * throws on premature end of section.
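+ * framing: the first word read is either a length (< OpCode_Min), meaning a basic
+ * JEntry follows, or an opcode (footer / file-created / drop-db / db-context).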
+ */ + void next(ParsedJournalEntry& e) { + unsigned lenOrOpCode; + _entries->read(lenOrOpCode); + + if (lenOrOpCode > JEntry::OpCode_Min) { + switch( lenOrOpCode ) { + + case JEntry::OpCode_Footer: { + assert( false ); + } + + case JEntry::OpCode_FileCreated: + case JEntry::OpCode_DropDb: { + e.dbName = 0; + boost::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries); + if (_doDurOps) { + e.op = op; + } + return; + } + + case JEntry::OpCode_DbContext: { + _lastDbName = (const char*) _entries->pos(); + const unsigned limit = std::min((unsigned)Namespace::MaxNsLen, _entries->remaining()); + const unsigned len = strnlen(_lastDbName, limit); + massert(13533, "problem processing journal file during recovery", _lastDbName[len] == '\0'); + _entries->skip(len+1); // skip '\0' too + _entries->read(lenOrOpCode); // read this for the fall through + } + // fall through as a basic operation always follows jdbcontext, and we don't have anything to return yet + + default: + // fall through + ; + } + } + + // JEntry - a basic write + assert( lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min ); + _entries->rewind(4); + e.e = (JEntry *) _entries->skip(sizeof(JEntry)); + e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName; + assert( e.e->len == lenOrOpCode ); + _entries->skip(e.e->len); + } + + }; + + static string fileName(const char* dbName, int fileNo) { + stringstream ss; + ss << dbName << '.'; + assert( fileNo >= 0 ); + if( fileNo == JEntry::DotNsSuffix ) + ss << "ns"; + else + ss << fileNo; + + // relative name -> full path name + path full(dbpath); + full /= ss.str(); + return full.string(); + } + + RecoveryJob::~RecoveryJob() { + DESTRUCTOR_GUARD( + if( !_mmfs.empty() ) + close(); + ) + } + + void RecoveryJob::close() { + scoped_lock lk(_mx); + _close(); + } + + void RecoveryJob::_close() { + MongoFile::flushAll(true); + _mmfs.clear(); + } + + void RecoveryJob::write(const ParsedJournalEntry& entry) { + //TODO(mathias): look into making some of these dasserts + assert(entry.e); + assert(entry.dbName); + assert(strnlen(entry.dbName, MaxDatabaseNameLen) < MaxDatabaseNameLen); + + const string fn = fileName(entry.dbName, entry.e->getFileNo()); + MongoFile* file; + { + MongoFileFinder finder; // must release lock before creating new MongoMMF + file = finder.findByPath(fn); + } + + MongoMMF* mmf; + if (file) { + assert(file->isMongoMMF()); + mmf = (MongoMMF*)file; + } + else { + if( !_recovering ) { + log() << "journal error applying writes, file " << fn << " is not open" << endl; + assert(false); + } + boost::shared_ptr<MongoMMF> sp (new MongoMMF); + assert(sp->open(fn, false)); + _mmfs.push_back(sp); + mmf = sp.get(); + } + + if ((entry.e->ofs + entry.e->len) <= mmf->length()) { + assert(mmf->view_write()); + assert(entry.e->srcData()); + + void* dest = (char*)mmf->view_write() + entry.e->ofs; + memcpy(dest, entry.e->srcData(), entry.e->len); + stats.curr->_writeToDataFilesBytes += entry.e->len; + } + else { + massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering); + } + } + + void RecoveryJob::applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump) { + if( entry.e ) { + if( dump ) { + stringstream ss; + ss << " BASICWRITE " << setw(20) << entry.dbName << '.'; + if( entry.e->isNsSuffix() ) + ss << "ns"; + else + ss << setw(2) << entry.e->getFileNo(); + ss << ' ' << setw(6) << entry.e->len << ' ' << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/ + " " << hexdump(entry.e->srcData(), entry.e->len); + log() << ss.str() << endl; + } + if( apply ) { + 
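+ // a basic write: copy the journaled bytes into the datafile via its writable mapping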
write(entry); + } + } + else if(entry.op) { + // a DurOp subclass operation + if( dump ) { + log() << " OP " << entry.op->toString() << endl; + } + if( apply ) { + if( entry.op->needFilesClosed() ) { + _close(); // locked in processSection + } + entry.op->replay(); + } + } + } + + void RecoveryJob::applyEntries(const vector<ParsedJournalEntry> &entries) { + bool apply = (cmdLine.durOptions & CmdLine::DurScanOnly) == 0; + bool dump = cmdLine.durOptions & CmdLine::DurDumpJournal; + if( dump ) + log() << "BEGIN section" << endl; + + for( vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i ) { + applyEntry(*i, apply, dump); + } + + if( dump ) + log() << "END section" << endl; + } + + void RecoveryJob::processSection(const JSectHeader *h, const void *p, unsigned len, const JSectFooter *f) { + scoped_lock lk(_mx); + RACECHECK + + /** todo: we should really verify the checksum to see that seqNumber is ok? + that is expensive maybe there is some sort of checksum of just the header + within the header itself + */ + if( _recovering && _lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs ) { + if( h->seqNumber != _lastSeqMentionedInConsoleLog ) { + static int n; + if( ++n < 10 ) { + log() << "recover skipping application of section seq:" << h->seqNumber << " < lsn:" << _lastDataSyncedFromLastRun << endl; + } + else if( n == 10 ) { + log() << "recover skipping application of section more..." << endl; + } + _lastSeqMentionedInConsoleLog = h->seqNumber; + } + return; + } + + auto_ptr<JournalSectionIterator> i; + if( _recovering ) { + i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering)); + } + else { + i = auto_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, /*after header*/p, /*w/out header*/len)); + } + + // we use a static so that we don't have to reallocate every time through. occasionally we + // go back to a small allocation so that if there were a spiky growth it won't stick forever. + static vector<ParsedJournalEntry> entries; + entries.clear(); +/** TEMP uncomment + RARELY OCCASIONALLY { + if( entries.capacity() > 2048 ) { + entries.shrink_to_fit(); + entries.reserve(2048); + } + } +*/ + + // first read all entries to make sure this section is valid + ParsedJournalEntry e; + while( !i->atEof() ) { + i->next(e); + entries.push_back(e); + } + + // after the entries check the footer checksum + if( _recovering ) { + assert( ((const char *)h) + sizeof(JSectHeader) == p ); + if( !f->checkHash(h, len + sizeof(JSectHeader)) ) { + msgasserted(13594, "journal checksum doesn't match"); + } + } + + // got all the entries for one group commit. apply them: + applyEntries(entries); + } + + /** apply a specific journal file, that is already mmap'd + @param p start of the memory mapped file + @return true if this is detected to be the last file (ends abruptly) + */ + bool RecoveryJob::processFileBuffer(const void *p, unsigned len) { + try { + unsigned long long fileId; + BufReader br(p,len); + + { + // read file header + JHeader h; + br.read(h); + + /* [dm] not automatically handled. we should eventually handle this automatically. i think: + (1) if this is the final journal file + (2) and the file size is just the file header in length (or less) -- this is a bit tricky to determine if prealloced + then can just assume recovery ended cleanly and not error out (still should log). + */ + uassert(13537, + "journal file header invalid. 
This could indicate corruption in a journal file, or perhaps a crash where sectors in file header were in flight written out of order at time of crash (unlikely but possible).", + h.valid()); + + if( !h.versionOk() ) { + log() << "journal file version number mismatch got:" << hex << h._version + << " expected:" << hex << (unsigned) JHeader::CurrentVersion + << ". if you have just upgraded, recover with old version of mongod, terminate cleanly, then upgrade." + << endl; + uasserted(13536, str::stream() << "journal version number mismatch " << h._version); + } + fileId = h.fileId; + if(cmdLine.durOptions & CmdLine::DurDumpJournal) { + log() << "JHeader::fileId=" << fileId << endl; + } + } + + // read sections + while ( !br.atEof() ) { + JSectHeader h; + br.peek(h); + if( h.fileId != fileId ) { + if( debug || (cmdLine.durOptions & CmdLine::DurDumpJournal) ) { + log() << "Ending processFileBuffer at differing fileId want:" << fileId << " got:" << h.fileId << endl; + log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl; + } + return true; + } + unsigned slen = h.sectionLen(); + unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter); + const char *hdr = (const char *) br.skip(h.sectionLenWithPadding()); + const char *data = hdr + sizeof(JSectHeader); + const char *footer = data + dataLen; + processSection((const JSectHeader*) hdr, data, dataLen, (const JSectFooter*) footer); + + // ctrl c check + killCurrentOp.checkForInterrupt(false); + } + } + catch( BufReader::eof& ) { + if( cmdLine.durOptions & CmdLine::DurDumpJournal ) + log() << "ABRUPT END" << endl; + return true; // abrupt end + } + + return false; // non-abrupt end + } + + /** apply a specific journal file */ + bool RecoveryJob::processFile(path journalfile) { + log() << "recover " << journalfile.string() << endl; + + try { + if( boost::filesystem::file_size( journalfile.string() ) == 0 ) { + log() << "recover info " << journalfile.string() << " has zero length" << endl; + return true; + } + } catch(...) 
{ + // if something weird like a permissions problem keep going so the massert down below can happen (presumably) + log() << "recover exception checking filesize" << endl; + } + + MemoryMappedFile f; + void *p = f.mapWithOptions(journalfile.string().c_str(), MongoFile::READONLY | MongoFile::SEQUENTIAL); + massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p); + return processFileBuffer(p, (unsigned) f.length()); + } + + /** @param files all the j._0 style files we need to apply for recovery */ + void RecoveryJob::go(vector<path>& files) { + log() << "recover begin" << endl; + _recovering = true; + + // load the last sequence number synced to the datafiles on disk before the last crash + _lastDataSyncedFromLastRun = journalReadLSN(); + log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl; + + for( unsigned i = 0; i != files.size(); ++i ) { + bool abruptEnd = processFile(files[i]); + if( abruptEnd && i+1 < files.size() ) { + log() << "recover error: abrupt end to file " << files[i].string() << ", yet it isn't the last journal file" << endl; + close(); + uasserted(13535, "recover abrupt journal file end"); + } + } + + close(); + + if( cmdLine.durOptions & CmdLine::DurScanOnly ) { + uasserted(13545, str::stream() << "--durOptions " << (int) CmdLine::DurScanOnly << " (scan only) specified"); + } + + log() << "recover cleaning up" << endl; + removeJournalFiles(); + log() << "recover done" << endl; + okToCleanUp = true; + _recovering = false; + } + + void _recover() { + assert( cmdLine.dur ); + + boost::filesystem::path p = getJournalDir(); + if( !exists(p) ) { + log() << "directory " << p.string() << " does not exist, there will be no recovery startup step" << endl; + okToCleanUp = true; + return; + } + + vector<path> journalFiles; + getFiles(p, journalFiles); + + if( journalFiles.empty() ) { + log() << "recover : no journal files present, no recovery needed" << endl; + okToCleanUp = true; + return; + } + + RecoveryJob::get().go(journalFiles); + } + + extern mutex groupCommitMutex; + + /** recover from a crash + called during startup + throws on error + */ + void recover() { + // we use a lock so that exitCleanly will wait for us + // to finish (or at least to notice what is up and stop) + writelock lk; + + // this is so the mutexdebugger doesn't get confused. we are actually single threaded + // at this point in the program so it wouldn't have been a true problem (I think) + scoped_lock lk2(groupCommitMutex); + + _recover(); // throws on interruption + } + + struct BufReaderY { int a,b; }; + class BufReaderUnitTest : public UnitTest { + public: + void run() { + BufReader r((void*) "abcdabcdabcd", 12); + char x; + BufReaderY y; + r.read(x); //cout << x; // a + assert( x == 'a' ); + r.read(y); + r.read(x); + assert( x == 'b' ); + } + } brunittest; + + // can't free at termination because order of destruction of global vars is arbitrary + RecoveryJob &RecoveryJob::_instance = *(new RecoveryJob()); + + } // namespace dur + +} // namespace mongo + diff --git a/src/mongo/db/dur_recover.h b/src/mongo/db/dur_recover.h new file mode 100644 index 00000000000..955e730ea05 --- /dev/null +++ b/src/mongo/db/dur_recover.h @@ -0,0 +1,50 @@ +// @file dur.h durability support + +#pragma once + +#include "dur_journalformat.h" +#include "../util/concurrency/mutex.h" +#include "../util/file.h" + +namespace mongo { + class MongoMMF; + + namespace dur { + struct ParsedJournalEntry; + + /** call go() to execute a recovery from existing journal files. 
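+ startup path: recover() takes the write lock, then _recover() collects journal/j._<n>
+ files via getFiles() and hands them to go(), which replays each file in order and
+ finally removes the journal files.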
+ */ + class RecoveryJob : boost::noncopyable { + public: + RecoveryJob() : _lastDataSyncedFromLastRun(0), + _mx("recovery"), _recovering(false) { _lastSeqMentionedInConsoleLog = 1; } + void go(vector<path>& files); + ~RecoveryJob(); + + /** @param data data between header and footer. compressed if recovering. */ + void processSection(const JSectHeader *h, const void *data, unsigned len, const JSectFooter *f); + + void close(); // locks and calls _close() + + static RecoveryJob & get() { return _instance; } + private: + void write(const ParsedJournalEntry& entry); // actually writes to the file + void applyEntry(const ParsedJournalEntry& entry, bool apply, bool dump); + void applyEntries(const vector<ParsedJournalEntry> &entries); + bool processFileBuffer(const void *, unsigned len); + bool processFile(path journalfile); + void _close(); // doesn't lock + + list<boost::shared_ptr<MongoMMF> > _mmfs; + + unsigned long long _lastDataSyncedFromLastRun; + unsigned long long _lastSeqMentionedInConsoleLog; + public: + mongo::mutex _mx; // protects _mmfs; see setNoJournal() too + private: + bool _recovering; // are we in recovery or WRITETODATAFILES + + static RecoveryJob &_instance; + }; + } +} diff --git a/src/mongo/db/dur_stats.h b/src/mongo/db/dur_stats.h new file mode 100644 index 00000000000..50a26d1f215 --- /dev/null +++ b/src/mongo/db/dur_stats.h @@ -0,0 +1,49 @@ +// @file dur_stats.h + +namespace mongo { + namespace dur { + + /** journaling stats. the model here is that the commit thread is the only writer, and that reads are + uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter overhead. + */ + struct Stats { + Stats(); + void rotate(); + BSONObj asObj(); + unsigned _intervalMicros; + struct S { + BSONObj _asObj(); + string _asCSV(); + string _CSVHeader(); + void reset(); + + unsigned _commits; + unsigned _earlyCommits; // count of early commits from commitIfNeeded() or from getDur().commitNow() + unsigned long long _journaledBytes; + unsigned long long _uncompressedBytes; + unsigned long long _writeToDataFilesBytes; + + unsigned long long _prepLogBufferMicros; + unsigned long long _writeToJournalMicros; + unsigned long long _writeToDataFilesMicros; + unsigned long long _remapPrivateViewMicros; + + // undesirable to be in write lock for the group commit (it can be done in a read lock), so good if we + // have visibility when this happens. can happen for a couple reasons + // - read lock starvation + // - file being closed + // - data being written faster than the normal group commit interval + unsigned _commitsInWriteLock; + + unsigned _dtMillis; + }; + S *curr; + private: + S _a,_b; + unsigned long long _lastRotate; + S* other(); + }; + extern Stats stats; + + } +} diff --git a/src/mongo/db/dur_writetodatafiles.cpp b/src/mongo/db/dur_writetodatafiles.cpp new file mode 100644 index 00000000000..d77b0482c20 --- /dev/null +++ b/src/mongo/db/dur_writetodatafiles.cpp @@ -0,0 +1,94 @@ +// @file dur_writetodatafiles.cpp apply the writes back to the non-private MMF after they are for certain in redo log + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "dur_commitjob.h" +#include "dur_stats.h" +#include "dur_recover.h" +#include "../util/timer.h" + +namespace mongo { + namespace dur { + + void debugValidateAllMapsMatch(); + + static void WRITETODATAFILES_Impl1(const JSectHeader& h, AlignedBuilder& uncompressed) { + LockMongoFilesShared lk; + LOG(3) << "journal WRITETODATAFILES 1" << endl; + RecoveryJob::get().processSection(&h, uncompressed.buf(), uncompressed.len(), 0); + LOG(3) << "journal WRITETODATAFILES 2" << endl; + } + +#if 0 + // the old implementation. doesn't work with groupCommitWithLimitedLocks() + void WRITETODATAFILES_Impl2() { + /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */ + for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) { + const WriteIntent& intent = *it; + stats.curr->_writeToDataFilesBytes += intent.length(); + dassert(intent.w_ptr); + memcpy(intent.w_ptr, intent.start(), intent.length()); + } + } +#endif + +#if defined(_EXPERIMENTAL) + // doesn't work with groupCommitWithLimitedLocks() + void WRITETODATAFILES_Impl3() { + /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */ + for( set<WriteIntent>::const_iterator it(commitJob.writes().begin()), end(commitJob.writes().end()); it != end; ++it ) { + const WriteIntent& intent = *it; + stats.curr->_writeToDataFilesBytes += intent.length(); + dassert(intent.w_ptr); + memcpy(intent.w_ptr, + commitJob._ab.atOfs(intent.ofsInJournalBuffer), + intent.length()); + } + } +#endif + + /** apply the writes back to the non-private MMF after they are for certain in redo log + + (1) todo we don't need to write back everything every group commit. we MUST write back + that which is going to be a remapped on its private view - but that might not be all + views. + + (2) todo should we do this using N threads? would be quite easy + see Hackenberg paper table 5 and 6. 2 threads might be a good balance. + + (3) with enough work, we could do this outside the read lock. it's a bit tricky though. + - we couldn't do it from the private views then as they may be changing. would have to then + be from the journal alignedbuffer. + - we need to be careful the file isn't unmapped on us -- perhaps a mutex or something + with MongoMMF on closes or something to coordinate that. + + concurrency: in mmmutex, not necessarily in dbMutex + + @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc&hl=en + */ + + void WRITETODATAFILES(const JSectHeader& h, AlignedBuilder& uncompressed) { + Timer t; + WRITETODATAFILES_Impl1(h, uncompressed); + unsigned long long m = t.micros(); + stats.curr->_writeToDataFilesMicros += m; + LOG(2) << "journal WRITETODATAFILES " << m / 1000.0 << "ms" << endl; + } + + } +} diff --git a/src/mongo/db/durop.cpp b/src/mongo/db/durop.cpp new file mode 100644 index 00000000000..80ee5043410 --- /dev/null +++ b/src/mongo/db/durop.cpp @@ -0,0 +1,161 @@ +// @file durop.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "concurrency.h" +#include "../util/alignedbuilder.h" +#include "../util/mongoutils/str.h" +#include "../util/file.h" +#include "mongommf.h" +#include "durop.h" +#include "../util/file_allocator.h" + +using namespace mongoutils; + +namespace mongo { + + extern string dbpath; // --dbpath parm + + void _deleteDataFiles(const char *); + + namespace dur { + + /** read a durop from journal file referenced by br. + @param opcode the opcode which has already been written from the bufreader + */ + shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) { + shared_ptr<DurOp> op; + switch( opcode ) { + case JEntry::OpCode_FileCreated: + op = shared_ptr<DurOp>( new FileCreatedOp(br) ); + break; + case JEntry::OpCode_DropDb: + op = shared_ptr<DurOp>( new DropDbOp(br) ); + break; + default: + massert(13546, (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), false); + } + return op; + } + + void DurOp::serialize(AlignedBuilder& ab) { + ab.appendNum(_opcode); + _serialize(ab); + } + + DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) { + unsigned long long reserved; + log.read(reserved); + log.read(reserved); + log.readStr(_db); + string reservedStr; + log.readStr(reservedStr); + } + + void DropDbOp::_serialize(AlignedBuilder& ab) { + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendStr(_db); + ab.appendStr(""); // reserved + } + + /** throws */ + void DropDbOp::replay() { + log() << "recover replay drop db " << _db << endl; + _deleteDataFiles(_db.c_str()); + } + + FileCreatedOp::FileCreatedOp(string f, unsigned long long l) : + DurOp(JEntry::OpCode_FileCreated) { + _p = RelativePath::fromFullPath(f); + _len = l; + } + + FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) { + unsigned long long reserved; + log.read(reserved); + log.read(reserved); + log.read(_len); // size of file, not length of name + string s; + log.readStr(s); + _p._p = s; + } + + void FileCreatedOp::_serialize(AlignedBuilder& ab) { + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendNum((unsigned long long) 0); // reserved for future use + ab.appendNum(_len); + ab.appendStr(_p.toString()); + } + + string FileCreatedOp::toString() { + return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len/1024.0/1024.0 << "MB"; + } + + // if an operation deletes or creates a file (or moves etc.), it may need files closed. + bool FileCreatedOp::needFilesClosed() { + return exists( _p.asFullPath() ); + } + + void FileCreatedOp::replay() { + // i believe the code assumes new files are filled with zeros. thus we have to recreate the file, + // or rewrite at least, even if it were the right length. perhaps one day we should change that + // although easier to avoid defects if we assume it is zeros perhaps. 
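// In outline, the code below: deletes any existing copy of the file, recreates
// it, writes zeros in 64KB blocks until _len bytes are on disk, fsyncs, and
// then flushes the parent directory so the new directory entry itself is
// durable before recovery proceeds.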
+ string full = _p.asFullPath(); + if( exists(full) ) { + try { + remove(full); + } + catch(std::exception& e) { + log(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl; + } + } + + log() << "recover create file " << full << ' ' << _len/1024.0/1024.0 << "MB" << endl; + if( MemoryMappedFile::exists(full) ) { + // first delete if exists. + try { + remove(full); + } + catch(...) { + log() << "warning could not delete file " << full << endl; + } + } + ensureParentDirCreated(full); + File f; + f.open(full.c_str()); + massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open()); + unsigned long long left = _len; + const unsigned blksz = 64 * 1024; + scoped_array<char> v( new char[blksz] ); + memset( v.get(), 0, blksz ); + fileofs ofs = 0; + while( left ) { + unsigned long long w = left < blksz ? left : blksz; + f.write(ofs, v.get(), (unsigned) w); + left -= w; + ofs += w; + } + f.fsync(); + flushMyDirectory(full); + massert(13628, str::stream() << "recover failure writing file " << full, !f.bad() ); + } + + } + +} diff --git a/src/mongo/db/durop.h b/src/mongo/db/durop.h new file mode 100644 index 00000000000..9ab1bfcbede --- /dev/null +++ b/src/mongo/db/durop.h @@ -0,0 +1,109 @@ +// @file durop.h class DurOp and descendants + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "dur_journalformat.h" +#include "../util/bufreader.h" +#include "../util/paths.h" + +namespace mongo { + + class AlignedBuilder; + + namespace dur { + + /** DurOp - Operations we journal that aren't just basic writes. + * + * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct dur::WriteIntent. + * We don't make WriteIntent inherit from DurOp to keep it as lean as possible as there will be millions of + * them (we don't want a vtable for example there). + * + * For each op we want to journal, we define a subclass. + */ + class DurOp { /* copyable */ + public: + // @param opcode a sentinel value near max unsigned which uniquely identifies the operation. + // @see dur::JEntry + DurOp(unsigned opcode) : _opcode(opcode) { } + + virtual ~DurOp() { } + + /** serialize the op out to a builder which will then be written (presumably) to the journal */ + void serialize(AlignedBuilder& ab); + + /** read a durop from journal file referenced by br. + @param opcode the opcode which has already been written from the bufreader + */ + static shared_ptr<DurOp> read(unsigned opcode, BufReader& br); + + /** replay the operation (during recovery) + throws + + For now, these are not replayed during the normal WRITETODATAFILES phase, since these + operations are handled in other parts of the code. At some point this may change. + */ + virtual void replay() = 0; + + virtual string toString() = 0; + + /** if the op requires all file to be closed before doing its work, returns true. 
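(In this file: FileCreatedOp reports true only when the file already exists
 on disk, since it must be unmapped before being deleted and recreated;
 DropDbOp always reports true.)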
*/ + virtual bool needFilesClosed() { return false; } + + protected: + /** DurOp will have already written the opcode for you */ + virtual void _serialize(AlignedBuilder& ab) = 0; + + private: + const unsigned _opcode; + }; + + /** indicates creation of a new file */ + class FileCreatedOp : public DurOp { + public: + FileCreatedOp(BufReader& log); + /** param f filename to create with path */ + FileCreatedOp(string f, unsigned long long l); + virtual void replay(); + virtual string toString(); + virtual bool needFilesClosed(); + protected: + virtual void _serialize(AlignedBuilder& ab); + private: + RelativePath _p; + unsigned long long _len; // size of file, not length of name + }; + + /** record drop of a database */ + class DropDbOp : public DurOp { + public: + DropDbOp(BufReader& log); + DropDbOp(string db) : + DurOp(JEntry::OpCode_DropDb), _db(db) { } + virtual void replay(); + virtual string toString() { return string("DropDbOp ") + _db; } + virtual bool needFilesClosed() { return true; } + protected: + virtual void _serialize(AlignedBuilder& ab); + private: + string _db; + }; + + } + +} diff --git a/src/mongo/db/extsort.cpp b/src/mongo/db/extsort.cpp new file mode 100644 index 00000000000..06a9756cc0a --- /dev/null +++ b/src/mongo/db/extsort.cpp @@ -0,0 +1,245 @@ +// extsort.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" + +#include "extsort.h" +#include "namespace-inl.h" +#include "../util/file.h" +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +namespace mongo { + + IndexInterface *BSONObjExternalSorter::extSortIdxInterface; + Ordering BSONObjExternalSorter::extSortOrder( Ordering::make(BSONObj()) ); + unsigned long long BSONObjExternalSorter::_compares = 0; + + BSONObjExternalSorter::BSONObjExternalSorter( IndexInterface &i, const BSONObj & order , long maxFileSize ) + : _idxi(i), _order( order.getOwned() ) , _maxFilesize( maxFileSize ) , + _arraySize(1000000), _cur(0), _curSizeSoFar(0), _sorted(0) { + + stringstream rootpath; + rootpath << dbpath; + if ( dbpath[dbpath.size()-1] != '/' ) + rootpath << "/"; + rootpath << "_tmp/esort." << time(0) << "." << rand() << "/"; + _root = rootpath.str(); + + log(1) << "external sort root: " << _root.string() << endl; + + create_directories( _root ); + _compares = 0; + } + + BSONObjExternalSorter::~BSONObjExternalSorter() { + if ( _cur ) { + delete _cur; + _cur = 0; + } + unsigned long removed = remove_all( _root ); + wassert( removed == 1 + _files.size() ); + } + + void BSONObjExternalSorter::_sortInMem() { + // extSortComp needs to use glbals + // qsort_r only seems available on bsd, which is what i really want to use + dblock l; + extSortIdxInterface = &_idxi; + extSortOrder = Ordering::make(_order); + _cur->sort( BSONObjExternalSorter::extSortComp ); + } + + void BSONObjExternalSorter::sort() { + uassert( 10048 , "already sorted" , ! 
_sorted ); + + _sorted = true; + + if ( _cur && _files.size() == 0 ) { + _sortInMem(); + log(1) << "\t\t not using file. size:" << _curSizeSoFar << " _compares:" << _compares << endl; + return; + } + + if ( _cur ) { + finishMap(); + } + + if ( _cur ) { + delete _cur; + _cur = 0; + } + + if ( _files.size() == 0 ) + return; + + } + + void BSONObjExternalSorter::add( const BSONObj& o , const DiskLoc & loc ) { + uassert( 10049 , "sorted already" , ! _sorted ); + + if ( ! _cur ) { + _cur = new InMemory( _arraySize ); + } + + Data& d = _cur->getNext(); + d.first = o.getOwned(); + d.second = loc; + + long size = o.objsize(); + _curSizeSoFar += size + sizeof( DiskLoc ) + sizeof( BSONObj ); + + if ( _cur->hasSpace() == false || _curSizeSoFar > _maxFilesize ) { + finishMap(); + log(1) << "finishing map" << endl; + } + + } + + void BSONObjExternalSorter::finishMap() { + uassert( 10050 , "bad" , _cur ); + + _curSizeSoFar = 0; + if ( _cur->size() == 0 ) + return; + + _sortInMem(); + + stringstream ss; + ss << _root.string() << "/file." << _files.size(); + string file = ss.str(); + + // todo: it may make sense to fadvise that this not be cached so that building the index doesn't + // eject other things the db is using from the file system cache. while we will soon be reading + // this back, if it fit in ram, there wouldn't have been a need for an external sort in the first + // place. + + ofstream out; + out.open( file.c_str() , ios_base::out | ios_base::binary ); + assertStreamGood( 10051 , (string)"couldn't open file: " + file , out ); + + int num = 0; + for ( InMemory::iterator i=_cur->begin(); i != _cur->end(); ++i ) { + Data p = *i; + out.write( p.first.objdata() , p.first.objsize() ); + out.write( (char*)(&p.second) , sizeof( DiskLoc ) ); + num++; + } + + _cur->clear(); + + _files.push_back( file ); + out.close(); + + log(2) << "Added file: " << file << " with " << num << "objects for external sort" << endl; + } + + // --------------------------------- + + BSONObjExternalSorter::Iterator::Iterator( BSONObjExternalSorter * sorter ) : + _cmp( sorter->_idxi, sorter->_order ) , _in( 0 ) { + + for ( list<string>::iterator i=sorter->_files.begin(); i!=sorter->_files.end(); i++ ) { + _files.push_back( new FileIterator( *i ) ); + _stash.push_back( pair<Data,bool>( Data( BSONObj() , DiskLoc() ) , false ) ); + } + + if ( _files.size() == 0 && sorter->_cur ) { + _in = sorter->_cur; + _it = sorter->_cur->begin(); + } + } + + BSONObjExternalSorter::Iterator::~Iterator() { + for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ ) + delete *i; + _files.clear(); + } + + bool BSONObjExternalSorter::Iterator::more() { + + if ( _in ) + return _it != _in->end(); + + for ( vector<FileIterator*>::iterator i=_files.begin(); i!=_files.end(); i++ ) + if ( (*i)->more() ) + return true; + for ( vector< pair<Data,bool> >::iterator i=_stash.begin(); i!=_stash.end(); i++ ) + if ( i->second ) + return true; + return false; + } + + BSONObjExternalSorter::Data BSONObjExternalSorter::Iterator::next() { + + if ( _in ) { + Data& d = *_it; + ++_it; + return d; + } + + Data best; + int slot = -1; + + for ( unsigned i=0; i<_stash.size(); i++ ) { + + if ( ! 
_stash[i].second ) { + if ( _files[i]->more() ) + _stash[i] = pair<Data,bool>( _files[i]->next() , true ); + else + continue; + } + + if ( slot == -1 || _cmp( best , _stash[i].first ) == 0 ) { + best = _stash[i].first; + slot = i; + } + + } + + assert( slot >= 0 ); + _stash[slot].second = false; + + return best; + } + + // ----------------------------------- + + BSONObjExternalSorter::FileIterator::FileIterator( string file ) { + unsigned long long length; + _buf = (char*)_file.map( file.c_str() , length , MemoryMappedFile::SEQUENTIAL ); + massert( 10308 , "mmap failed" , _buf ); + assert( length == (unsigned long long) file_size( file ) ); + _end = _buf + length; + } + BSONObjExternalSorter::FileIterator::~FileIterator() {} + + bool BSONObjExternalSorter::FileIterator::more() { + return _buf < _end; + } + + BSONObjExternalSorter::Data BSONObjExternalSorter::FileIterator::next() { + BSONObj o( _buf ); + _buf += o.objsize(); + DiskLoc * l = (DiskLoc*)_buf; + _buf += 8; + return Data( o , *l ); + } + +} diff --git a/src/mongo/db/extsort.h b/src/mongo/db/extsort.h new file mode 100644 index 00000000000..15a6d441849 --- /dev/null +++ b/src/mongo/db/extsort.h @@ -0,0 +1,150 @@ +// extsort.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "../pch.h" +#include "jsobj.h" +#include "namespace-inl.h" +#include "curop-inl.h" +#include "../util/array.h" + +namespace mongo { + + /** + for external (disk) sorting by BSONObj and attaching a value + */ + class BSONObjExternalSorter : boost::noncopyable { + public: + BSONObjExternalSorter( IndexInterface &i, const BSONObj & order = BSONObj() , long maxFileSize = 1024 * 1024 * 100 ); + ~BSONObjExternalSorter(); + typedef pair<BSONObj,DiskLoc> Data; + + private: + IndexInterface& _idxi; + + static int _compare(IndexInterface& i, const Data& l, const Data& r, const Ordering& order) { + RARELY killCurrentOp.checkForInterrupt(); + _compares++; + int x = i.keyCompare(l.first, r.first, order); + if ( x ) + return x; + return l.second.compare( r.second ); + } + + class MyCmp { + public: + MyCmp( IndexInterface& i, BSONObj order = BSONObj() ) : _i(i), _order( Ordering::make(order) ) {} + bool operator()( const Data &l, const Data &r ) const { + return _compare(_i, l, r, _order) < 0; + }; + private: + IndexInterface& _i; + const Ordering _order; + }; + + static IndexInterface *extSortIdxInterface; + static Ordering extSortOrder; + static int extSortComp( const void *lv, const void *rv ) { + DEV RARELY { + d.dbMutex.assertWriteLocked(); // must be as we use a global var + } + Data * l = (Data*)lv; + Data * r = (Data*)rv; + return _compare(*extSortIdxInterface, *l, *r, extSortOrder); + }; + + class FileIterator : boost::noncopyable { + public: + FileIterator( string file ); + ~FileIterator(); + bool more(); + Data next(); + private: + MemoryMappedFile _file; + char * _buf; + char * _end; + }; + + public: + + typedef FastArray<Data> InMemory; + + class Iterator : boost::noncopyable { + public: + + Iterator( BSONObjExternalSorter * sorter ); + ~Iterator(); + bool more(); + Data next(); + + private: + MyCmp _cmp; + vector<FileIterator*> _files; + vector< pair<Data,bool> > _stash; + + InMemory * _in; + InMemory::iterator _it; + + }; + + void add( const BSONObj& o , const DiskLoc & loc ); + void add( const BSONObj& o , int a , int b ) { + add( o , DiskLoc( a , b ) ); + } + + /* call after adding values, and before fetching the iterator */ + void sort(); + + auto_ptr<Iterator> iterator() { + uassert( 10052 , "not sorted" , _sorted ); + return auto_ptr<Iterator>( new Iterator( this ) ); + } + + int numFiles() { + return _files.size(); + } + + long getCurSizeSoFar() { return _curSizeSoFar; } + + void hintNumObjects( long long numObjects ) { + if ( numObjects < _arraySize ) + _arraySize = (int)(numObjects + 100); + } + + private: + + void _sortInMem(); + + void sort( string file ); + void finishMap(); + + BSONObj _order; + long _maxFilesize; + path _root; + + int _arraySize; + InMemory * _cur; + long _curSizeSoFar; + + list<string> _files; + bool _sorted; + + static unsigned long long _compares; + }; +} diff --git a/src/mongo/db/filever.h b/src/mongo/db/filever.h new file mode 100644 index 00000000000..e89a8243dcf --- /dev/null +++ b/src/mongo/db/filever.h @@ -0,0 +1,30 @@ +/* filever.h */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. 
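Stepping back to the BSONObjExternalSorter declared above, a hedged sketch of its add/sort/iterate protocol. Here idxIf, expectedCount, obj, loc, and keyFor() are hypothetical stand-ins for what an index build supplies; only methods the header declares are called.

    BSONObjExternalSorter sorter( idxIf, BSON( "a" << 1 ) /* index key order */ );
    sorter.hintNumObjects( expectedCount );          // pre-sizes the in-memory array
    // for each scanned record: extract its index key and disk location, then
    sorter.add( keyFor( obj ), loc );                // add() spills to disk via finishMap()
    sorter.sort();                                   // final sort of the in-memory tail
    auto_ptr<BSONObjExternalSorter::Iterator> it = sorter.iterator();
    while ( it->more() ) {
        BSONObjExternalSorter::Data d = it->next();  // pair<BSONObj,DiskLoc>, ascending
        // hand d.first / d.second to the btree builder
    }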
+* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +namespace mongo { + + inline void checkDataFileVersion(NamespaceDetails& d) { + } + + inline void checkIndexFileVersion(NamespaceDetails& d) { + } + +} + diff --git a/src/mongo/db/flushtest.cpp b/src/mongo/db/flushtest.cpp new file mode 100644 index 00000000000..2009d922950 --- /dev/null +++ b/src/mongo/db/flushtest.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "pch.h" +#include <stdio.h> +#include "../util/goodies.h" +#include <fcntl.h> + +namespace mongo { + +#if defined(F_FULLFSYNC) + void fullsync(int f) { + fcntl( f, F_FULLFSYNC ); + } +#else + void fullsync(int f) { + fdatasync(f); + } +#endif + + int main(int argc, char* argv[], char *envp[] ) { + cout << "hello" << endl; + + FILE *f = fopen("/data/db/temptest", "a"); + + if ( f == 0 ) { + cout << "can't open file\n"; + return 1; + } + + { + Timer t; + for ( int i = 0; i < 50000; i++ ) + fwrite("abc", 3, 1, f); + cout << "small writes: " << t.millis() << "ms" << endl; + } + + { + Timer t; + for ( int i = 0; i < 10000; i++ ) { + fwrite("abc", 3, 1, f); + fflush(f); + fsync( fileno( f ) ); + } + int ms = t.millis(); + cout << "flush: " << ms << "ms, " << ms / 10000.0 << "ms/request" << endl; + } + + { + Timer t; + for ( int i = 0; i < 500; i++ ) { + fwrite("abc", 3, 1, f); + fflush(f); + fsync( fileno( f ) ); + sleepmillis(2); + } + int ms = t.millis() - 500 * 2; + cout << "flush with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl; + } + + char buf[8192]; + for ( int pass = 0; pass < 2; pass++ ) { + cout << "pass " << pass << endl; + { + Timer t; + int n = 500; + for ( int i = 0; i < n; i++ ) { + if ( pass == 0 ) + fwrite("abc", 3, 1, f); + else + fwrite(buf, 8192, 1, f); + buf[0]++; + fflush(f); + fullsync(fileno(f)); + } + int ms = t.millis(); + cout << "fullsync: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl; + } + + { + Timer t; + for ( int i = 0; i < 500; i++ ) { + if ( pass == 0 ) + fwrite("abc", 3, 1, f); + else + fwrite(buf, 8192, 1, f); + buf[0]++; + fflush(f); + fullsync(fileno(f)); + sleepmillis(2); + } + int ms = t.millis() - 2 * 500; + cout << "fullsync with sleeps: " << ms << "ms, " << ms / 500.0 << "ms/request" << endl; + } + } + + // without growing + { + fclose(f); + /* try from beginning of the file, where we aren't appending and changing the file length, + to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect). 
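(Appending changes the file length, so each fsync must also push the inode
       update to disk; overwriting in place at a fixed length can skip that
       metadata write, which is what this pass measures.)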
+ */ + f = fopen("/data/db/temptest", "r+"); + Timer t; + int n = 500; + for ( int i = 0; i < n; i++ ) { + fwrite("xyz", 3, 1, f); + fflush(f); + fullsync(fileno(f)); + } + int ms = t.millis(); + cout << "fullsync without growing: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl; + } + + // without growing, with delay + { + fclose(f); + /* try from beginning of the file, where we aren't appending and changing the file length, + to see if this is faster as the directory entry then doesn't have to be flushed (if noatime in effect). + */ + f = fopen("/data/db/temptest", "r+"); + Timer t; + int n = 500; + for ( int i = 0; i < n; i++ ) { + fwrite("xyz", 3, 1, f); + fflush(f); + fullsync(fileno(f)); + sleepmillis(2); + } + int ms = t.millis() - 2 * 500; + cout << "fullsync without growing with sleeps: " << ms << "ms, " << ms / ((double) n) << "ms/request" << endl; + } + + return 0; + } + +} // namespace mongo diff --git a/src/mongo/db/geo/2d.cpp b/src/mongo/db/geo/2d.cpp new file mode 100644 index 00000000000..f05ce4315b2 --- /dev/null +++ b/src/mongo/db/geo/2d.cpp @@ -0,0 +1,3289 @@ +// geo2d.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../namespace-inl.h" +#include "../jsobj.h" +#include "../index.h" +#include "../../util/unittest.h" +#include "../commands.h" +#include "../pdfile.h" +#include "../btree.h" +#include "../curop-inl.h" +#include "../matcher.h" +#include "../queryutil.h" +#include "core.h" +#include "../../util/timer.h" + +// Note: we use indexinterface herein to talk to the btree code. In the future it would be nice to +// be able to use the V1 key class (see key.h) instead of toBson() which has some cost. +// toBson() is new with v1 so this could be slower than it used to be? a quick profiling +// might make sense. + +namespace mongo { + + class GeoKeyNode { + GeoKeyNode(); + public: + GeoKeyNode( DiskLoc bucket, int keyOfs, DiskLoc r, BSONObj k) : _bucket( bucket ), _keyOfs( keyOfs ), recordLoc(r), _key(k) { } + const DiskLoc _bucket; + const int _keyOfs; + const DiskLoc recordLoc; + const BSONObj _key; + }; + + // just use old indexes for geo for now. todo. 
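Context for the hashing that follows: the 2d index quantizes (x, y) into unsigned cell coordinates and interleaves their bits into a single key, so that a shared key prefix corresponds to a shared enclosing square and numerically nearby keys are spatially nearby cells. A schematic sketch of such interleaving, assuming up to 32 coordinate bits; the real encoding lives in GeoHash (core.h, not shown here), and which axis takes the higher bit below is a guess:

    // interleave the low `bits` bits of x and y: bit i of x lands at
    // position 2*i+1 and bit i of y at position 2*i of the result
    unsigned long long interleaveBits(unsigned x, unsigned y, unsigned bits) {
        unsigned long long h = 0;
        for (unsigned i = 0; i < bits; ++i) {
            h |= (unsigned long long)((x >> i) & 1) << (2 * i + 1);
            h |= (unsigned long long)((y >> i) & 1) << (2 * i);
        }
        return h;
    }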
+// typedef BtreeBucket<V0> GeoBtreeBucket; +// typedef GeoBtreeBucket::KeyNode GeoKeyNode; + +//#define BTREE btree<V0> + +#if 0 +# define CDEBUG -1 +#else +# define CDEBUG 10 +#endif + +#if 0 +# define GEODEBUGGING +# define GEODEBUG(x) cout << x << endl; +# define GEODEBUGPRINT(x) PRINT(x) + inline void PREFIXDEBUG(GeoHash prefix, const GeoConvert* g) { + if (!prefix.constrains()) { + cout << "\t empty prefix" << endl; + return ; + } + + Point ll (g, prefix); // lower left + prefix.move(1,1); + Point tr (g, prefix); // top right + + Point center ( (ll._x+tr._x)/2, (ll._y+tr._y)/2 ); + double radius = fabs(ll._x - tr._x) / 2; + + cout << "\t ll: " << ll.toString() << " tr: " << tr.toString() + << " center: " << center.toString() << " radius: " << radius << endl; + + } +#else +# define GEODEBUG(x) +# define GEODEBUGPRINT(x) +# define PREFIXDEBUG(x, y) +#endif + + const double EARTH_RADIUS_KM = 6371; + const double EARTH_RADIUS_MILES = EARTH_RADIUS_KM * 0.621371192; + + enum GeoDistType { + GEO_PLAIN, + GEO_SPHERE + }; + + inline double computeXScanDistance(double y, double maxDistDegrees) { + // TODO: this overestimates for large madDistDegrees far from the equator + return maxDistDegrees / min(cos(deg2rad(min(+89.0, y + maxDistDegrees))), + cos(deg2rad(max(-89.0, y - maxDistDegrees)))); + } + + GeoBitSets geoBitSets; + + const string GEO2DNAME = "2d"; + + class Geo2dType : public IndexType , public GeoConvert { + public: + virtual ~Geo2dType() { } + + Geo2dType( const IndexPlugin * plugin , const IndexSpec* spec ) + : IndexType( plugin , spec ) { + + BSONObjBuilder orderBuilder; + + BSONObjIterator i( spec->keyPattern ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( e.type() == String && GEO2DNAME == e.valuestr() ) { + uassert( 13022 , "can't have 2 geo field" , _geo.size() == 0 ); + uassert( 13023 , "2d has to be first in index" , _other.size() == 0 ); + _geo = e.fieldName(); + } + else { + _other.push_back( e.fieldName() ); + } + orderBuilder.append( "" , 1 ); + } + + uassert( 13024 , "no geo field specified" , _geo.size() ); + + double bits = _configval( spec , "bits" , 26 ); // for lat/long, ~ 1ft + + uassert( 13028 , "bits in geo index must be between 1 and 32" , bits > 0 && bits <= 32 ); + + _bits = (unsigned) bits; + + _max = _configval( spec , "max" , 180.0 ); + _min = _configval( spec , "min" , -180.0 ); + + double numBuckets = (1024 * 1024 * 1024 * 4.0); + + _scaling = numBuckets / ( _max - _min ); + + _order = orderBuilder.obj(); + + GeoHash a(0, 0, _bits); + GeoHash b = a; + b.move(1, 1); + + // Epsilon is 1/100th of a bucket size + // TODO: Can we actually find error bounds for the sqrt function? 
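// Worked numbers for the defaults above, as an illustration: min = -180 and
// max = 180 give _scaling = 2^32 / 360, about 11.93 million quanta per unit,
// and _convert() maps v to (unsigned)((v - _min) * _scaling). With bits = 26
// a hash cell spans (max - min) / 2^26, roughly 5.4e-6 units on a side. The
// epsilon below is one thousandth of the finest (1/_scaling) quantum.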
+ double epsilon = 0.001 / _scaling; + _error = distance(a, b) + epsilon; + + // Error in radians + _errorSphere = deg2rad( _error ); + + } + + double _configval( const IndexSpec* spec , const string& name , double def ) { + BSONElement e = spec->info[name]; + if ( e.isNumber() ) { + return e.numberDouble(); + } + return def; + } + + virtual BSONObj fixKey( const BSONObj& in ) { + if ( in.firstElement().type() == BinData ) + return in; + + BSONObjBuilder b(in.objsize()+16); + + if ( in.firstElement().isABSONObj() ) + _hash( in.firstElement().embeddedObject() ).append( b , "" ); + else if ( in.firstElement().type() == String ) + GeoHash( in.firstElement().valuestr() ).append( b , "" ); + else if ( in.firstElement().type() == RegEx ) + GeoHash( in.firstElement().regex() ).append( b , "" ); + else + return in; + + BSONObjIterator i(in); + i.next(); + while ( i.more() ) + b.append( i.next() ); + return b.obj(); + } + + /** Finds the key objects to put in an index */ + virtual void getKeys( const BSONObj& obj, BSONObjSet& keys ) const { + getKeys( obj, &keys, NULL ); + } + + /** Finds all locations in a geo-indexed object */ + // TODO: Can we just return references to the locs, if they won't change? + void getKeys( const BSONObj& obj, vector< BSONObj >& locs ) const { + getKeys( obj, NULL, &locs ); + } + + /** Finds the key objects and/or locations for a geo-indexed object */ + void getKeys( const BSONObj &obj, BSONObjSet* keys, vector< BSONObj >* locs ) const { + + BSONElementMSet bSet; + + // Get all the nested location fields, but don't return individual elements from + // the last array, if it exists. + obj.getFieldsDotted(_geo.c_str(), bSet, false); + + if( bSet.empty() ) + return; + + for( BSONElementMSet::iterator setI = bSet.begin(); setI != bSet.end(); ++setI ) { + + BSONElement geo = *setI; + + GEODEBUG( "Element " << geo << " found for query " << _geo.c_str() ); + + if ( geo.eoo() || ! geo.isABSONObj() ) + continue; + + // + // Grammar for location lookup: + // locs ::= [loc,loc,...,loc]|{<k>:loc,<k>:loc,...,<k>:loc}|loc + // loc ::= { <k1> : #, <k2> : # }|[#, #]|{} + // + // Empty locations are ignored, preserving single-location semantics + // + + BSONObj embed = geo.embeddedObject(); + if ( embed.isEmpty() ) + continue; + + // Differentiate between location arrays and locations + // by seeing if the first element value is a number + bool singleElement = embed.firstElement().isNumber(); + + BSONObjIterator oi(embed); + + while( oi.more() ) { + + BSONObj locObj; + + if( singleElement ) locObj = embed; + else { + BSONElement locElement = oi.next(); + + uassert( 13654, str::stream() << "location object expected, location array not in correct format", + locElement.isABSONObj() ); + + locObj = locElement.embeddedObject(); + + if( locObj.isEmpty() ) + continue; + } + + BSONObjBuilder b(64); + + // Remember the actual location object if needed + if( locs ) + locs->push_back( locObj ); + + // Stop if we don't need to get anything but location objects + if( ! 
keys ) { + if( singleElement ) break; + else continue; + } + + _hash( locObj ).append( b , "" ); + + // Go through all the other index keys + for ( vector<string>::const_iterator i = _other.begin(); i != _other.end(); ++i ) { + + // Get *all* fields for the index key + BSONElementSet eSet; + obj.getFieldsDotted( *i, eSet ); + + + if ( eSet.size() == 0 ) + b.appendAs( _spec->missingField(), "" ); + else if ( eSet.size() == 1 ) + b.appendAs( *(eSet.begin()), "" ); + else { + + // If we have more than one key, store as an array of the objects + + BSONArrayBuilder aBuilder; + + for( BSONElementSet::iterator ei = eSet.begin(); ei != eSet.end(); ++ei ) { + aBuilder.append( *ei ); + } + + BSONArray arr = aBuilder.arr(); + + b.append( "", arr ); + + } + + } + + keys->insert( b.obj() ); + + if( singleElement ) break; + + } + } + + } + + BSONObj _fromBSONHash( const BSONElement& e ) const { + return _unhash( _tohash( e ) ); + } + + BSONObj _fromBSONHash( const BSONObj& o ) const { + return _unhash( _tohash( o.firstElement() ) ); + } + + GeoHash _tohash( const BSONElement& e ) const { + if ( e.isABSONObj() ) + return _hash( e.embeddedObject() ); + + return GeoHash( e , _bits ); + } + + GeoHash _hash( const BSONObj& o ) const { + BSONObjIterator i(o); + uassert( 13067 , "geo field is empty" , i.more() ); + BSONElement x = i.next(); + uassert( 13068 , "geo field only has 1 element" , i.more() ); + BSONElement y = i.next(); + + uassert( 13026 , "geo values have to be numbers: " + o.toString() , x.isNumber() && y.isNumber() ); + + return hash( x.number() , y.number() ); + } + + GeoHash hash( const Point& p ) const { + return hash( p._x, p._y ); + } + + GeoHash hash( double x , double y ) const { + return GeoHash( _convert(x), _convert(y) , _bits ); + } + + BSONObj _unhash( const GeoHash& h ) const { + unsigned x , y; + h.unhash( x , y ); + BSONObjBuilder b; + b.append( "x" , _unconvert( x ) ); + b.append( "y" , _unconvert( y ) ); + return b.obj(); + } + + unsigned _convert( double in ) const { + uassert( 13027 , str::stream() << "point not in interval of [ " << _min << ", " << _max << " )", in < _max && in >= _min ); + in -= _min; + assert( in >= 0 ); + return (unsigned)(in * _scaling); + } + + double _unconvert( unsigned in ) const { + double x = in; + x /= _scaling; + x += _min; + return x; + } + + void unhash( const GeoHash& h , double& x , double& y ) const { + unsigned a,b; + h.unhash(a,b); + x = _unconvert( a ); + y = _unconvert( b ); + } + + double distance( const GeoHash& a , const GeoHash& b ) const { + double ax,ay,bx,by; + unhash( a , ax , ay ); + unhash( b , bx , by ); + + double dx = bx - ax; + double dy = by - ay; + + return sqrt( ( dx * dx ) + ( dy * dy ) ); + } + + double sizeDiag( const GeoHash& a ) const { + GeoHash b = a; + b.move( 1 , 1 ); + return distance( a , b ); + } + + double sizeEdge( const GeoHash& a ) const { + + if( ! 
a.constrains() ) + return _max - _min; + + double ax,ay,bx,by; + GeoHash b = a; + b.move( 1 , 1 ); + unhash( a, ax, ay ); + unhash( b, bx, by ); + + // _min and _max are a singularity + if (bx == _min) + bx = _max; + + return (fabs(ax-bx)); + } + + const IndexDetails* getDetails() const { + return _spec->getDetails(); + } + + virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const; + + virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const { + BSONElement e = query.getFieldDotted(_geo.c_str()); + switch ( e.type() ) { + case Object: { + BSONObj sub = e.embeddedObject(); + switch ( sub.firstElement().getGtLtOp() ) { + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + return OPTIMAL; + default: + // We can try to match if there's no other indexing defined, + // this is assumed a point + return HELPFUL; + } + } + case Array: + // We can try to match if there's no other indexing defined, + // this is assumed a point + return HELPFUL; + default: + return USELESS; + } + } + + string _geo; + vector<string> _other; + + unsigned _bits; + double _max; + double _min; + double _scaling; + + BSONObj _order; + double _error; + double _errorSphere; + }; + + class Box { + public: + + Box( const Geo2dType * g , const GeoHash& hash ) + : _min( g , hash ) , + _max( _min._x + g->sizeEdge( hash ) , _min._y + g->sizeEdge( hash ) ) { + } + + Box( double x , double y , double size ) + : _min( x , y ) , + _max( x + size , y + size ) { + } + + Box( Point min , Point max ) + : _min( min ) , _max( max ) { + } + + Box() {} + + BSONArray toBSON() const { + return BSON_ARRAY( BSON_ARRAY( _min._x << _min._y ) << BSON_ARRAY( _max._x << _max._y ) ); + } + + string toString() const { + StringBuilder buf(64); + buf << _min.toString() << " -->> " << _max.toString(); + return buf.str(); + } + + bool between( double min , double max , double val , double fudge=0) const { + return val + fudge >= min && val <= max + fudge; + } + + bool onBoundary( double bound, double val, double fudge = 0 ) const { + return ( val >= bound - fudge && val <= bound + fudge ); + } + + bool mid( double amin , double amax , double bmin , double bmax , bool min , double& res ) const { + assert( amin <= amax ); + assert( bmin <= bmax ); + + if ( amin < bmin ) { + if ( amax < bmin ) + return false; + res = min ? bmin : amax; + return true; + } + if ( amin > bmax ) + return false; + res = min ? 
amin : bmax; + return true; + } + + double intersects( const Box& other ) const { + + Point boundMin(0,0); + Point boundMax(0,0); + + if ( mid( _min._x , _max._x , other._min._x , other._max._x , true , boundMin._x ) == false || + mid( _min._x , _max._x , other._min._x , other._max._x , false , boundMax._x ) == false || + mid( _min._y , _max._y , other._min._y , other._max._y , true , boundMin._y ) == false || + mid( _min._y , _max._y , other._min._y , other._max._y , false , boundMax._y ) == false ) + return 0; + + Box intersection( boundMin , boundMax ); + + return intersection.area() / area(); + } + + double area() const { + return ( _max._x - _min._x ) * ( _max._y - _min._y ); + } + + double maxDim() const { + return max( _max._x - _min._x, _max._y - _min._y ); + } + + Point center() const { + return Point( ( _min._x + _max._x ) / 2 , + ( _min._y + _max._y ) / 2 ); + } + + void truncate( const Geo2dType* g ) { + if( _min._x < g->_min ) _min._x = g->_min; + if( _min._y < g->_min ) _min._y = g->_min; + if( _max._x > g->_max ) _max._x = g->_max; + if( _max._y > g->_max ) _max._y = g->_max; + } + + void fudge( const Geo2dType* g ) { + _min._x -= g->_error; + _min._y -= g->_error; + _max._x += g->_error; + _max._y += g->_error; + } + + bool onBoundary( Point p, double fudge = 0 ) { + return onBoundary( _min._x, p._x, fudge ) || + onBoundary( _max._x, p._x, fudge ) || + onBoundary( _min._y, p._y, fudge ) || + onBoundary( _max._y, p._y, fudge ); + } + + bool inside( Point p , double fudge = 0 ) { + bool res = inside( p._x , p._y , fudge ); + //cout << "is : " << p.toString() << " in " << toString() << " = " << res << endl; + return res; + } + + bool inside( double x , double y , double fudge = 0 ) { + return + between( _min._x , _max._x , x , fudge ) && + between( _min._y , _max._y , y , fudge ); + } + + bool contains(const Box& other, double fudge=0) { + return inside(other._min, fudge) && inside(other._max, fudge); + } + + Point _min; + Point _max; + }; + + + class Polygon { + public: + + Polygon( void ) : _centroidCalculated( false ) {} + + Polygon( vector<Point> points ) : _centroidCalculated( false ), + _points( points ) { } + + void add( Point p ) { + _centroidCalculated = false; + _points.push_back( p ); + } + + int size( void ) const { + return _points.size(); + } + + /** + * Determine if the point supplied is contained by the current polygon. + * + * The algorithm uses a ray casting method. + */ + bool contains( const Point& p ) const { + return contains( p, 0 ) > 0; + } + + int contains( const Point &p, double fudge ) const { + + Box fudgeBox( Point( p._x - fudge, p._y - fudge ), Point( p._x + fudge, p._y + fudge ) ); + + int counter = 0; + Point p1 = _points[0]; + for ( int i = 1; i <= size(); i++ ) { + Point p2 = _points[i % size()]; + + GEODEBUG( "Doing intersection check of " << fudgeBox.toString() << " with seg " << p1.toString() << " to " << p2.toString() ); + + // We need to check whether or not this segment intersects our error box + if( fudge > 0 && + // Points not too far below box + fudgeBox._min._y <= std::max( p1._y, p2._y ) && + // Points not too far above box + fudgeBox._max._y >= std::min( p1._y, p2._y ) && + // Points not too far to left of box + fudgeBox._min._x <= std::max( p1._x, p2._x ) && + // Points not too far to right of box + fudgeBox._max._x >= std::min( p1._x, p2._x ) ) { + + GEODEBUG( "Doing detailed check" ); + + // If our box contains one or more of these points, we need to do an exact check. 
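// (Ray-casting refresher for the logic below: cast a ray from p toward +x
// and count crossings with each edge (p1, p2); xinters is where that edge's
// line meets y == p._y. An odd crossing count means p is inside; the fudge
// cases return 0, meaning "on the boundary", while exact vertex and
// horizontal-edge hits count as inside and return 1.)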
+ if( fudgeBox.inside(p1) ) { + GEODEBUG( "Point 1 inside" ); + return 0; + } + if( fudgeBox.inside(p2) ) { + GEODEBUG( "Point 2 inside" ); + return 0; + } + + // Do intersection check for vertical sides + if ( p1._y != p2._y ) { + + double invSlope = ( p2._x - p1._x ) / ( p2._y - p1._y ); + + double xintersT = ( fudgeBox._max._y - p1._y ) * invSlope + p1._x; + if( fudgeBox._min._x <= xintersT && fudgeBox._max._x >= xintersT ) { + GEODEBUG( "Top intersection @ " << xintersT ); + return 0; + } + + double xintersB = ( fudgeBox._min._y - p1._y ) * invSlope + p1._x; + if( fudgeBox._min._x <= xintersB && fudgeBox._max._x >= xintersB ) { + GEODEBUG( "Bottom intersection @ " << xintersB ); + return 0; + } + + } + + // Do intersection check for horizontal sides + if( p1._x != p2._x ) { + + double slope = ( p2._y - p1._y ) / ( p2._x - p1._x ); + + double yintersR = ( p1._x - fudgeBox._max._x ) * slope + p1._y; + if( fudgeBox._min._y <= yintersR && fudgeBox._max._y >= yintersR ) { + GEODEBUG( "Right intersection @ " << yintersR ); + return 0; + } + + double yintersL = ( p1._x - fudgeBox._min._x ) * slope + p1._y; + if( fudgeBox._min._y <= yintersL && fudgeBox._max._y >= yintersL ) { + GEODEBUG( "Left intersection @ " << yintersL ); + return 0; + } + + } + + } + else if( fudge == 0 ){ + + // If this is an exact vertex, we won't intersect, so check this + if( p._y == p1._y && p._x == p1._x ) return 1; + else if( p._y == p2._y && p._x == p2._x ) return 1; + + // If this is a horizontal line we won't intersect, so check this + if( p1._y == p2._y && p._y == p1._y ){ + // Check that the x-coord lies in the line + if( p._x >= std::min( p1._x, p2._x ) && p._x <= std::max( p1._x, p2._x ) ) return 1; + } + + } + + // Normal intersection test. + // TODO: Invert these for clearer logic? + if ( p._y > std::min( p1._y, p2._y ) ) { + if ( p._y <= std::max( p1._y, p2._y ) ) { + if ( p._x <= std::max( p1._x, p2._x ) ) { + if ( p1._y != p2._y ) { + double xinters = (p._y-p1._y)*(p2._x-p1._x)/(p2._y-p1._y)+p1._x; + // Special case of point on vertical line + if ( p1._x == p2._x && p._x == p1._x ){ + + // Need special case for the vertical edges, for example: + // 1) \e pe/-----> + // vs. + // 2) \ep---e/-----> + // + // if we count exact as intersection, then 1 is in but 2 is out + // if we count exact as no-int then 1 is out but 2 is in. + + return 1; + } + else if( p1._x == p2._x || p._x <= xinters ) { + counter++; + } + } + } + } + } + + p1 = p2; + } + + if ( counter % 2 == 0 ) { + return -1; + } + else { + return 1; + } + } + + /** + * Calculate the centroid, or center of mass of the polygon object. 
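* Standard shoelace form, which the loop below computes: with vertices
     * (x_i, y_i) and cross_i = x_i * y_{i+1} - x_{i+1} * y_i (indices wrapping
     * from the last vertex back to the first),
     *   A  = (1/2) * sum_i cross_i,
     *   Cx = (1/(6A)) * sum_i (x_i + x_{i+1}) * cross_i,
     *   Cy = (1/(6A)) * sum_i (y_i + y_{i+1}) * cross_i.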
+ */ + Point centroid( void ) { + + /* Centroid is cached, it won't change betwen points */ + if ( _centroidCalculated ) { + return _centroid; + } + + Point cent; + double signedArea = 0.0; + double area = 0.0; // Partial signed area + + /// For all vertices except last + int i = 0; + for ( i = 0; i < size() - 1; ++i ) { + area = _points[i]._x * _points[i+1]._y - _points[i+1]._x * _points[i]._y ; + signedArea += area; + cent._x += ( _points[i]._x + _points[i+1]._x ) * area; + cent._y += ( _points[i]._y + _points[i+1]._y ) * area; + } + + // Do last vertex + area = _points[i]._x * _points[0]._y - _points[0]._x * _points[i]._y; + cent._x += ( _points[i]._x + _points[0]._x ) * area; + cent._y += ( _points[i]._y + _points[0]._y ) * area; + signedArea += area; + signedArea *= 0.5; + cent._x /= ( 6 * signedArea ); + cent._y /= ( 6 * signedArea ); + + _centroidCalculated = true; + _centroid = cent; + + return cent; + } + + Box bounds( void ) { + + // TODO: Cache this + + _bounds._max = _points[0]; + _bounds._min = _points[0]; + + for ( int i = 1; i < size(); i++ ) { + + _bounds._max._x = max( _bounds._max._x, _points[i]._x ); + _bounds._max._y = max( _bounds._max._y, _points[i]._y ); + _bounds._min._x = min( _bounds._min._x, _points[i]._x ); + _bounds._min._y = min( _bounds._min._y, _points[i]._y ); + + } + + return _bounds; + + } + + private: + + bool _centroidCalculated; + Point _centroid; + + Box _bounds; + + vector<Point> _points; + }; + + class Geo2dPlugin : public IndexPlugin { + public: + Geo2dPlugin() : IndexPlugin( GEO2DNAME ) { + } + + virtual IndexType* generate( const IndexSpec* spec ) const { + return new Geo2dType( this , spec ); + } + } geo2dplugin; + + void __forceLinkGeoPlugin() { + geo2dplugin.getName(); + } + + + + class GeoHopper; + + class GeoPoint { + public: + + GeoPoint() : _distance( -1 ), _exact( false ), _dirty( false ) + {} + + //// Distance not used //// + + GeoPoint( const GeoKeyNode& node ) + : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( -1 ) , _exact( false ), _dirty( false ), _bucket( node._bucket ), _pos( node._keyOfs ) { + } + + //// Immediate initialization of distance //// + + GeoPoint( const GeoKeyNode& node, double distance, bool exact ) + : _key( node._key ) , _loc( node.recordLoc ) , _o( node.recordLoc.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) { + } + + GeoPoint( const GeoPoint& pt, double distance, bool exact ) + : _key( pt.key() ) , _loc( pt.loc() ) , _o( pt.obj() ), _distance( distance ), _exact( exact ), _dirty( false ) { + } + + bool operator<( const GeoPoint& other ) const { + if( _distance != other._distance ) return _distance < other._distance; + if( _exact != other._exact ) return _exact < other._exact; + return _loc < other._loc; + } + + double distance() const { + return _distance; + } + + bool isExact() const { + return _exact; + } + + BSONObj key() const { + return _key; + } + + bool hasLoc() const { + return _loc.isNull(); + } + + DiskLoc loc() const { + assert( ! _dirty ); + return _loc; + } + + BSONObj obj() const { + return _o; + } + + BSONObj pt() const { + return _pt; + } + + bool isEmpty() { + return _o.isEmpty(); + } + + bool isCleanAndEmpty() { + return isEmpty() && ! isDirty(); + } + + string toString() const { + return str::stream() << "Point from " << _key << " - " << _o << " dist : " << _distance << ( _exact ? " (ex)" : " (app)" ); + } + + + // TODO: Recover from yield by finding all the changed disk locs here, modifying the _seenPts array. 
+ // Not sure yet the correct thing to do about _seen. + // Definitely need to re-find our current max/min locations too + bool unDirty( const Geo2dType* g, DiskLoc& oldLoc ){ + + assert( _dirty ); + assert( ! _id.isEmpty() ); + + oldLoc = _loc; + _loc = DiskLoc(); + + // Fast undirty + IndexInterface& ii = g->getDetails()->idxInterface(); + // Check this position and the one immediately preceding + for( int i = 0; i < 2; i++ ){ + if( _pos - i < 0 ) continue; + + // log() << "bucket : " << _bucket << " pos " << _pos << endl; + + BSONObj key; + DiskLoc loc; + ii.keyAt( _bucket, _pos - i, key, loc ); + + // log() << "Loc: " << loc << " Key : " << key << endl; + + if( loc.isNull() ) continue; + + if( key.binaryEqual( _key ) && loc.obj()["_id"].wrap( "" ).binaryEqual( _id ) ){ + _pos = _pos - i; + _loc = loc; + _dirty = false; + _o = loc.obj(); + return true; + } + } + + // Slow undirty + scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsdetails( g->getDetails()->parentNS().c_str() ), + *( g->getDetails() ), _key, _key, true, 1 ) ); + + int count = 0; + while( cursor->ok() ){ + count++; + if( cursor->current()["_id"].wrap( "" ).binaryEqual( _id ) ){ + _bucket = cursor->getBucket(); + _pos = cursor->getKeyOfs(); + _loc = cursor->currLoc(); + _o = _loc.obj(); + break; + } + else{ + LOG( CDEBUG + 1 ) << "Key doesn't match : " << cursor->current()["_id"] << " saved : " << _id << endl; + } + cursor->advance(); + } + + if( ! count ) { LOG( CDEBUG ) << "No key found for " << _key << endl; } + + _dirty = false; + + return _loc == oldLoc; + } + + bool isDirty(){ + return _dirty; + } + + bool makeDirty(){ + if( ! _dirty ){ + assert( ! obj()["_id"].eoo() ); + assert( ! _bucket.isNull() ); + assert( _pos >= 0 ); + + if( _id.isEmpty() ){ + _id = obj()["_id"].wrap( "" ).getOwned(); + } + _o = BSONObj(); + _key = _key.getOwned(); + _pt = _pt.getOwned(); + _dirty = true; + + return true; + } + + return false; + } + + BSONObj _key; + DiskLoc _loc; + BSONObj _o; + BSONObj _pt; + + double _distance; + bool _exact; + + BSONObj _id; + bool _dirty; + DiskLoc _bucket; + int _pos; + }; + + // GeoBrowse subclasses this + class GeoAccumulator { + public: + GeoAccumulator( const Geo2dType * g , const BSONObj& filter, bool uniqueDocs, bool needDistance ) + : _g(g) , + _lookedAt(0) , + _matchesPerfd(0) , + _objectsLoaded(0) , + _pointsLoaded(0) , + _found(0) , + _uniqueDocs( uniqueDocs ) , + _needDistance( needDistance ) + { + if ( ! filter.isEmpty() ) { + _matcher.reset( new CoveredIndexMatcher( filter , g->keyPattern() ) ); + GEODEBUG( "Matcher is now " << _matcher->docMatcher().toString() ); + } + } + + virtual ~GeoAccumulator() { } + + enum KeyResult { BAD, BORDER, GOOD }; + + virtual void add( const GeoKeyNode& node ) { + + GEODEBUG( "\t\t\t\t checking key " << node._key.toString() ) + + _lookedAt++; + + //// + // Approximate distance check using key data + //// + double keyD = 0; + Point keyP( _g, GeoHash( node._key.firstElement(), _g->_bits ) ); + KeyResult keyOk = approxKeyCheck( keyP, keyD ); + if ( keyOk == BAD ) { + GEODEBUG( "\t\t\t\t bad distance : " << node.recordLoc.obj() << "\t" << keyD ); + return; + } + GEODEBUG( "\t\t\t\t good distance : " << node.recordLoc.obj() << "\t" << keyD ); + + //// + // Check for match using other key (and potentially doc) criteria + //// + // Remember match results for each object + map<DiskLoc, bool>::iterator match = _matched.find( node.recordLoc ); + bool newDoc = match == _matched.end(); + if( newDoc ) { + + GEODEBUG( "\t\t\t\t matching new doc with " << (_matcher ? 
_matcher->docMatcher().toString() : "(empty)" ) ); + + // matcher + MatchDetails details; + if ( _matcher.get() ) { + bool good = _matcher->matchesWithSingleKeyIndex( node._key , node.recordLoc , &details ); + + _matchesPerfd++; + + if ( details._loadedObject ) + _objectsLoaded++; + + if ( ! good ) { + GEODEBUG( "\t\t\t\t didn't match : " << node.recordLoc.obj()["_id"] ); + _matched[ node.recordLoc ] = false; + return; + } + } + + _matched[ node.recordLoc ] = true; + + if ( ! details._loadedObject ) // don't double count + _objectsLoaded++; + + } + else if( !((*match).second) ) { + GEODEBUG( "\t\t\t\t previously didn't match : " << node.recordLoc.obj()["_id"] ); + return; + } + + //// + // Exact check with particular data fields + //// + // Can add multiple points + int diff = addSpecific( node , keyP, keyOk == BORDER, keyD, newDoc ); + if( diff > 0 ) _found += diff; + else _found -= -diff; + + } + + virtual void getPointsFor( const BSONObj& key, const BSONObj& obj, vector< BSONObj >& locsForNode, bool allPoints = false ){ + + // Find all the location objects from the keys + vector< BSONObj > locs; + _g->getKeys( obj, allPoints ? locsForNode : locs ); + _pointsLoaded++; + + if( allPoints ) return; + if( locs.size() == 1 ){ + locsForNode.push_back( locs[0] ); + return; + } + + // Find the particular location we want + GeoHash keyHash( key.firstElement(), _g->_bits ); + + // log() << "Hash: " << node.key << " and " << keyHash.getHash() << " unique " << _uniqueDocs << endl; + for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ) { + + // Ignore all locations not hashed to the key's hash, since we may see + // those later + if( _g->_hash( *i ) != keyHash ) continue; + + locsForNode.push_back( *i ); + + } + + } + + virtual int addSpecific( const GeoKeyNode& node, const Point& p , bool inBounds, double d, bool newDoc ) = 0; + virtual KeyResult approxKeyCheck( const Point& p , double& keyD ) = 0; + virtual bool exactDocCheck( const Point& p , double& d ) = 0; + virtual bool expensiveExactCheck(){ return false; } + + + long long found() const { + return _found; + } + + const Geo2dType * _g; + map<DiskLoc, bool> _matched; + shared_ptr<CoveredIndexMatcher> _matcher; + + long long _lookedAt; + long long _matchesPerfd; + long long _objectsLoaded; + long long _pointsLoaded; + long long _found; + + bool _uniqueDocs; + bool _needDistance; + + }; + + + struct BtreeLocation { + BtreeLocation() { } + + scoped_ptr<BtreeCursor> _cursor; + scoped_ptr<FieldRangeSet> _frs; + scoped_ptr<IndexSpec> _spec; + + BSONObj key() { + return _cursor->currKey(); + } + + bool hasPrefix( const GeoHash& hash ) { + BSONObj k = key(); + BSONElement e = k.firstElement(); + if ( e.eoo() ) + return false; + return GeoHash( e ).hasPrefix( hash ); + } + + bool checkAndAdvance( const GeoHash& hash, int& totalFound, GeoAccumulator* all ){ + if( ! _cursor->ok() || ! hasPrefix( hash ) ) return false; + + if( all ){ + totalFound++; + GeoKeyNode n( _cursor->getBucket(), _cursor->getKeyOfs(), _cursor->currLoc(), _cursor->currKey() ); + all->add( n ); + } + _cursor->advance(); + + return true; + } + + void save(){ + _cursor->noteLocation(); + } + + void restore(){ + _cursor->checkLocation(); + } + + string toString() { + stringstream ss; + ss << "bucket: " << _cursor->getBucket().toString() << " pos: " << _cursor->getKeyOfs() << + ( _cursor->ok() ? 
( str::stream() << " k: " << _cursor->currKey() << " o : " << _cursor->current()["_id"] ) : (string)"[none]" ) << endl; + return ss.str(); + } + + // Returns the min and max keys which bound a particular location. + // The only time these may be equal is when we actually equal the location + // itself, otherwise our expanding algorithm will fail. + static bool initial( const IndexDetails& id , const Geo2dType * spec , + BtreeLocation& min , BtreeLocation& max , + GeoHash start , + int & found , GeoAccumulator * hopper ) { + + //Ordering ordering = Ordering::make(spec->_order); + + // Would be nice to build this directly, but bug in max/min queries SERVER-3766 and lack of interface + // makes this easiest for now. + BSONObj minQuery = BSON( spec->_geo << BSON( "$gt" << MINKEY << start.wrap( "$lte" ).firstElement() ) ); + BSONObj maxQuery = BSON( spec->_geo << BSON( "$lt" << MAXKEY << start.wrap( "$gt" ).firstElement() ) ); + + // log() << "MinQuery: " << minQuery << endl; + // log() << "MaxQuery: " << maxQuery << endl; + + min._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(), + minQuery, + true, + false ) ); + + max._frs.reset( new FieldRangeSet( spec->getDetails()->parentNS().c_str(), + maxQuery, + true, + false ) ); + + + BSONObjBuilder bob; + bob.append( spec->_geo, 1 ); + for( vector<string>::const_iterator i = spec->_other.begin(); i != spec->_other.end(); i++ ){ + bob.append( *i, 1 ); + } + BSONObj iSpec = bob.obj(); + + min._spec.reset( new IndexSpec( iSpec ) ); + max._spec.reset( new IndexSpec( iSpec ) ); + + shared_ptr<FieldRangeVector> frvMin( new FieldRangeVector( *(min._frs), *(min._spec), -1 ) ); + shared_ptr<FieldRangeVector> frvMax( new FieldRangeVector( *(max._frs), *(max._spec), 1 ) ); + + min._cursor.reset( + BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ), + frvMin, -1 ) + ); + + max._cursor.reset( + BtreeCursor::make( nsdetails( spec->getDetails()->parentNS().c_str() ), *( spec->getDetails() ), + frvMax, 1 ) + ); + + // if( hopper ) min.checkCur( found, hopper ); + // if( hopper ) max.checkCur( found, hopper ); + + return min._cursor->ok() || max._cursor->ok(); + } + }; + + + class GeoCursorBase : public Cursor { + public: + + static const shared_ptr< CoveredIndexMatcher > emptyMatcher; + + GeoCursorBase( const Geo2dType * spec ) + : _spec( spec ), _id( _spec->getDetails() ) { + + } + + virtual DiskLoc refLoc() { return DiskLoc(); } + + virtual BSONObj indexKeyPattern() { + return _spec->keyPattern(); + } + + virtual void noteLocation() { + // no-op since these are meant to be safe + } + + /* called before query getmore block is iterated */ + virtual void checkLocation() { + // no-op since these are meant to be safe + } + + virtual bool supportGetMore() { return false; } + virtual bool supportYields() { return false; } + + virtual bool getsetdup(DiskLoc loc) { return false; } + virtual bool modifiedKeys() const { return true; } + virtual bool isMultiKey() const { return false; } + + virtual bool autoDedup() const { return false; } + + const Geo2dType * _spec; + const IndexDetails * _id; + }; + + const shared_ptr< CoveredIndexMatcher > GeoCursorBase::emptyMatcher( new CoveredIndexMatcher( BSONObj(), BSONObj(), false ) ); + + // TODO: Pull out the cursor bit from the browse, have GeoBrowse as field of cursor to clean up + // this hierarchy a bit. Also probably useful to look at whether GeoAccumulator can be a member instead + // of a superclass. 
+ class GeoBrowse : public GeoCursorBase , public GeoAccumulator { + public: + + // The max points which should be added to an expanding box at one time + static const int maxPointsHeuristic = 50; + + // Expand states + enum State { + START , + DOING_EXPAND , + DONE_NEIGHBOR , + DONE + } _state; + + GeoBrowse( const Geo2dType * g , string type , BSONObj filter = BSONObj(), bool uniqueDocs = true, bool needDistance = false ) + : GeoCursorBase( g ), GeoAccumulator( g , filter, uniqueDocs, needDistance ) , + _type( type ) , _filter( filter ) , _firstCall(true), _noted( false ), _nscanned(), _nDirtied(0), _nChangedOnYield(0), _nRemovedOnYield(0), _centerPrefix(0, 0, 0) { + + // Set up the initial expand state + _state = START; + _neighbor = -1; + _foundInExp = 0; + + } + + virtual string toString() { + return (string)"GeoBrowse-" + _type; + } + + virtual bool ok() { + + bool filled = false; + + LOG( CDEBUG ) << "Checking cursor, in state " << (int) _state << ", first call " << _firstCall << + ", empty : " << _cur.isEmpty() << ", dirty : " << _cur.isDirty() << ", stack : " << _stack.size() << endl; + + bool first = _firstCall; + if ( _firstCall ) { + fillStack( maxPointsHeuristic ); + filled = true; + _firstCall = false; + } + if ( ! _cur.isCleanAndEmpty() || _stack.size() ) { + if ( first ) { + ++_nscanned; + } + + if( _noted && filled ) noteLocation(); + return true; + } + + while ( moreToDo() ) { + + LOG( CDEBUG ) << "Refilling stack..." << endl; + + fillStack( maxPointsHeuristic ); + filled = true; + + if ( ! _cur.isCleanAndEmpty() ) { + if ( first ) { + ++_nscanned; + } + + if( _noted && filled ) noteLocation(); + return true; + } + } + + if( _noted && filled ) noteLocation(); + return false; + } + + virtual bool advance() { + _cur._o = BSONObj(); + + if ( _stack.size() ) { + _cur = _stack.front(); + _stack.pop_front(); + ++_nscanned; + return true; + } + + if ( ! moreToDo() ) + return false; + + bool filled = false; + while ( _cur.isCleanAndEmpty() && moreToDo() ){ + fillStack( maxPointsHeuristic ); + filled = true; + } + + if( _noted && filled ) noteLocation(); + return ! _cur.isCleanAndEmpty() && ++_nscanned; + } + + virtual void noteLocation() { + _noted = true; + + LOG( CDEBUG ) << "Noting location with " << _stack.size() << ( _cur.isEmpty() ? "" : " + 1 " ) << " points " << endl; + + // Make sure we advance past the point we're at now, + // since the current location may move on an update/delete + // if( _state == DOING_EXPAND ){ + // if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); } + // if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); } + // } + + // Remember where our _max, _min are + _min.save(); + _max.save(); + + LOG( CDEBUG ) << "Min " << _min.toString() << endl; + LOG( CDEBUG ) << "Max " << _max.toString() << endl; + + // Dirty all our queued stuff + for( list<GeoPoint>::iterator i = _stack.begin(); i != _stack.end(); i++ ){ + + LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl; + + if( i->makeDirty() ) _nDirtied++; + assert( i->isDirty() ); + } + + // Check current item + if( ! 
+            if( ! _cur.isEmpty() ){
+                if( _cur.makeDirty() ) _nDirtied++;
+            }
+
+            // Our cached matches become invalid now
+            _matched.clear();
+        }
+
+        void fixMatches( DiskLoc oldLoc, DiskLoc newLoc ){
+            map<DiskLoc, bool>::iterator match = _matched.find( oldLoc );
+            if( match != _matched.end() ){
+                bool val = match->second;
+                _matched.erase( oldLoc );
+                _matched[ newLoc ] = val;
+            }
+        }
+
+        /* called before query getmore block is iterated */
+        virtual void checkLocation() {
+
+            LOG( CDEBUG ) << "Restoring location with " << _stack.size() << ( ! _cur.isDirty() ? "" : " + 1 " ) << " points " << endl;
+
+            // We can assume an error was thrown earlier if this database somehow disappears
+
+            // Recall our _max, _min
+            _min.restore();
+            _max.restore();
+
+            LOG( CDEBUG ) << "Min " << _min.toString() << endl;
+            LOG( CDEBUG ) << "Max " << _max.toString() << endl;
+
+            // If the current key moved, we may have been advanced past the current point - need to check this
+            // if( _state == DOING_EXPAND ){
+            //     if( _min.hasPrefix( _prefix ) ){ _min.advance( -1, _foundInExp, this ); }
+            //     if( _max.hasPrefix( _prefix ) ){ _max.advance( 1, _foundInExp, this ); }
+            //}
+
+            // Undirty all the queued stuff
+            list<GeoPoint>::iterator i = _stack.begin();
+            while( i != _stack.end() ){
+
+                LOG( CDEBUG ) << "Undirtying stack point with id " << i->_id << endl;
+
+                DiskLoc oldLoc;
+                if( i->unDirty( _spec, oldLoc ) ){
+                    // Document is in same location
+                    LOG( CDEBUG ) << "Undirtied " << oldLoc << endl;
+
+                    i++;
+                }
+                else if( ! i->loc().isNull() ){
+
+                    // Re-found document somewhere else
+                    LOG( CDEBUG ) << "Changed location of " << i->_id << " : " << i->loc() << " vs " << oldLoc << endl;
+
+                    _nChangedOnYield++;
+                    fixMatches( oldLoc, i->loc() );
+                    i++;
+                }
+                else {
+
+                    // Can't re-find document
+                    LOG( CDEBUG ) << "Removing document " << i->_id << endl;
+
+                    _nRemovedOnYield++;
+                    _found--;
+                    assert( _found >= 0 );
+
+                    // Can't find our key again, remove
+                    i = _stack.erase( i );
+                }
+            }
+
+            if( _cur.isDirty() ){
+                LOG( CDEBUG ) << "Undirtying cur point with id : " << _cur._id << endl;
+            }
+
+            // Check current item
+            DiskLoc oldLoc;
+            if( _cur.isDirty() && ! _cur.unDirty( _spec, oldLoc ) ){
+                if( _cur.loc().isNull() ){
+
+                    // Document disappeared!
+                    LOG( CDEBUG ) << "Removing cur point " << _cur._id << endl;
+
+                    _nRemovedOnYield++;
+                    advance();
+                }
+                else{
+
+                    // Document moved
+                    LOG( CDEBUG ) << "Changed location of cur point " << _cur._id << " : " << _cur.loc() << " vs " << oldLoc << endl;
+
+                    _nChangedOnYield++;
+                    fixMatches( oldLoc, _cur.loc() );
+                }
+            }
+
+            _noted = false;
+        }
+
+        virtual Record* _current() { assert(ok()); LOG( CDEBUG + 1 ) << "_current " << _cur._loc.obj()["_id"] << endl; return _cur._loc.rec(); }
+        virtual BSONObj current() { assert(ok()); LOG( CDEBUG + 1 ) << "current " << _cur._o << endl; return _cur._o; }
+        virtual DiskLoc currLoc() { assert(ok()); LOG( CDEBUG + 1 ) << "currLoc " << _cur._loc << endl; return _cur._loc; }
+        virtual BSONObj currKey() const { return _cur._key; }
+
+        virtual CoveredIndexMatcher* matcher() const {
+            if( _matcher.get() ) return _matcher.get();
+            else return GeoCursorBase::emptyMatcher.get();
+        }
+
+        virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const {
+            if( _matcher.get() ) return _matcher;
+            else return GeoCursorBase::emptyMatcher;
+        }
+
+        // Are we finished getting points?
+        virtual bool moreToDo() {
+            return _state != DONE;
+        }
+
+        virtual bool supportGetMore() { return true; }
+
+        // Fills the stack, but only checks a maximum number of maxToCheck points at a time.
+        // Further calls to this function will continue the expand/check neighbors algorithm.
+        virtual void fillStack( int maxToCheck, int maxToAdd = -1, bool onlyExpand = false ) {
+
+#ifdef GEODEBUGGING
+            log() << "Filling stack with maximum of " << maxToCheck << ", state : " << (int) _state << endl;
+#endif
+
+            if( maxToAdd < 0 ) maxToAdd = maxToCheck;
+            int maxFound = _foundInExp + maxToCheck;
+            assert( maxToCheck > 0 );
+            assert( maxFound > 0 );
+            assert( _found <= 0x7fffffff ); // conversion to int
+            int maxAdded = static_cast<int>(_found) + maxToAdd;
+            assert( maxAdded >= 0 ); // overflow check
+
+            bool isNeighbor = _centerPrefix.constrains();
+
+            // Starting a box expansion
+            if ( _state == START ) {
+
+                // Get the very first hash point, if required
+                if( ! isNeighbor )
+                    _prefix = expandStartHash();
+
+                GEODEBUG( "initializing btree" );
+
+#ifdef GEODEBUGGING
+                log() << "Initializing from b-tree with hash of " << _prefix << " @ " << Box( _g, _prefix ) << endl;
+#endif
+
+                if ( ! BtreeLocation::initial( *_id , _spec , _min , _max , _prefix , _foundInExp , this ) )
+                    _state = isNeighbor ? DONE_NEIGHBOR : DONE;
+                else {
+                    _state = DOING_EXPAND;
+                    _lastPrefix.reset();
+                }
+
+                GEODEBUG( (_state == DONE_NEIGHBOR || _state == DONE ? "not initialized" : "initialized") );
+
+            }
+
+            // Doing the actual box expansion
+            if ( _state == DOING_EXPAND ) {
+
+                while ( true ) {
+
+                    GEODEBUG( "box prefix [" << _prefix << "]" );
+#ifdef GEODEBUGGING
+                    if( _prefix.constrains() ) {
+                        log() << "current expand box : " << Box( _g, _prefix ).toString() << endl;
+                    }
+                    else {
+                        log() << "max expand box." << endl;
+                    }
+#endif
+
+                    GEODEBUG( "expanding box points... ");
+
+                    // Record the prefix we're actively exploring...
+                    _expPrefix.reset( new GeoHash( _prefix ) );
+
+                    // Find points inside this prefix
+                    while ( _min.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+                    while ( _max.checkAndAdvance( _prefix, _foundInExp, this ) && _foundInExp < maxFound && _found < maxAdded );
+
+#ifdef GEODEBUGGING
+
+                    log() << "finished expand, checked : " << ( maxToCheck - ( maxFound - _foundInExp ) )
+                          << " found : " << ( maxToAdd - ( maxAdded - _found ) )
+                          << " max : " << maxToCheck << " / " << maxToAdd << endl;
+
+#endif
+
+                    GEODEBUG( "finished expand, found : " << ( maxToAdd - ( maxAdded - _found ) ) );
+                    if( _foundInExp >= maxFound || _found >= maxAdded ) return;
+
+                    // We've searched this prefix fully, remember
+                    _lastPrefix.reset( new GeoHash( _prefix ));
+
+                    // If we've searched the entire space, we're finished.
+                    if ( ! _prefix.constrains() ) {
+                        GEODEBUG( "box exhausted" );
+                        _state = DONE;
+                        notePrefix();
+                        return;
+                    }
+
+                    // If we won't fit in the box, and we're not doing a sub-scan, increase the size
+                    if ( ! fitsInBox( _g->sizeEdge( _prefix ) ) && _fringe.size() == 0 ) {
+
+                        // If we're still not expanded bigger than the box size, expand again
+                        // TODO: Is there an advantage to scanning prior to expanding?
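+                        // (Added note: up() strips the lowest x/y bit pair from the hash,
+                        // so each call doubles the box edge -- e.g. GeoHash( "110011" ).up()
+                        // yields "1100", which covers all four children "1100 00/01/10/11".)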
+                        _prefix = _prefix.up();
+                        continue;
+
+                    }
+
+                    // log() << "finished box prefix [" << _prefix << "]" << endl;
+
+                    // We're done and our size is large enough
+                    _state = DONE_NEIGHBOR;
+
+                    // Go to the next sub-box, if applicable
+                    if( _fringe.size() > 0 ) _fringe.pop_back();
+                    // Go to the next neighbor if this was the last sub-search
+                    if( _fringe.size() == 0 ) _neighbor++;
+
+                    break;
+
+                }
+
+                notePrefix();
+            }
+
+            // If we're only expanding, we don't examine neighbors
+            if( onlyExpand ) return;
+
+            // If we're done expanding the current box...
+            if( _state == DONE_NEIGHBOR ) {
+
+                // Iterate to the next neighbor
+                // Loop is useful for cases where we want to skip over boxes entirely,
+                // otherwise recursion increments the neighbors.
+                for ( ; _neighbor < 9; _neighbor++ ) {
+
+                    // If we have no fringe for the neighbor, make sure we have the default fringe
+                    if( _fringe.size() == 0 ) _fringe.push_back( "" );
+
+                    if( ! isNeighbor ) {
+                        _centerPrefix = _prefix;
+                        _centerBox = Box( _g, _centerPrefix );
+                        isNeighbor = true;
+                    }
+
+                    int i = (_neighbor / 3) - 1;
+                    int j = (_neighbor % 3) - 1;
+
+                    if ( ( i == 0 && j == 0 ) ||
+                         ( i < 0 && _centerPrefix.atMinX() ) ||
+                         ( i > 0 && _centerPrefix.atMaxX() ) ||
+                         ( j < 0 && _centerPrefix.atMinY() ) ||
+                         ( j > 0 && _centerPrefix.atMaxY() ) ) {
+
+                        //log() << "not moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << endl;
+                        //log() << _centerPrefix.atMinX() << " "
+                        //      << _centerPrefix.atMinY() << " "
+                        //      << _centerPrefix.atMaxX() << " "
+                        //      << _centerPrefix.atMaxY() << " " << endl;
+
+                        continue; // main box or wrapped edge
+                        // TODO: We may want to enable wrapping in future, probably best as layer on top of
+                        // this search.
+                    }
+
+                    // Make sure we've got a reasonable center
+                    assert( _centerPrefix.constrains() );
+
+                    GeoHash _neighborPrefix = _centerPrefix;
+                    _neighborPrefix.move( i, j );
+
+                    //log() << "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() << " " << _centerPrefix << " " << _neighborPrefix << endl;
+
+                    GEODEBUG( "moving to neighbor " << _neighbor << " @ " << i << " , " << j << " fringe : " << _fringe.size() );
+                    PREFIXDEBUG( _centerPrefix, _g );
+                    PREFIXDEBUG( _neighborPrefix , _g );
+                    while( _fringe.size() > 0 ) {
+
+                        _prefix = _neighborPrefix + _fringe.back();
+                        Box cur( _g , _prefix );
+
+                        PREFIXDEBUG( _prefix, _g );
+
+                        double intAmt = intersectsBox( cur );
+
+                        // No intersection
+                        if( intAmt <= 0 ) {
+                            GEODEBUG( "skipping box" << cur.toString() );
+                            _fringe.pop_back();
+                            continue;
+                        }
+                        // Small intersection, refine search
+                        else if( intAmt < 0.5 && _prefix.canRefine() && _fringe.back().size() < 4 /* two bits */ ) {
+
+                            GEODEBUG( "Intersection small : " << intAmt << ", adding to fringe: " << _fringe.back() << " curr prefix : " << _prefix << " bits : " << _prefix.getBits() );
+
+                            // log() << "Diving to level : " << ( _fringe.back().size() / 2 + 1 ) << endl;
+
+                            string lastSuffix = _fringe.back();
+                            _fringe.pop_back();
+                            _fringe.push_back( lastSuffix + "00" );
+                            _fringe.push_back( lastSuffix + "01" );
+                            _fringe.push_back( lastSuffix + "11" );
+                            _fringe.push_back( lastSuffix + "10" );
+
+                            continue;
+                        }
+
+                        // Restart our search from a diff box.
+                        _state = START;
+
+                        assert( ! onlyExpand );
+
+                        assert( _found <= 0x7fffffff );
+                        fillStack( maxFound - _foundInExp, maxAdded - static_cast<int>(_found) );
+
+                        // When we return from the recursive fillStack call, we'll either have checked enough points or
+                        // be entirely done. Max recurse depth is < 8 * 16.
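+                        // (Added note: presumably the bound is 8 non-center neighbors times
+                        // at most 16 fringe sub-boxes each -- a fringe suffix starts empty
+                        // and the size() < 4 check above allows at most two 4-way refinements,
+                        // and each sub-box restart adds one level of recursion.)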
+ + // If we're maxed out on points, return + if( _foundInExp >= maxFound || _found >= maxAdded ) { + // Make sure we'll come back to add more points + assert( _state == DOING_EXPAND ); + return; + } + + // Otherwise we must be finished to return + assert( _state == DONE ); + return; + + } + + } + + // Finished with neighbors + _state = DONE; + } + + } + + // The initial geo hash box for our first expansion + virtual GeoHash expandStartHash() = 0; + + // Whether the current box width is big enough for our search area + virtual bool fitsInBox( double width ) = 0; + + // The amount the current box overlaps our search area + virtual double intersectsBox( Box& cur ) = 0; + + bool remembered( BSONObj o ){ + BSONObj seenId = o["_id"].wrap("").getOwned(); + if( _seenIds.find( seenId ) != _seenIds.end() ){ + LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " already seen." << endl; + return true; + } + else{ + _seenIds.insert( seenId ); + LOG( CDEBUG + 1 ) << "Object " << o["_id"] << " remembered." << endl; + return false; + } + } + + virtual int addSpecific( const GeoKeyNode& node , const Point& keyP , bool onBounds , double keyD , bool potentiallyNewDoc ) { + + int found = 0; + + // We need to handle every possible point in this method, even those not in the key value, to + // avoid us tracking which hashes we've already seen. + if( ! potentiallyNewDoc ){ + // log() << "Already handled doc!" << endl; + return 0; + } + + // Final check for new doc + // OK to touch, since we're probably returning this object now + if( remembered( node.recordLoc.obj() ) ) return 0; + + if( _uniqueDocs && ! onBounds ) { + //log() << "Added ind to " << _type << endl; + _stack.push_front( GeoPoint( node ) ); + found++; + } + else { + // We now handle every possible point in the document, even those not in the key value, + // since we're iterating through them anyway - prevents us from having to save the hashes + // we've seen per-doc + + // If we're filtering by hash, get the original + bool expensiveExact = expensiveExactCheck(); + + vector< BSONObj > locs; + getPointsFor( node._key, node.recordLoc.obj(), locs, true ); + for( vector< BSONObj >::iterator i = locs.begin(); i != locs.end(); ++i ){ + + double d = -1; + Point p( *i ); + + // We can avoid exact document checks by redoing approx checks, + // if the exact checks are more expensive. + bool needExact = true; + if( expensiveExact ){ + assert( false ); + KeyResult result = approxKeyCheck( p, d ); + if( result == BAD ) continue; + else if( result == GOOD ) needExact = false; + } + + if( ! 
needExact || exactDocCheck( p, d ) ){ + //log() << "Added mult to " << _type << endl; + _stack.push_front( GeoPoint( node ) ); + found++; + // If returning unique, just exit after first point is added + if( _uniqueDocs ) break; + } + } + } + + while( _cur.isCleanAndEmpty() && _stack.size() > 0 ){ + _cur = _stack.front(); + _stack.pop_front(); + } + + return found; + } + + virtual long long nscanned() { + if ( _firstCall ) { + ok(); + } + return _nscanned; + } + + virtual void explainDetails( BSONObjBuilder& b ){ + b << "lookedAt" << _lookedAt; + b << "matchesPerfd" << _matchesPerfd; + b << "objectsLoaded" << _objectsLoaded; + b << "pointsLoaded" << _pointsLoaded; + b << "pointsSavedForYield" << _nDirtied; + b << "pointsChangedOnYield" << _nChangedOnYield; + b << "pointsRemovedOnYield" << _nRemovedOnYield; + } + + virtual BSONObj prettyIndexBounds() const { + + vector<GeoHash>::const_iterator i = _expPrefixes.end(); + if( _expPrefixes.size() > 0 && *(--i) != *( _expPrefix.get() ) ) + _expPrefixes.push_back( *( _expPrefix.get() ) ); + + BSONObjBuilder bob; + BSONArrayBuilder bab; + for( i = _expPrefixes.begin(); i != _expPrefixes.end(); ++i ){ + bab << Box( _g, *i ).toBSON(); + } + bob << _g->_geo << bab.arr(); + + return bob.obj(); + + } + + void notePrefix() { + _expPrefixes.push_back( _prefix ); + } + + string _type; + BSONObj _filter; + list<GeoPoint> _stack; + set<BSONObj> _seenIds; + + GeoPoint _cur; + bool _firstCall; + bool _noted; + + long long _nscanned; + long long _nDirtied; + long long _nChangedOnYield; + long long _nRemovedOnYield; + + // The current box we're expanding (-1 is first/center box) + int _neighbor; + + // The points we've found so far + // TODO: Long long? + int _foundInExp; + + // The current hash prefix we're expanding and the center-box hash prefix + GeoHash _prefix; + shared_ptr<GeoHash> _lastPrefix; + GeoHash _centerPrefix; + list<string> _fringe; + int recurseDepth; + Box _centerBox; + + // Start and end of our search range in the current box + BtreeLocation _min; + BtreeLocation _max; + + shared_ptr<GeoHash> _expPrefix; + mutable vector<GeoHash> _expPrefixes; + + }; + + + class GeoHopper : public GeoBrowse { + public: + typedef multiset<GeoPoint> Holder; + + GeoHopper( const Geo2dType * g , unsigned max , const Point& n , const BSONObj& filter = BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = true ) + : GeoBrowse( g, "search", filter, uniqueDocs, needDistance ), _max( max ) , _near( n ), _maxDistance( maxDistance ), _type( type ), _distError( type == GEO_PLAIN ? g->_error : g->_errorSphere ), _farthest(0) + {} + + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { + + // Always check approximate distance, since it lets us avoid doing + // checks of the rest of the object if it succeeds + + switch (_type) { + case GEO_PLAIN: + d = _near.distance( p ); + break; + case GEO_SPHERE: + checkEarthBounds( p ); + d = spheredist_deg( _near, p ); + break; + default: assert( false ); + } + assert( d >= 0 ); + + GEODEBUG( "\t\t\t\t\t\t\t checkDistance " << _near.toString() + << "\t" << p.toString() << "\t" << d + << " farthest: " << farthest() ); + + // If we need more points + double borderDist = ( _points.size() < _max ? _maxDistance : farthest() ); + + if( d >= borderDist - 2 * _distError && d <= borderDist + 2 * _distError ) return BORDER; + else return d < borderDist ? 
GOOD : BAD;
+
+        }
+
+        virtual bool exactDocCheck( const Point& p, double& d ){
+
+            bool within = false;
+
+            // Get the appropriate distance for the type
+            switch ( _type ) {
+            case GEO_PLAIN:
+                d = _near.distance( p );
+                within = _near.distanceWithin( p, _maxDistance );
+                break;
+            case GEO_SPHERE:
+                checkEarthBounds( p );
+                d = spheredist_deg( _near, p );
+                within = ( d <= _maxDistance );
+                break;
+            default: assert( false );
+            }
+
+            return within;
+        }
+
+        // Always in distance units, whether radians or normal
+        double farthest() const {
+            return _farthest;
+        }
+
+        virtual int addSpecific( const GeoKeyNode& node, const Point& keyP, bool onBounds, double keyD, bool potentiallyNewDoc ) {
+
+            // Unique documents
+
+            GeoPoint newPoint( node, keyD, false );
+
+            int prevSize = _points.size();
+
+            // STEP 1 : Remove old duplicate points from the set if needed
+            if( _uniqueDocs ){
+
+                // Lookup old point with same doc
+                map< DiskLoc , Holder::iterator >::iterator oldPointIt = _seenPts.find( newPoint.loc() );
+
+                if( oldPointIt != _seenPts.end() ){
+                    const GeoPoint& oldPoint = *(oldPointIt->second);
+                    // We don't need to care if we've already seen this same approx pt or better,
+                    // or we've already gone to disk once for the point
+                    if( oldPoint < newPoint ){
+                        GEODEBUG( "\t\tOld point closer than new point" );
+                        return 0;
+                    }
+                    GEODEBUG( "\t\tErasing old point " << oldPointIt->first.obj() );
+                    _points.erase( oldPointIt->second );
+                }
+            }
+
+            Holder::iterator newIt = _points.insert( newPoint );
+            if( _uniqueDocs ) _seenPts[ newPoint.loc() ] = newIt;
+
+            GEODEBUG( "\t\tInserted new point " << newPoint.toString() << " approx : " << keyD );
+
+            assert( _max > 0 );
+
+            Holder::iterator lastPtIt = _points.end();
+            lastPtIt--;
+            _farthest = lastPtIt->distance() + 2 * _distError;
+
+            return _points.size() - prevSize;
+
+        }
+
+        // Removes extra points from end of _points set.
+        // Check can be a bit costly if we have lots of exact points near borders,
+        // so we'll do this every once in a while.
+        void processExtraPoints(){
+
+            if( _points.size() == 0 ) return;
+
+            int prevSize = _points.size();
+
+            // Erase all points from the set with a position >= _max *and*
+            // whose distance isn't close to the _max - 1 position distance
+
+            int numToErase = _points.size() - _max;
+            if( numToErase < 0 ) numToErase = 0;
+
+            // Get the first point definitely in the _points array
+            Holder::iterator startErase = _points.end();
+            for( int i = 0; i < numToErase + 1; i++ ) startErase--;
+            _farthest = startErase->distance() + 2 * _distError;
+
+            GEODEBUG( "\t\tPotentially erasing " << numToErase << " points, " << " size : " << _points.size() << " max : " << _max << " dist : " << startErase->distance() << " farthest dist : " << _farthest << " from error : " << _distError );
+
+            startErase++;
+            while( numToErase > 0 && startErase->distance() <= _farthest ){
+                GEODEBUG( "\t\tNot erasing point " << startErase->toString() );
+                numToErase--;
+                startErase++;
+                assert( startErase != _points.end() || numToErase == 0 );
+            }
+
+            if( _uniqueDocs ){
+                for( Holder::iterator i = startErase; i != _points.end(); ++i )
+                    _seenPts.erase( i->loc() );
+            }
+
+            _points.erase( startErase, _points.end() );
+
+            int diff = _points.size() - prevSize;
+            if( diff > 0 ) _found += diff;
+            else _found -= -diff;
+
+        }
+
+        unsigned _max;
+        Point _near;
+        Holder _points;
+        double _maxDistance;
+        GeoDistType _type;
+        double _distError;
+        double _farthest;
+
+        // Safe to use currently since we don't yield in $near searches.
If we do start to yield, we may need to + // replace dirtied disklocs in our holder / ensure our logic is correct. + map< DiskLoc , Holder::iterator > _seenPts; + + }; + + + + class GeoSearch : public GeoHopper { + public: + GeoSearch( const Geo2dType * g , const Point& startPt , int numWanted=100 , BSONObj filter=BSONObj() , double maxDistance = numeric_limits<double>::max() , GeoDistType type=GEO_PLAIN, bool uniqueDocs = false, bool needDistance = false ) + : GeoHopper( g , numWanted , startPt , filter , maxDistance, type, uniqueDocs, needDistance ), + _start( g->hash( startPt._x, startPt._y ) ), + // TODO: Remove numWanted... + _numWanted( numWanted ), + _type(type) + { + + assert( g->getDetails() ); + _nscanned = 0; + _found = 0; + + if( _maxDistance < 0 ){ + _scanDistance = numeric_limits<double>::max(); + } + else if (type == GEO_PLAIN) { + _scanDistance = maxDistance + _spec->_error; + } + else if (type == GEO_SPHERE) { + checkEarthBounds( startPt ); + // TODO: consider splitting into x and y scan distances + _scanDistance = computeXScanDistance( startPt._y, rad2deg( _maxDistance ) + _spec->_error ); + } + + assert( _scanDistance > 0 ); + + } + + + /** Check if we've already looked at a key. ALSO marks as seen, anticipating a follow-up call + to add(). This is broken out to avoid some work extracting the key bson if it's an + already seen point. + */ + private: + set< pair<DiskLoc,int> > _seen; + public: + + void exec() { + + if( _numWanted == 0 ) return; + + /* + * Search algorithm + * 1) use geohash prefix to find X items + * 2) compute max distance from want to an item + * 3) find optimal set of boxes that complete circle + * 4) use regular btree cursors to scan those boxes + */ + +#ifdef GEODEBUGGING + + log() << "start near search for " << _numWanted << " points near " << _near << " (max dist " << _maxDistance << ")" << endl; + +#endif + + // Part 1 + { + do { + long long f = found(); + assert( f <= 0x7fffffff ); + fillStack( maxPointsHeuristic, _numWanted - static_cast<int>(f) , true ); + processExtraPoints(); + } while( _state != DONE && _state != DONE_NEIGHBOR && + found() < _numWanted && + (! 
_prefix.constrains() || _g->sizeEdge( _prefix ) <= _scanDistance ) ); + + // If we couldn't scan or scanned everything, we're done + if( _state == DONE ){ + expandEndPoints(); + return; + } + } + +#ifdef GEODEBUGGING + + log() << "part 1 of near search completed, found " << found() << " points (out of " << _foundInExp << " scanned)" + << " in expanded region " << _prefix << " @ " << Box( _g, _prefix ) + << " with furthest distance " << farthest() << endl; + +#endif + + // Part 2 + { + + // Find farthest distance for completion scan + double farDist = farthest(); + if( found() < _numWanted ) { + // Not enough found in Phase 1 + farDist = _scanDistance; + } + else if ( _type == GEO_PLAIN ) { + // Enough found, but need to search neighbor boxes + farDist += _spec->_error; + } + else if ( _type == GEO_SPHERE ) { + // Enough found, but need to search neighbor boxes + farDist = std::min( _scanDistance, computeXScanDistance( _near._y, rad2deg( farDist ) ) + 2 * _spec->_error ); + } + assert( farDist >= 0 ); + GEODEBUGPRINT( farDist ); + + // Find the box that includes all the points we need to return + _want = Box( _near._x - farDist , _near._y - farDist , farDist * 2 ); + GEODEBUGPRINT( _want.toString() ); + + // log() << "Found : " << found() << " wanted : " << _numWanted << " Far distance : " << farDist << " box : " << _want << endl; + + // Remember the far distance for further scans + _scanDistance = farDist; + + // Reset the search, our distances have probably changed + if( _state == DONE_NEIGHBOR ){ + _state = DOING_EXPAND; + _neighbor = -1; + } + +#ifdef GEODEBUGGING + + log() << "resetting search with start at " << _start << " (edge length " << _g->sizeEdge( _start ) << ")" << endl; + +#endif + + // Do regular search in the full region + do { + fillStack( maxPointsHeuristic ); + processExtraPoints(); + } + while( _state != DONE ); + + } + + GEODEBUG( "done near search with " << _points.size() << " points " ); + + expandEndPoints(); + + } + + void addExactPoints( const GeoPoint& pt, Holder& points, bool force ){ + int before, after; + addExactPoints( pt, points, before, after, force ); + } + + void addExactPoints( const GeoPoint& pt, Holder& points, int& before, int& after, bool force ){ + + before = 0; + after = 0; + + GEODEBUG( "Adding exact points for " << pt.toString() ); + + if( pt.isExact() ){ + if( force ) points.insert( pt ); + return; + } + + vector<BSONObj> locs; + getPointsFor( pt.key(), pt.obj(), locs, _uniqueDocs ); + + GeoPoint nearestPt( pt, -1, true ); + + for( vector<BSONObj>::iterator i = locs.begin(); i != locs.end(); i++ ){ + + Point loc( *i ); + + double d; + if( ! exactDocCheck( loc, d ) ) continue; + + if( _uniqueDocs && ( nearestPt.distance() < 0 || d < nearestPt.distance() ) ){ + nearestPt._distance = d; + nearestPt._pt = *i; + continue; + } + else if( ! _uniqueDocs ){ + GeoPoint exactPt( pt, d, true ); + exactPt._pt = *i; + GEODEBUG( "Inserting exact pt " << exactPt.toString() << " for " << pt.toString() << " exact : " << d << " is less? " << ( exactPt < pt ) << " bits : " << _g->_bits ); + points.insert( exactPt ); + exactPt < pt ? before++ : after++; + } + + } + + if( _uniqueDocs && nearestPt.distance() >= 0 ){ + GEODEBUG( "Inserting unique exact pt " << nearestPt.toString() << " for " << pt.toString() << " exact : " << nearestPt.distance() << " is less? 
" << ( nearestPt < pt ) << " bits : " << _g->_bits ); + points.insert( nearestPt ); + if( nearestPt < pt ) before++; + else after++; + } + + } + + // TODO: Refactor this back into holder class, allow to run periodically when we are seeing a lot of pts + void expandEndPoints( bool finish = true ){ + + processExtraPoints(); + + // All points in array *could* be in maxDistance + + // Step 1 : Trim points to max size + // TODO: This check will do little for now, but is skeleton for future work in incremental $near + // searches + if( _max > 0 ){ + + int numToErase = _points.size() - _max; + + if( numToErase > 0 ){ + + Holder tested; + + // Work backward through all points we're not sure belong in the set + Holder::iterator maybePointIt = _points.end(); + maybePointIt--; + double approxMin = maybePointIt->distance() - 2 * _distError; + + GEODEBUG( "\t\tNeed to erase " << numToErase << " max : " << _max << " min dist " << approxMin << " error : " << _distError << " starting from : " << (*maybePointIt).toString() ); + + // Insert all + int erased = 0; + while( _points.size() > 0 && ( maybePointIt->distance() >= approxMin || erased < numToErase ) ){ + + Holder::iterator current = maybePointIt--; + + addExactPoints( *current, tested, true ); + _points.erase( current ); + erased++; + + if( tested.size() ) + approxMin = tested.begin()->distance() - 2 * _distError; + + } + + GEODEBUG( "\t\tEnding search at point " << ( _points.size() == 0 ? "(beginning)" : maybePointIt->toString() ) ); + + int numToAddBack = erased - numToErase; + assert( numToAddBack >= 0 ); + + GEODEBUG( "\t\tNum tested valid : " << tested.size() << " erased : " << erased << " added back : " << numToAddBack ); + +#ifdef GEODEBUGGING + for( Holder::iterator it = tested.begin(); it != tested.end(); it++ ){ + log() << "Tested Point: " << *it << endl; + } +#endif + Holder::iterator testedIt = tested.begin(); + for( int i = 0; i < numToAddBack && testedIt != tested.end(); i++ ){ + _points.insert( *testedIt ); + testedIt++; + } + } + } + +#ifdef GEODEBUGGING + for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){ + log() << "Point: " << *it << endl; + } +#endif + // We've now trimmed first set of unneeded points + + GEODEBUG( "\t\t Start expanding, num points : " << _points.size() << " max : " << _max ); + + // Step 2: iterate through all points and add as needed + + unsigned expandedPoints = 0; + Holder::iterator it = _points.begin(); + double expandWindowEnd = -1; + while( it != _points.end() ){ + const GeoPoint& currPt = *it; + + // TODO: If one point is exact, maybe not 2 * _distError + + // See if we're in an expand window + bool inWindow = currPt.distance() <= expandWindowEnd; + // If we're not, and we're done with points, break + if( ! inWindow && expandedPoints >= _max ) break; + + bool expandApprox = ! currPt.isExact() && ( ! _uniqueDocs || ( finish && _needDistance ) || inWindow ); + + if( expandApprox ){ + + // Add new point(s) + // These will only be added in a radius of 2 * _distError around the current point, + // so should not affect previously valid points. 
+ int before, after; + addExactPoints( currPt, _points, before, after, false ); + expandedPoints += before; + + if( _max > 0 && expandedPoints < _max ) + expandWindowEnd = currPt.distance() + 2 * _distError; + + // Iterate to the next point + Holder::iterator current = it++; + // Erase the current point + _points.erase( current ); + + } + else{ + expandedPoints++; + it++; + } + } + + GEODEBUG( "\t\tFinished expanding, num points : " << _points.size() << " max : " << _max ); + + // Finish + // TODO: Don't really need to trim? + for( ; expandedPoints > _max; expandedPoints-- ) it--; + _points.erase( it, _points.end() ); + +#ifdef GEODEBUGGING + for( Holder::iterator it = _points.begin(); it != _points.end(); it++ ){ + log() << "Point: " << *it << endl; + } +#endif + } + + virtual GeoHash expandStartHash(){ + return _start; + } + + // Whether the current box width is big enough for our search area + virtual bool fitsInBox( double width ){ + return width >= _scanDistance; + } + + // Whether the current box overlaps our search area + virtual double intersectsBox( Box& cur ){ + return cur.intersects( _want ); + } + + GeoHash _start; + int _numWanted; + double _scanDistance; + + long long _nscanned; + int _found; + GeoDistType _type; + + Box _want; + }; + + class GeoSearchCursor : public GeoCursorBase { + public: + + GeoSearchCursor( shared_ptr<GeoSearch> s ) + : GeoCursorBase( s->_spec ) , + _s( s ) , _cur( s->_points.begin() ) , _end( s->_points.end() ), _nscanned() { + if ( _cur != _end ) { + ++_nscanned; + } + } + + virtual ~GeoSearchCursor() {} + + virtual bool ok() { + return _cur != _end; + } + + virtual Record* _current() { assert(ok()); return _cur->_loc.rec(); } + virtual BSONObj current() { assert(ok()); return _cur->_o; } + virtual DiskLoc currLoc() { assert(ok()); return _cur->_loc; } + virtual bool advance() { + if( ok() ){ + _cur++; + incNscanned(); + return ok(); + } + return false; + } + virtual BSONObj currKey() const { return _cur->_key; } + + virtual string toString() { + return "GeoSearchCursor"; + } + + + virtual BSONObj prettyStartKey() const { + return BSON( _s->_g->_geo << _s->_prefix.toString() ); + } + virtual BSONObj prettyEndKey() const { + GeoHash temp = _s->_prefix; + temp.move( 1 , 1 ); + return BSON( _s->_g->_geo << temp.toString() ); + } + + virtual long long nscanned() { return _nscanned; } + + virtual CoveredIndexMatcher* matcher() const { + if( _s->_matcher.get() ) return _s->_matcher.get(); + else return emptyMatcher.get(); + } + + virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { + if( _s->_matcher.get() ) return _s->_matcher; + else return emptyMatcher; + } + + shared_ptr<GeoSearch> _s; + GeoHopper::Holder::iterator _cur; + GeoHopper::Holder::iterator _end; + + void incNscanned() { if ( ok() ) { ++_nscanned; } } + long long _nscanned; + }; + + class GeoCircleBrowse : public GeoBrowse { + public: + + GeoCircleBrowse( const Geo2dType * g , const BSONObj& circle , BSONObj filter = BSONObj() , const string& type="$center", bool uniqueDocs = true ) + : GeoBrowse( g , "circle" , filter, uniqueDocs ) { + + uassert( 13060 , "$center needs 2 fields (middle,max distance)" , circle.nFields() == 2 ); + + BSONObjIterator i(circle); + BSONElement center = i.next(); + + uassert( 13656 , "the first field of $center object must be a location object" , center.isABSONObj() ); + + // Get geohash and exact center point + // TODO: For wrapping search, may be useful to allow center points outside-of-bounds here. 
+            // Calculating the nearest point as a hash start inside the region would then be required.
+            _start = g->_tohash(center);
+            _startPt = Point(center);
+
+            _maxDistance = i.next().numberDouble();
+            uassert( 13061 , "need a max distance >= 0 " , _maxDistance >= 0 );
+
+            if (type == "$center") {
+                // Look in box with bounds of maxDistance in either direction
+                _type = GEO_PLAIN;
+                _xScanDistance = _maxDistance + _g->_error;
+                _yScanDistance = _maxDistance + _g->_error;
+            }
+            else if (type == "$centerSphere") {
+                // Same, but compute maxDistance using spherical transform
+
+                uassert(13461, "Spherical MaxDistance > PI. Are you sure you are using radians?", _maxDistance < M_PI);
+                checkEarthBounds( _startPt );
+
+                _type = GEO_SPHERE;
+                _yScanDistance = rad2deg( _maxDistance ) + _g->_error;
+                _xScanDistance = computeXScanDistance(_startPt._y, _yScanDistance);
+
+                uassert(13462, "Spherical distance would require wrapping, which isn't implemented yet",
+                        (_startPt._x + _xScanDistance < 180) && (_startPt._x - _xScanDistance > -180) &&
+                        (_startPt._y + _yScanDistance < 90) && (_startPt._y - _yScanDistance > -90));
+            }
+            else {
+                uassert(13460, "invalid $center query type: " + type, false);
+            }
+
+            // Bounding box includes fudge factor.
+            // TODO: Is this correct, since fudge factor may be spherically transformed?
+            _bBox._min = Point( _startPt._x - _xScanDistance, _startPt._y - _yScanDistance );
+            _bBox._max = Point( _startPt._x + _xScanDistance, _startPt._y + _yScanDistance );
+
+            GEODEBUG( "Bounding box for circle query : " << _bBox.toString() << " (max distance : " << _maxDistance << ")" << " starting from " << _startPt.toString() );
+
+            ok();
+        }
+
+        virtual GeoHash expandStartHash() {
+            return _start;
+        }
+
+        virtual bool fitsInBox( double width ) {
+            return width >= std::max(_xScanDistance, _yScanDistance);
+        }
+
+        virtual double intersectsBox( Box& cur ) {
+            return cur.intersects( _bBox );
+        }
+
+        virtual KeyResult approxKeyCheck( const Point& p, double& d ) {
+
+            // Inexact hash distance checks.
+            double error = 0;
+            switch (_type) {
+            case GEO_PLAIN:
+                d = _startPt.distance( p );
+                error = _g->_error;
+                break;
+            case GEO_SPHERE: {
+                checkEarthBounds( p );
+                d = spheredist_deg( _startPt, p );
+                error = _g->_errorSphere;
+                break;
+            }
+            default: assert( false );
+            }
+
+            // If our distance is in the error bounds...
+            if( d >= _maxDistance - error && d <= _maxDistance + error ) return BORDER;
+            return d > _maxDistance ? BAD : GOOD;
+        }
+
+        virtual bool exactDocCheck( const Point& p, double& d ){
+
+            switch (_type) {
+            case GEO_PLAIN: {
+                if( _startPt.distanceWithin( p, _maxDistance ) ) return true;
+                break;
+            }
+            case GEO_SPHERE:
+                checkEarthBounds( p );
+                if( spheredist_deg( _startPt , p ) <= _maxDistance ) return true;
+                break;
+            default: assert( false );
+            }
+
+            return false;
+        }
+
+        GeoDistType _type;
+        GeoHash _start;
+        Point _startPt;
+        double _maxDistance; // user input
+        double _xScanDistance; // affected by GeoDistType
+        double _yScanDistance; // affected by GeoDistType
+        Box _bBox;
+
+    };
+
+    class GeoBoxBrowse : public GeoBrowse {
+    public:
+
+        GeoBoxBrowse( const Geo2dType * g , const BSONObj& box , BSONObj filter = BSONObj(), bool uniqueDocs = true )
+            : GeoBrowse( g , "box" , filter, uniqueDocs ) {
+
+            uassert( 13063 , "$box needs 2 fields (bottomLeft,topRight)" , box.nFields() == 2 );
+
+            // Initialize an *exact* box from the given obj.
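+            // (Added example: `box` is the user-supplied corner pair, e.g. from
+            //     { loc : { $within : { $box : [ [ 0, 0 ], [ 10, 10 ] ] } } }
+            // giving bottomLeft [ 0, 0 ] and topRight [ 10, 10 ].)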
+ BSONObjIterator i(box); + _want._min = Point( i.next() ); + _want._max = Point( i.next() ); + + _wantRegion = _want; + _wantRegion.fudge( g ); // Need to make sure we're checking regions within error bounds of where we want + fixBox( g, _wantRegion ); + fixBox( g, _want ); + + uassert( 13064 , "need an area > 0 " , _want.area() > 0 ); + + Point center = _want.center(); + _start = _g->hash( center._x , center._y ); + + GEODEBUG( "center : " << center.toString() << "\t" << _prefix ); + + _fudge = _g->_error; + _wantLen = _fudge + + std::max( ( _want._max._x - _want._min._x ) , + ( _want._max._y - _want._min._y ) ) / 2; + + ok(); + } + + void fixBox( const Geo2dType* g, Box& box ) { + if( box._min._x > box._max._x ) + swap( box._min._x, box._max._x ); + if( box._min._y > box._max._y ) + swap( box._min._y, box._max._y ); + + double gMin = g->_min; + double gMax = g->_max; + + if( box._min._x < gMin ) box._min._x = gMin; + if( box._min._y < gMin ) box._min._y = gMin; + if( box._max._x > gMax) box._max._x = gMax; + if( box._max._y > gMax ) box._max._y = gMax; + } + + void swap( double& a, double& b ) { + double swap = a; + a = b; + b = swap; + } + + virtual GeoHash expandStartHash() { + return _start; + } + + virtual bool fitsInBox( double width ) { + return width >= _wantLen; + } + + virtual double intersectsBox( Box& cur ) { + return cur.intersects( _wantRegion ); + } + + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { + if( _want.onBoundary( p, _fudge ) ) return BORDER; + else return _want.inside( p, _fudge ) ? GOOD : BAD; + + } + + virtual bool exactDocCheck( const Point& p, double& d ){ + return _want.inside( p ); + } + + Box _want; + Box _wantRegion; + double _wantLen; + double _fudge; + + GeoHash _start; + + }; + + class GeoPolygonBrowse : public GeoBrowse { + public: + + GeoPolygonBrowse( const Geo2dType* g , const BSONObj& polyPoints , + BSONObj filter = BSONObj(), bool uniqueDocs = true ) : GeoBrowse( g , "polygon" , filter, uniqueDocs ) { + + GEODEBUG( "In Polygon" ) + + BSONObjIterator i( polyPoints ); + BSONElement first = i.next(); + _poly.add( Point( first ) ); + + while ( i.more() ) { + _poly.add( Point( i.next() ) ); + } + + uassert( 14030, "polygon must be defined by three points or more", _poly.size() >= 3 ); + + _bounds = _poly.bounds(); + _bounds.fudge( g ); // We need to check regions within the error bounds of these bounds + _bounds.truncate( g ); // We don't need to look anywhere outside the space + + _maxDim = _g->_error + _bounds.maxDim() / 2; + + ok(); + } + + // The initial geo hash box for our first expansion + virtual GeoHash expandStartHash() { + return _g->hash( _bounds.center() ); + } + + // Whether the current box width is big enough for our search area + virtual bool fitsInBox( double width ) { + return _maxDim <= width; + } + + // Whether the current box overlaps our search area + virtual double intersectsBox( Box& cur ) { + return cur.intersects( _bounds ); + } + + virtual KeyResult approxKeyCheck( const Point& p, double& d ) { + + int in = _poly.contains( p, _g->_error ); + + if( in == 0 ) return BORDER; + else return in > 0 ? 
GOOD : BAD; + + } + + virtual bool exactDocCheck( const Point& p, double& d ){ + return _poly.contains( p ); + } + + private: + + Polygon _poly; + Box _bounds; + double _maxDim; + + GeoHash _start; + }; + + shared_ptr<Cursor> Geo2dType::newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const { + if ( numWanted < 0 ) + numWanted = numWanted * -1; + else if ( numWanted == 0 ) + numWanted = 100; + + BSONObjIterator i(query); + while ( i.more() ) { + BSONElement e = i.next(); + + if ( _geo != e.fieldName() ) + continue; + + if ( e.type() == Array ) { + // If we get an array query, assume it is a location, and do a $within { $center : [[x, y], 0] } search + shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ), "$center", true ) ); + return c; + } + else if ( e.type() == Object ) { + + // TODO: Filter out _geo : { $special... } field so it doesn't get matched accidentally, + // if matcher changes + + switch ( e.embeddedObject().firstElement().getGtLtOp() ) { + case BSONObj::opNEAR: { + BSONObj n = e.embeddedObject(); + e = n.firstElement(); + + const char* suffix = e.fieldName() + 5; // strlen("$near") == 5; + GeoDistType type; + if (suffix[0] == '\0') { + type = GEO_PLAIN; + } + else if (strcmp(suffix, "Sphere") == 0) { + type = GEO_SPHERE; + } + else { + uassert(13464, string("invalid $near search type: ") + e.fieldName(), false); + type = GEO_PLAIN; // prevents uninitialized warning + } + + double maxDistance = numeric_limits<double>::max(); + if ( e.isABSONObj() && e.embeddedObject().nFields() > 2 ) { + BSONObjIterator i(e.embeddedObject()); + i.next(); + i.next(); + BSONElement e = i.next(); + if ( e.isNumber() ) + maxDistance = e.numberDouble(); + } + { + BSONElement e = n["$maxDistance"]; + if ( e.isNumber() ) + maxDistance = e.numberDouble(); + } + + bool uniqueDocs = false; + if( ! n["$uniqueDocs"].eoo() ) uniqueDocs = n["$uniqueDocs"].trueValue(); + + shared_ptr<GeoSearch> s( new GeoSearch( this , Point( e ) , numWanted , query , maxDistance, type, uniqueDocs ) ); + s->exec(); + shared_ptr<Cursor> c; + c.reset( new GeoSearchCursor( s ) ); + return c; + } + case BSONObj::opWITHIN: { + + e = e.embeddedObject().firstElement(); + uassert( 13057 , "$within has to take an object or array" , e.isABSONObj() ); + + BSONObj context = e.embeddedObject(); + e = e.embeddedObject().firstElement(); + string type = e.fieldName(); + + bool uniqueDocs = true; + if( ! context["$uniqueDocs"].eoo() ) uniqueDocs = context["$uniqueDocs"].trueValue(); + + if ( startsWith(type, "$center") ) { + uassert( 13059 , "$center has to take an object or array" , e.isABSONObj() ); + shared_ptr<Cursor> c( new GeoCircleBrowse( this , e.embeddedObjectUserCheck() , query , type, uniqueDocs ) ); + return c; + } + else if ( type == "$box" ) { + uassert( 13065 , "$box has to take an object or array" , e.isABSONObj() ); + shared_ptr<Cursor> c( new GeoBoxBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) ); + return c; + } + else if ( startsWith( type, "$poly" ) ) { + uassert( 14029 , "$polygon has to take an object or array" , e.isABSONObj() ); + shared_ptr<Cursor> c( new GeoPolygonBrowse( this , e.embeddedObjectUserCheck() , query, uniqueDocs ) ); + return c; + } + throw UserException( 13058 , str::stream() << "unknown $within information : " << context << ", a shape must be specified." ); + } + default: + // Otherwise... 
assume the object defines a point, and we want to do a zero-radius $within $center + shared_ptr<Cursor> c( new GeoCircleBrowse( this , BSON( "0" << e.embeddedObjectUserCheck() << "1" << 0 ), query.filterFieldsUndotted( BSON( _geo << "" ), false ) ) ); + return c; + } + } + } + + throw UserException( 13042 , (string)"missing geo field (" + _geo + ") in : " + query.toString() ); + } + + // ------ + // commands + // ------ + + class Geo2dFindNearCmd : public Command { + public: + Geo2dFindNearCmd() : Command( "geoNear" ) {} + virtual LockType locktype() const { return READ; } + bool slaveOk() const { return true; } + void help(stringstream& h) const { h << "http://www.mongodb.org/display/DOCS/Geospatial+Indexing#GeospatialIndexing-geoNearCommand"; } + bool slaveOverrideOk() { return true; } + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string ns = dbname + "." + cmdObj.firstElement().valuestr(); + + NamespaceDetails * d = nsdetails( ns.c_str() ); + if ( ! d ) { + errmsg = "can't find ns"; + return false; + } + + vector<int> idxs; + d->findIndexByType( GEO2DNAME , idxs ); + + if ( idxs.size() > 1 ) { + errmsg = "more than 1 geo indexes :("; + return false; + } + + if ( idxs.size() == 0 ) { + errmsg = "no geo index :("; + return false; + } + + int geoIdx = idxs[0]; + + result.append( "ns" , ns ); + + IndexDetails& id = d->idx( geoIdx ); + Geo2dType * g = (Geo2dType*)id.getSpec().getType(); + assert( &id == g->getDetails() ); + + int numWanted = 100; + if ( cmdObj["num"].isNumber() ) { + numWanted = cmdObj["num"].numberInt(); + assert( numWanted >= 0 ); + } + + bool uniqueDocs = false; + if( ! cmdObj["uniqueDocs"].eoo() ) uniqueDocs = cmdObj["uniqueDocs"].trueValue(); + + bool includeLocs = false; + if( ! cmdObj["includeLocs"].eoo() ) includeLocs = cmdObj["includeLocs"].trueValue(); + + uassert(13046, "'near' param missing/invalid", !cmdObj["near"].eoo()); + const Point n( cmdObj["near"] ); + result.append( "near" , g->_tohash( cmdObj["near"] ).toString() ); + + BSONObj filter; + if ( cmdObj["query"].type() == Object ) + filter = cmdObj["query"].embeddedObject(); + + double maxDistance = numeric_limits<double>::max(); + if ( cmdObj["maxDistance"].isNumber() ) + maxDistance = cmdObj["maxDistance"].number(); + + GeoDistType type = GEO_PLAIN; + if ( cmdObj["spherical"].trueValue() ) + type = GEO_SPHERE; + + GeoSearch gs( g , n , numWanted , filter , maxDistance , type, uniqueDocs, true ); + + if ( cmdObj["start"].type() == String) { + GeoHash start ((string) cmdObj["start"].valuestr()); + gs._start = start; + } + + gs.exec(); + + double distanceMultiplier = 1; + if ( cmdObj["distanceMultiplier"].isNumber() ) + distanceMultiplier = cmdObj["distanceMultiplier"].number(); + + double totalDistance = 0; + + BSONObjBuilder arr( result.subarrayStart( "results" ) ); + int x = 0; + for ( GeoHopper::Holder::iterator i=gs._points.begin(); i!=gs._points.end(); i++ ) { + + const GeoPoint& p = *i; + double dis = distanceMultiplier * p.distance(); + totalDistance += dis; + + BSONObjBuilder bb( arr.subobjStart( BSONObjBuilder::numStr( x++ ) ) ); + bb.append( "dis" , dis ); + if( includeLocs ){ + if( p._pt.couldBeArray() ) bb.append( "loc", BSONArray( p._pt ) ); + else bb.append( "loc" , p._pt ); + } + bb.append( "obj" , p._o ); + bb.done(); + + if ( arr.len() > BSONObjMaxUserSize ) { + warning() << "Too many results to fit in single document. Truncating..." 
<< endl; + break; + } + } + arr.done(); + + BSONObjBuilder stats( result.subobjStart( "stats" ) ); + stats.append( "time" , cc().curop()->elapsedMillis() ); + stats.appendNumber( "btreelocs" , gs._nscanned ); + stats.appendNumber( "nscanned" , gs._lookedAt ); + stats.appendNumber( "objectsLoaded" , gs._objectsLoaded ); + stats.append( "avgDistance" , totalDistance / x ); + stats.append( "maxDistance" , gs.farthest() ); + stats.done(); + + return true; + } + + } geo2dFindNearCmd; + + class GeoWalkCmd : public Command { + public: + GeoWalkCmd() : Command( "geoWalk" ) {} + virtual LockType locktype() const { return READ; } + bool slaveOk() const { return true; } + bool slaveOverrideOk() { return true; } + bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + string ns = dbname + "." + cmdObj.firstElement().valuestr(); + + NamespaceDetails * d = nsdetails( ns.c_str() ); + if ( ! d ) { + errmsg = "can't find ns"; + return false; + } + + int geoIdx = -1; + { + NamespaceDetails::IndexIterator ii = d->ii(); + while ( ii.more() ) { + IndexDetails& id = ii.next(); + if ( id.getSpec().getTypeName() == GEO2DNAME ) { + if ( geoIdx >= 0 ) { + errmsg = "2 geo indexes :("; + return false; + } + geoIdx = ii.pos() - 1; + } + } + } + + if ( geoIdx < 0 ) { + errmsg = "no geo index :("; + return false; + } + + + IndexDetails& id = d->idx( geoIdx ); + Geo2dType * g = (Geo2dType*)id.getSpec().getType(); + assert( &id == g->getDetails() ); + + int max = 100000; + + auto_ptr<BtreeCursor> bc( BtreeCursor::make( d , geoIdx , id , BSONObj() , BSONObj() , true , 1 ) ); + BtreeCursor &c = *bc; + while ( c.ok() && max-- ) { + GeoHash h( c.currKey().firstElement() ); + int len; + cout << "\t" << h.toString() + << "\t" << c.current()[g->_geo] + << "\t" << hex << h.getHash() + << "\t" << hex << ((long long*)c.currKey().firstElement().binData(len))[0] + << "\t" << c.current()["_id"] + << endl; + c.advance(); + } + + return true; + } + + } geoWalkCmd; + + struct GeoUnitTest : public UnitTest { + + int round( double d ) { + return (int)(.5+(d*1000)); + } + +#define GEOHEQ(a,b) if ( a.toString() != b ){ cout << "[" << a.toString() << "] != [" << b << "]" << endl; assert( a == GeoHash(b) ); } + + void run() { + assert( ! GeoHash::isBitSet( 0 , 0 ) ); + assert( ! 
GeoHash::isBitSet( 0 , 31 ) ); + assert( GeoHash::isBitSet( 1 , 31 ) ); + + IndexSpec i( BSON( "loc" << "2d" ) ); + Geo2dType g( &geo2dplugin , &i ); + { + double x = 73.01212; + double y = 41.352964; + BSONObj in = BSON( "x" << x << "y" << y ); + GeoHash h = g._hash( in ); + BSONObj out = g._unhash( h ); + assert( round(x) == round( out["x"].number() ) ); + assert( round(y) == round( out["y"].number() ) ); + assert( round( in["x"].number() ) == round( out["x"].number() ) ); + assert( round( in["y"].number() ) == round( out["y"].number() ) ); + } + + { + double x = -73.01212; + double y = 41.352964; + BSONObj in = BSON( "x" << x << "y" << y ); + GeoHash h = g._hash( in ); + BSONObj out = g._unhash( h ); + assert( round(x) == round( out["x"].number() ) ); + assert( round(y) == round( out["y"].number() ) ); + assert( round( in["x"].number() ) == round( out["x"].number() ) ); + assert( round( in["y"].number() ) == round( out["y"].number() ) ); + } + + { + GeoHash h( "0000" ); + h.move( 0 , 1 ); + GEOHEQ( h , "0001" ); + h.move( 0 , -1 ); + GEOHEQ( h , "0000" ); + + h.init( "0001" ); + h.move( 0 , 1 ); + GEOHEQ( h , "0100" ); + h.move( 0 , -1 ); + GEOHEQ( h , "0001" ); + + + h.init( "0000" ); + h.move( 1 , 0 ); + GEOHEQ( h , "0010" ); + } + + { + Box b( 5 , 5 , 2 ); + assert( "(5,5) -->> (7,7)" == b.toString() ); + } + + { + GeoHash a = g.hash( 1 , 1 ); + GeoHash b = g.hash( 4 , 5 ); + assert( 5 == (int)(g.distance( a , b ) ) ); + a = g.hash( 50 , 50 ); + b = g.hash( 42 , 44 ); + assert( round(10) == round(g.distance( a , b )) ); + } + + { + GeoHash x("0000"); + assert( 0 == x.getHash() ); + x.init( 0 , 1 , 32 ); + GEOHEQ( x , "0000000000000000000000000000000000000000000000000000000000000001" ) + + assert( GeoHash( "1100").hasPrefix( GeoHash( "11" ) ) ); + assert( ! GeoHash( "1000").hasPrefix( GeoHash( "11" ) ) ); + } + + { + GeoHash x("1010"); + GEOHEQ( x , "1010" ); + GeoHash y = x + "01"; + GEOHEQ( y , "101001" ); + } + + { + + GeoHash a = g.hash( 5 , 5 ); + GeoHash b = g.hash( 5 , 7 ); + GeoHash c = g.hash( 100 , 100 ); + /* + cout << "a: " << a << endl; + cout << "b: " << b << endl; + cout << "c: " << c << endl; + + cout << "a: " << a.toStringHex1() << endl; + cout << "b: " << b.toStringHex1() << endl; + cout << "c: " << c.toStringHex1() << endl; + */ + BSONObj oa = a.wrap(); + BSONObj ob = b.wrap(); + BSONObj oc = c.wrap(); + /* + cout << "a: " << oa.hexDump() << endl; + cout << "b: " << ob.hexDump() << endl; + cout << "c: " << oc.hexDump() << endl; + */ + assert( oa.woCompare( ob ) < 0 ); + assert( oa.woCompare( oc ) < 0 ); + + } + + { + GeoHash x( "000000" ); + x.move( -1 , 0 ); + GEOHEQ( x , "101010" ); + x.move( 1 , -1 ); + GEOHEQ( x , "010101" ); + x.move( 0 , 1 ); + GEOHEQ( x , "000000" ); + } + + { + GeoHash prefix( "110011000000" ); + GeoHash entry( "1100110000011100000111000001110000011100000111000001000000000000" ); + assert( ! entry.hasPrefix( prefix ) ); + + entry = GeoHash("1100110000001100000111000001110000011100000111000001000000000000"); + assert( entry.toString().find( prefix.toString() ) == 0 ); + assert( entry.hasPrefix( GeoHash( "1100" ) ) ); + assert( entry.hasPrefix( prefix ) ); + } + + { + GeoHash a = g.hash( 50 , 50 ); + GeoHash b = g.hash( 48 , 54 ); + assert( round( 4.47214 ) == round( g.distance( a , b ) ) ); + } + + + { + Box b( Point( 29.762283 , -95.364271 ) , Point( 29.764283000000002 , -95.36227099999999 ) ); + assert( b.inside( 29.763 , -95.363 ) ); + assert( ! b.inside( 32.9570255 , -96.1082497 ) ); + assert( ! 
b.inside( 32.9570255 , -96.1082497 , .01 ) ); + } + + { + GeoHash a( "11001111" ); + assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11") ) ); + assert( GeoHash( "11" ) == a.commonPrefix( GeoHash("11110000") ) ); + } + + { + int N = 10000; + { + Timer t; + for ( int i=0; i<N; i++ ) { + unsigned x = (unsigned)rand(); + unsigned y = (unsigned)rand(); + GeoHash h( x , y ); + unsigned a,b; + h.unhash_slow( a,b ); + assert( a == x ); + assert( b == y ); + } + //cout << "slow: " << t.millis() << endl; + } + + { + Timer t; + for ( int i=0; i<N; i++ ) { + unsigned x = (unsigned)rand(); + unsigned y = (unsigned)rand(); + GeoHash h( x , y ); + unsigned a,b; + h.unhash_fast( a,b ); + assert( a == x ); + assert( b == y ); + } + //cout << "fast: " << t.millis() << endl; + } + + } + + { + // see http://en.wikipedia.org/wiki/Great-circle_distance#Worked_example + + { + Point BNA (-86.67, 36.12); + Point LAX (-118.40, 33.94); + + double dist1 = spheredist_deg(BNA, LAX); + double dist2 = spheredist_deg(LAX, BNA); + + // target is 0.45306 + assert( 0.45305 <= dist1 && dist1 <= 0.45307 ); + assert( 0.45305 <= dist2 && dist2 <= 0.45307 ); + } + { + Point BNA (-1.5127, 0.6304); + Point LAX (-2.0665, 0.5924); + + double dist1 = spheredist_rad(BNA, LAX); + double dist2 = spheredist_rad(LAX, BNA); + + // target is 0.45306 + assert( 0.45305 <= dist1 && dist1 <= 0.45307 ); + assert( 0.45305 <= dist2 && dist2 <= 0.45307 ); + } + { + Point JFK (-73.77694444, 40.63861111 ); + Point LAX (-118.40, 33.94); + + double dist = spheredist_deg(JFK, LAX) * EARTH_RADIUS_MILES; + assert( dist > 2469 && dist < 2470 ); + } + + { + Point BNA (-86.67, 36.12); + Point LAX (-118.40, 33.94); + Point JFK (-73.77694444, 40.63861111 ); + assert( spheredist_deg(BNA, BNA) < 1e-6); + assert( spheredist_deg(LAX, LAX) < 1e-6); + assert( spheredist_deg(JFK, JFK) < 1e-6); + + Point zero (0, 0); + Point antizero (0,-180); + + // these were known to cause NaN + assert( spheredist_deg(zero, zero) < 1e-6); + assert( fabs(M_PI-spheredist_deg(zero, antizero)) < 1e-6); + assert( fabs(M_PI-spheredist_deg(antizero, zero)) < 1e-6); + } + } + } + } geoUnitTest; + + +} + diff --git a/src/mongo/db/geo/core.h b/src/mongo/db/geo/core.h new file mode 100644 index 00000000000..c49131e0162 --- /dev/null +++ b/src/mongo/db/geo/core.h @@ -0,0 +1,550 @@ +// core.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "../../pch.h" +#include "../jsobj.h" + +#include <cmath> + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +namespace mongo { + + class GeoBitSets { + public: + GeoBitSets() { + for ( int i=0; i<32; i++ ) { + masks32[i] = ( 1 << ( 31 - i ) ); + } + for ( int i=0; i<64; i++ ) { + masks64[i] = ( 1LL << ( 63 - i ) ); + } + + for ( unsigned i=0; i<16; i++ ) { + unsigned fixed = 0; + for ( int j=0; j<4; j++ ) { + if ( i & ( 1 << j ) ) + fixed |= ( 1 << ( j * 2 ) ); + } + hashedToNormal[fixed] = i; + } + + long long currAllX = 0, currAllY = 0; + for ( int i = 0; i < 64; i++ ){ + if( i % 2 == 0 ){ + allX[ i / 2 ] = currAllX; + currAllX = currAllX + ( 1LL << ( 63 - i ) ); + } + else{ + allY[ i / 2 ] = currAllY; + currAllY = currAllY + ( 1LL << ( 63 - i ) ); + } + } + } + int masks32[32]; + long long masks64[64]; + long long allX[32]; + long long allY[32]; + + unsigned hashedToNormal[256]; + }; + + extern GeoBitSets geoBitSets; + + class GeoHash { + public: + + GeoHash() + : _hash(0),_bits(0) { + } + + explicit GeoHash( const char * hash ) { + init( hash ); + } + + explicit GeoHash( const string& hash ) { + init( hash ); + } + + static GeoHash makeFromBinData(const char *bindata, unsigned bits) { + GeoHash h; + h._bits = bits; + h._copy( (char*)&h._hash , bindata ); + h._fix(); + return h; + } + + explicit GeoHash( const BSONElement& e , unsigned bits=32 ) { + _bits = bits; + if ( e.type() == BinData ) { + int len = 0; + _copy( (char*)&_hash , e.binData( len ) ); + assert( len == 8 ); + _bits = bits; + } + else { + cout << "GeoHash bad element: " << e << endl; + uassert(13047,"wrong type for geo index. if you're using a pre-release version, need to rebuild index",0); + } + _fix(); + } + + GeoHash( unsigned x , unsigned y , unsigned bits=32) { + init( x , y , bits ); + } + + GeoHash( const GeoHash& old ) { + _hash = old._hash; + _bits = old._bits; + } + + GeoHash( long long hash , unsigned bits ) + : _hash( hash ) , _bits( bits ) { + _fix(); + } + + void init( unsigned x , unsigned y , unsigned bits ) { + assert( bits <= 32 ); + _hash = 0; + _bits = bits; + for ( unsigned i=0; i<bits; i++ ) { + if ( isBitSet( x , i ) ) _hash |= geoBitSets.masks64[i*2]; + if ( isBitSet( y , i ) ) _hash |= geoBitSets.masks64[(i*2)+1]; + } + } + + void unhash_fast( unsigned& x , unsigned& y ) const { + x = 0; + y = 0; + char * c = (char*)(&_hash); + for ( int i=0; i<8; i++ ) { + unsigned t = (unsigned)(c[i]) & 0x55; + y |= ( geoBitSets.hashedToNormal[t] << (4*(i)) ); + + t = ( (unsigned)(c[i]) >> 1 ) & 0x55; + x |= ( geoBitSets.hashedToNormal[t] << (4*(i)) ); + } + } + + void unhash_slow( unsigned& x , unsigned& y ) const { + x = 0; + y = 0; + for ( unsigned i=0; i<_bits; i++ ) { + if ( getBitX(i) ) + x |= geoBitSets.masks32[i]; + if ( getBitY(i) ) + y |= geoBitSets.masks32[i]; + } + } + + void unhash( unsigned& x , unsigned& y ) const { + unhash_fast( x , y ); + } + + /** + * @param 0 = high + */ + static bool isBitSet( unsigned val , unsigned bit ) { + return geoBitSets.masks32[bit] & val; + } + + GeoHash up() const { + return GeoHash( _hash , _bits - 1 ); + } + + bool hasPrefix( const GeoHash& other ) const { + assert( other._bits <= _bits ); + if ( other._bits == 0 ) + return true; + long long x = other._hash ^ _hash; + x = x >> (64-(other._bits*2)); + return x == 0; + } + + + string toString() const { + StringBuilder buf( _bits * 2 ); + for ( unsigned x=0; x<_bits*2; x++ ) + buf.append( _hash & geoBitSets.masks64[x] ? 
"1" : "0" ); + return buf.str(); + } + + string toStringHex1() const { + stringstream ss; + ss << hex << _hash; + return ss.str(); + } + + void init( const string& s ) { + _hash = 0; + _bits = s.size() / 2; + for ( unsigned pos=0; pos<s.size(); pos++ ) + if ( s[pos] == '1' ) + setBit( pos , 1 ); + } + + void setBit( unsigned pos , bool one ) { + assert( pos < _bits * 2 ); + if ( one ) + _hash |= geoBitSets.masks64[pos]; + else if ( _hash & geoBitSets.masks64[pos] ) + _hash &= ~geoBitSets.masks64[pos]; + } + + bool getBit( unsigned pos ) const { + return _hash & geoBitSets.masks64[pos]; + } + + bool getBitX( unsigned pos ) const { + assert( pos < 32 ); + return getBit( pos * 2 ); + } + + bool getBitY( unsigned pos ) const { + assert( pos < 32 ); + return getBit( ( pos * 2 ) + 1 ); + } + + BSONObj wrap( const char* name = "" ) const { + BSONObjBuilder b(20); + append( b , name ); + BSONObj o = b.obj(); + if( ! strlen( name ) ) assert( o.objsize() == 20 ); + return o; + } + + bool constrains() const { + return _bits > 0; + } + + bool canRefine() const { + return _bits < 32; + } + + bool atMinX() const { + return ( _hash & geoBitSets.allX[ _bits ] ) == 0; + } + + bool atMinY() const { + //log() << " MinY : " << hex << (unsigned long long) _hash << " " << _bits << " " << hex << (unsigned long long) geoBitSets.allY[ _bits ] << endl; + return ( _hash & geoBitSets.allY[ _bits ] ) == 0; + } + + bool atMaxX() const { + return ( _hash & geoBitSets.allX[ _bits ] ) == geoBitSets.allX[ _bits ]; + } + + bool atMaxY() const { + return ( _hash & geoBitSets.allY[ _bits ] ) == geoBitSets.allY[ _bits ]; + } + + void move( int x , int y ) { + assert( _bits ); + _move( 0 , x ); + _move( 1 , y ); + } + + void _move( unsigned offset , int d ) { + if ( d == 0 ) + return; + assert( d <= 1 && d>= -1 ); // TEMP + + bool from, to; + if ( d > 0 ) { + from = 0; + to = 1; + } + else { + from = 1; + to = 0; + } + + unsigned pos = ( _bits * 2 ) - 1; + if ( offset == 0 ) + pos--; + while ( true ) { + if ( getBit(pos) == from ) { + setBit( pos , to ); + return; + } + + if ( pos < 2 ) { + // overflow + for ( ; pos < ( _bits * 2 ) ; pos += 2 ) { + setBit( pos , from ); + } + return; + } + + setBit( pos , from ); + pos -= 2; + } + + assert(0); + } + + GeoHash& operator=(const GeoHash& h) { + _hash = h._hash; + _bits = h._bits; + return *this; + } + + bool operator==(const GeoHash& h ) const { + return _hash == h._hash && _bits == h._bits; + } + + bool operator!=(const GeoHash& h ) const { + return !( *this == h ); + } + + bool operator<(const GeoHash& h ) const { + if( _hash != h._hash ) return _hash < h._hash; + return _bits < h._bits; + } + + GeoHash& operator+=( const char * s ) { + unsigned pos = _bits * 2; + _bits += strlen(s) / 2; + assert( _bits <= 32 ); + while ( s[0] ) { + if ( s[0] == '1' ) + setBit( pos , 1 ); + pos++; + s++; + } + + return *this; + } + + GeoHash operator+( const char * s ) const { + GeoHash n = *this; + n+=s; + return n; + } + + GeoHash operator+( string s ) const { + return operator+( s.c_str() ); + } + + void _fix() { + static long long FULL = 0xFFFFFFFFFFFFFFFFLL; + long long mask = FULL << ( 64 - ( _bits * 2 ) ); + _hash &= mask; + } + + void append( BSONObjBuilder& b , const char * name ) const { + char buf[8]; + _copy( buf , (char*)&_hash ); + b.appendBinData( name , 8 , bdtCustom , buf ); + } + + long long getHash() const { + return _hash; + } + + unsigned getBits() const { + return _bits; + } + + GeoHash commonPrefix( const GeoHash& other ) const { + unsigned i=0; + for ( ; i<_bits && 
i<other._bits; i++ ) {
+                if ( getBitX( i ) == other.getBitX( i ) &&
+                     getBitY( i ) == other.getBitY( i ) )
+                    continue;
+                break;
+            }
+            return GeoHash(_hash,i);
+        }
+
+    private:
+
+        static void _copy( char * dst , const char * src ) {
+            for ( unsigned a=0; a<8; a++ ) {
+                dst[a] = src[7-a];
+            }
+        }
+
+        long long _hash;
+        unsigned _bits; // bits per field, so 1 to 32
+    };
+
+    inline ostream& operator<<( ostream &s, const GeoHash &h ) {
+        s << h.toString();
+        return s;
+    }
+
+    class GeoConvert {
+    public:
+        virtual ~GeoConvert() {}
+
+        virtual void unhash( const GeoHash& h , double& x , double& y ) const = 0;
+        virtual GeoHash hash( double x , double y ) const = 0;
+    };
+
+    class Point {
+    public:
+
+        Point( const GeoConvert * g , const GeoHash& hash ) {
+            g->unhash( hash , _x , _y );
+        }
+
+        explicit Point( const BSONElement& e ) {
+            BSONObjIterator i(e.Obj());
+            _x = i.next().number();
+            _y = i.next().number();
+        }
+
+        explicit Point( const BSONObj& o ) {
+            BSONObjIterator i(o);
+            _x = i.next().number();
+            _y = i.next().number();
+        }
+
+        Point( double x , double y )
+            : _x( x ) , _y( y ) {
+        }
+
+        Point() : _x(0),_y(0) {
+        }
+
+        GeoHash hash( const GeoConvert * g ) {
+            return g->hash( _x , _y );
+        }
+
+        double distance( const Point& p ) const {
+            double a = _x - p._x;
+            double b = _y - p._y;
+
+            // Avoid numerical error if possible...
+            if( a == 0 ) return abs( _y - p._y );
+            if( b == 0 ) return abs( _x - p._x );
+
+            return sqrt( ( a * a ) + ( b * b ) );
+        }
+
+        /**
+         * Distance method that compares x or y coords when the other direction is zero;
+         * avoids numerical error when distances are very close to the radius but axis-aligned.
+         *
+         * An example of the problem is:
+         *   (52.0 - 51.9999) - 0.0001 = 3.31965e-15 and 52.0 - 51.9999 > 0.0001 in double arithmetic
+         * but:
+         *   51.9999 + 0.0001 <= 52.0
+         *
+         * This avoids some (but not all!) surprising results in $center queries where points are
+         * ( radius + center.x, center.y ) or vice versa.
+         */
+        bool distanceWithin( const Point& p, double radius ) const {
+            double a = _x - p._x;
+            double b = _y - p._y;
+
+            if( a == 0 ) {
+                //
+                // Note: For some unknown reason, when a 32-bit g++ optimizes this call, the sum is
+                // calculated imprecisely.  We need to force the compiler to always evaluate it correctly,
+                // hence the weirdness.
+                //
+                // On some 32-bit linux machines, removing the volatile keyword or calculating the sum inline
+                // will make certain geo tests fail.  Of course this check will force volatile for all 32-bit systems,
+                // not just affected systems.
+                if( sizeof(void*) <= 4 ){
+                    volatile double sum = _y > p._y ? p._y + radius : _y + radius;
+                    return _y > p._y ? sum >= _y : sum >= p._y;
+                }
+                else {
+                    // Original math, correct for most systems
+                    return _y > p._y ? p._y + radius >= _y : _y + radius >= p._y;
+                }
+            }
+            if( b == 0 ) {
+                if( sizeof(void*) <= 4 ){
+                    volatile double sum = _x > p._x ? p._x + radius : _x + radius;
+                    return _x > p._x ? sum >= _x : sum >= p._x;
+                }
+                else {
+                    return _x > p._x ? p._x + radius >= _x : _x + radius >= p._x;
+                }
+            }
+
+            return sqrt( ( a * a ) + ( b * b ) ) <= radius;
+        }
+
+        string toString() const {
+            StringBuilder buf(32);
+            buf << "(" << _x << "," << _y << ")";
+            return buf.str();
+        }
+
+        double _x;
+        double _y;
+    };
+
+
+    extern const double EARTH_RADIUS_KM;
+    extern const double EARTH_RADIUS_MILES;
+
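+    /* Usage sketch (illustrative only, mirroring the geo unit test earlier in
+       this patch): the spheredist_* helpers below return an angle in radians,
+       so multiply by an earth-radius constant to get a distance in those units.
+
+           Point JFK( -73.77694444, 40.63861111 );
+           Point LAX( -118.40, 33.94 );
+           double miles = spheredist_deg( JFK, LAX ) * EARTH_RADIUS_MILES;
+           // the unit test expects 2469 < miles < 2470
+    */
+
+    // Technically lat/long bounds, not really tied to earth radius.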
+ inline void checkEarthBounds( Point p ) { + uassert( 14808, str::stream() << "point " << p.toString() << " must be in earth-like bounds of long : [-180, 180), lat : [-90, 90] ", + p._x >= -180 && p._x < 180 && p._y >= -90 && p._y <= 90 ); + } + + inline double deg2rad(double deg) { return deg * (M_PI/180); } + inline double rad2deg(double rad) { return rad * (180/M_PI); } + + // WARNING: _x and _y MUST be longitude and latitude in that order + // note: multiply by earth radius for distance + inline double spheredist_rad( const Point& p1, const Point& p2 ) { + // this uses the n-vector formula: http://en.wikipedia.org/wiki/N-vector + // If you try to match the code to the formula, note that I inline the cross-product. + // TODO: optimize with SSE + + double sin_x1(sin(p1._x)), cos_x1(cos(p1._x)); + double sin_y1(sin(p1._y)), cos_y1(cos(p1._y)); + double sin_x2(sin(p2._x)), cos_x2(cos(p2._x)); + double sin_y2(sin(p2._y)), cos_y2(cos(p2._y)); + + double cross_prod = + (cos_y1*cos_x1 * cos_y2*cos_x2) + + (cos_y1*sin_x1 * cos_y2*sin_x2) + + (sin_y1 * sin_y2); + + if (cross_prod >= 1 || cross_prod <= -1) { + // fun with floats + assert( fabs(cross_prod)-1 < 1e-6 ); + return cross_prod > 0 ? 0 : M_PI; + } + + return acos(cross_prod); + } + + // note: return is still in radians as that can be multiplied by radius to get arc length + inline double spheredist_deg( const Point& p1, const Point& p2 ) { + return spheredist_rad( + Point( deg2rad(p1._x), deg2rad(p1._y) ), + Point( deg2rad(p2._x), deg2rad(p2._y) ) + ); + } + +} diff --git a/src/mongo/db/geo/haystack.cpp b/src/mongo/db/geo/haystack.cpp new file mode 100644 index 00000000000..104665087f6 --- /dev/null +++ b/src/mongo/db/geo/haystack.cpp @@ -0,0 +1,318 @@ +// db/geo/haystack.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#include "pch.h"
+#include "../namespace-inl.h"
+#include "../jsobj.h"
+#include "../index.h"
+#include "../../util/unittest.h"
+#include "../commands.h"
+#include "../pdfile.h"
+#include "../btree.h"
+#include "../curop-inl.h"
+#include "../matcher.h"
+#include "core.h"
+#include "../../util/timer.h"
+
+#define GEOQUADDEBUG(x)
+//#define GEOQUADDEBUG(x) cout << x << endl
+
+/**
+ * this is a geo-based search piece, which is different from a regular geo lookup.
+ * it is useful when you want to look for something within a region where the ratio of matches is low.
+ * it works well for searching for restaurants within 25 miles with a certain name;
+ * it should not be used for finding the closest restaurants that are open.
+ */
+namespace mongo {
+
+    string GEOSEARCHNAME = "geoHaystack";
+
+    class GeoHaystackSearchHopper {
+    public:
+        GeoHaystackSearchHopper( const BSONObj& n , double maxDistance , unsigned limit , const string& geoField )
+            : _near( n ) , _maxDistance( maxDistance ) , _limit( limit ) , _geoField(geoField) {
+        }
+
+        void got( const DiskLoc& loc ) {
+            Point p( loc.obj().getFieldDotted( _geoField ) );
+            if ( _near.distance( p ) > _maxDistance )
+                return;
+            _locs.push_back( loc );
+        }
+
+        int append( BSONArrayBuilder& b ) {
+            for ( unsigned i=0; i<_locs.size() && i<_limit; i++ )
+                b.append( _locs[i].obj() );
+            return _locs.size();
+        }
+
+        Point _near;
+        double _maxDistance;
+        unsigned _limit;
+        string _geoField;
+
+        vector<DiskLoc> _locs;
+    };
+
+    class GeoHaystackSearchIndex : public IndexType {
+
+    public:
+
+        GeoHaystackSearchIndex( const IndexPlugin* plugin , const IndexSpec* spec )
+            : IndexType( plugin , spec ) {
+
+            BSONElement e = spec->info["bucketSize"];
+            uassert( 13321 , "need bucketSize" , e.isNumber() );
+            _bucketSize = e.numberDouble();
+
+            BSONObjBuilder orderBuilder;
+
+            BSONObjIterator i( spec->keyPattern );
+            while ( i.more() ) {
+                BSONElement e = i.next();
+                if ( e.type() == String && GEOSEARCHNAME == e.valuestr() ) {
+                    uassert( 13314 , "can't have 2 geo fields" , _geo.size() == 0 );
+                    uassert( 13315 , "2d has to be first in index" , _other.size() == 0 );
+                    _geo = e.fieldName();
+                }
+                else {
+                    _other.push_back( e.fieldName() );
+                }
+                orderBuilder.append( "" , 1 );
+            }
+
+            uassert( 13316 , "no geo field specified" , _geo.size() );
+            uassert( 13317 , "no other fields specified" , _other.size() );
+            uassert( 13326 , "quadrant search can only have 1 other field for now" , _other.size() == 1 );
+            _order = orderBuilder.obj();
+        }
+
+        int hash( const BSONElement& e ) const {
+            uassert( 13322 , "not a number" , e.isNumber() );
+            return hash( e.numberDouble() );
+        }
+
+        int hash( double d ) const {
+            d += 180;
+            d /= _bucketSize;
+            return (int)d;
+        }
+
+        string makeString( int hashedX , int hashedY ) const {
+            stringstream ss;
+            ss << hashedX << "_" << hashedY;
+            return ss.str();
+        }
+
+        void _add( const BSONObj& obj, const string& root , const BSONElement& e , BSONObjSet& keys ) const {
+            BSONObjBuilder buf;
+            buf.append( "" , root );
+            if ( e.eoo() )
+                buf.appendNull( "" );
+            else
+                buf.appendAs( e , "" );
+
+            BSONObj key = buf.obj();
+            GEOQUADDEBUG( obj << "\n\t" << root << "\n\t" << key );
+            keys.insert( key );
+        }
+
+        void getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+
+            BSONElement loc = obj.getFieldDotted( _geo );
+            if ( loc.eoo() )
+                return;
+
+            uassert( 13323 , "latlng not an array" , loc.isABSONObj() );
+            string root;
+            {
+                BSONObjIterator i( loc.Obj() );
+                BSONElement x = i.next();
+                BSONElement y = i.next();
+                root = makeString( hash(x) , hash(y) );
+            }
+
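+            // Worked example (illustrative, assuming bucketSize 1): a location
+            // [ -73.97, 40.77 ] hashes to x = (int)((-73.97 + 180) / 1) = 106 and
+            // y = (int)((40.77 + 180) / 1) = 220, so root is "106_220"; each key
+            // is then { "" : "106_220", "" : <value of the other field> }.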
+            assert( _other.size() == 1 );
+
+            BSONElementSet all;
+            obj.getFieldsDotted( _other[0] , all );
+
+            if ( all.size() == 0 ) {
+                _add( obj , root , BSONElement() , keys );
+            }
+            else {
+                for ( BSONElementSet::iterator i=all.begin(); i!=all.end(); ++i ) {
+                    _add( obj , root , *i , keys );
+                }
+            }
+        }
+
+        shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const {
+            shared_ptr<Cursor> c;
+            assert(0);
+            return c;
+        }
+
+        void searchCommand( NamespaceDetails* nsd , int idxNo ,
+                            const BSONObj& n /*near*/ , double maxDistance , const BSONObj& search ,
+                            BSONObjBuilder& result , unsigned limit ) {
+
+            Timer t;
+
+            log(1) << "SEARCH near:" << n << " maxDistance:" << maxDistance << " search: " << search << endl;
+            int x,y;
+            {
+                BSONObjIterator i( n );
+                x = hash( i.next() );
+                y = hash( i.next() );
+            }
+            int scale = (int)ceil( maxDistance / _bucketSize );
+
+            GeoHaystackSearchHopper hopper(n,maxDistance,limit,_geo);
+
+            long long btreeMatches = 0;
+
+            for ( int a=-scale; a<=scale; a++ ) {
+                for ( int b=-scale; b<=scale; b++ ) {
+
+                    BSONObjBuilder bb;
+                    bb.append( "" , makeString( x + a , y + b ) );
+                    for ( unsigned i=0; i<_other.size(); i++ ) {
+                        BSONElement e = search.getFieldDotted( _other[i] );
+                        if ( e.eoo() )
+                            bb.appendNull( "" );
+                        else
+                            bb.appendAs( e , "" );
+                    }
+
+                    BSONObj key = bb.obj();
+
+                    GEOQUADDEBUG( "KEY: " << key );
+
+                    set<DiskLoc> thisPass;
+                    scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsd , idxNo , *getDetails() , key , key , true , 1 ) );
+                    while ( cursor->ok() ) {
+                        pair<set<DiskLoc>::iterator, bool> p = thisPass.insert( cursor->currLoc() );
+                        if ( p.second ) {
+                            hopper.got( cursor->currLoc() );
+                            GEOQUADDEBUG( "\t" << cursor->current() );
+                            btreeMatches++;
+                        }
+                        cursor->advance();
+                    }
+                }
+
+            }
+
+            BSONArrayBuilder arr( result.subarrayStart( "results" ) );
+            int num = hopper.append( arr );
+            arr.done();
+
+            {
+                BSONObjBuilder b( result.subobjStart( "stats" ) );
+                b.append( "time" , t.millis() );
+                b.appendNumber( "btreeMatches" , btreeMatches );
+                b.append( "n" , num );
+                b.done();
+            }
+        }
+
+        const IndexDetails* getDetails() const {
+            return _spec->getDetails();
+        }
+
+        string _geo;
+        vector<string> _other;
+
+        BSONObj _order;
+
+        double _bucketSize;
+    };
+
+    class GeoHaystackSearchIndexPlugin : public IndexPlugin {
+    public:
+        GeoHaystackSearchIndexPlugin() : IndexPlugin( GEOSEARCHNAME ) {
+        }
+
+        virtual IndexType* generate( const IndexSpec* spec ) const {
+            return new GeoHaystackSearchIndex( this , spec );
+        }
+
+    } nameIndexPlugin;
+
+
+    class GeoHaystackSearchCommand : public Command {
+    public:
+        GeoHaystackSearchCommand() : Command( "geoSearch" ) {}
+        virtual LockType locktype() const { return READ; }
+        bool slaveOk() const { return true; }
+        bool slaveOverrideOk() const { return true; }
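+
+        // Example invocation (illustrative; the collection and search values are
+        // hypothetical, but the field names match the checks in run() below):
+        //
+        //     db.runCommand( { geoSearch : "places",
+        //                      near : [ -73.97, 40.77 ],
+        //                      maxDistance : 10,
+        //                      search : { name : "Pizza" },
+        //                      limit : 30 } )
+        //
+        bool run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+
+            string ns = dbname + "." + cmdObj.firstElement().valuestr();
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( !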
d ) {
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            vector<int> idxs;
+            d->findIndexByType( GEOSEARCHNAME , idxs );
+            if ( idxs.size() == 0 ) {
+                errmsg = "no geoSearch index";
+                return false;
+            }
+            if ( idxs.size() > 1 ) {
+                errmsg = "more than 1 geoSearch index";
+                return false;
+            }
+
+            int idxNum = idxs[0];
+
+            IndexDetails& id = d->idx( idxNum );
+            GeoHaystackSearchIndex * si = (GeoHaystackSearchIndex*)id.getSpec().getType();
+            assert( &id == si->getDetails() );
+
+            BSONElement n = cmdObj["near"];
+            BSONElement maxDistance = cmdObj["maxDistance"];
+            BSONElement search = cmdObj["search"];
+
+            uassert( 13318 , "near needs to be an array" , n.isABSONObj() );
+            uassert( 13319 , "maxDistance needs a number" , maxDistance.isNumber() );
+            uassert( 13320 , "search needs to be an object" , search.type() == Object );
+
+            unsigned limit = 50;
+            if ( cmdObj["limit"].isNumber() )
+                limit = (unsigned)cmdObj["limit"].numberInt();
+
+            si->searchCommand( d , idxNum , n.Obj() , maxDistance.numberDouble() , search.Obj() , result , limit );
+
+            return true;
+        }
+
+    } nameSearchCommand;
+
+
+
+
+
+}
diff --git a/src/mongo/db/globals.h b/src/mongo/db/globals.h
new file mode 100644
index 00000000000..093bec76a0e
--- /dev/null
+++ b/src/mongo/db/globals.h
@@ -0,0 +1,54 @@
+// @file globals.h
+// grouping of global variables to make concurrency work clearer
+
+#pragma once
+
+namespace mongo {
+
+    void assertStartingUp();
+
+    // this is a prototype for now; we'll see if it is helpful
+
+    /** "value is Const After Server Init" helper
+     *
+     *  Example:
+     *
+     *    casi<int> foo = 3;
+     *    foo.ref() = 4; // asserts if not still in server init
+     *    int x = foo+1; // ok anytime
+     *
+     */
+    template< class T >
+    class casi : boost::noncopyable {
+        T val;
+    public:
+        casi(const T& t) : val(t) {
+            DEV assertStartingUp();
+        }
+        operator const T& () { return val; }
+        T& ref() {
+            DEV assertStartingUp();
+            return val;
+        }
+    };
+
+    /** partially specialized for cases where our global variable is a pointer -- we want the value
+     *  pointed at to be constant, not just the pointer itself
+     */
+    template< typename T >
+    class casi<T*> : boost::noncopyable {
+        T * val;
+        void operator=(T*);
+    public:
+        casi(T* t) : val(t) {
+            DEV assertStartingUp();
+        }
+        operator const T* () { return val; }
+        const T* get() { return val; }
+        T*& ref() {
+            DEV assertStartingUp();
+            return val;
+        }
+    };
+
+}
diff --git a/src/mongo/db/helpers/dblogger.h b/src/mongo/db/helpers/dblogger.h
new file mode 100644
index 00000000000..4d6ee6d78c4
--- /dev/null
+++ b/src/mongo/db/helpers/dblogger.h
@@ -0,0 +1,31 @@
+// @file dblogger.h
+
+/*
+ *    Copyright (C) 2010 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#pragma once + +namespace mongo { + + /** helper to log (and read log) of a capped collection in the database */ + class DBLogger { + bool _inited; + public: + const string _ns; + DBLogger(string ns) : _inited(false), _ns(ns) { } + }; + +} diff --git a/src/mongo/db/index.cpp b/src/mongo/db/index.cpp new file mode 100644 index 00000000000..5eaeab551df --- /dev/null +++ b/src/mongo/db/index.cpp @@ -0,0 +1,446 @@ +/** @file index.cpp */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "namespace-inl.h" +#include "index.h" +#include "btree.h" +#include "background.h" +#include "repl/rs.h" +#include "ops/delete.h" + + +namespace mongo { + + template< class V > + class IndexInterfaceImpl : public IndexInterface { + public: + typedef typename V::KeyOwned KeyOwned; + typedef Continuation<V> Cont; + virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering); + + Cont *c[NamespaceDetails::NIndexesMax]; + int n; + + public: + IndexInterfaceImpl() { n = 0; } + + /* lacking CONCURRENCY WRITE this supports only one writer */ + void _phasedBegin() { + // we do this here as phasedFinish can throw exceptions (we could catch there, but just as easy to do here) + for( int i = 0; i < n; i++ ) { + delete c[i]; + c[i] = 0; // defensive + } + n = 0; + } + void phasedQueueItemToInsert( + int idxNo, + DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key, + const Ordering& _order, IndexDetails& _idx, bool dupsAllowed) + { + if( idxNo >= n ) + n = idxNo + 1; + Cont *C = c[idxNo] = new Cont(thisLoc, _recordLoc, _key, _order, _idx); + thisLoc.btree<V>()->twoStepInsert(thisLoc, *C, dupsAllowed); + } + void _phasedFinish() { + for( int i = 0; i < n; i++ ) { + // if mixing v0 and v1 indexes, in that case (only) there could be nulls in the list + if( c[i] ) { + c[i]->stepTwo(); + } + } + } + +/* virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, + int& pos, bool& found, const DiskLoc &recordLoc, int direction) { + return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction); + } + */ + virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) { + return thisLoc.btree<V>()->fullValidate(thisLoc, order); + } + virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const { + return thisLoc.btree<V>()->findSingle(indexdetails,thisLoc,key); + } + virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const { + return thisLoc.btree<V>()->unindex(thisLoc, id, key, recordLoc); + } + virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const BSONObj& key, const Ordering &order, bool dupsAllowed, + IndexDetails& idx, bool toplevel = true) const { + return thisLoc.btree<V>()->bt_insert(thisLoc, recordLoc, key, order, dupsAllowed, idx, 
toplevel); + } + virtual DiskLoc addBucket(const IndexDetails& id) { + return BtreeBucket<V>::addBucket(id); + } + virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head, DiskLoc self, const Ordering& ordering) { + const BtreeBucket<V> *h = head.btree<V>(); + for( vector<BSONObj*>::iterator i = addedKeys.begin(); i != addedKeys.end(); i++ ) { + KeyOwned k(**i); + bool dup = h->wouldCreateDup(idx, head, k, ordering, self); + uassert( 11001 , h->dupKeyError( idx , k ) , !dup); + } + } + + // for geo: + virtual bool isUsed(DiskLoc thisLoc, int pos) { return thisLoc.btree<V>()->isUsed(pos); } + virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj& key, DiskLoc& recordLoc) { + recordLoc = DiskLoc(); + const BtreeBucket<V>* bucket = thisLoc.btree<V>(); + int n = bucket->nKeys(); + + if( pos < 0 || pos >= n || n == 0xffff /* bucket deleted */ || ! bucket->isUsed( pos ) ){ + // log() << "Pos: " << pos << " n " << n << endl; + return; + } + + typename BtreeBucket<V>::KeyNode kn = bucket->keyNode(pos); + key = kn.key.toBson(); + recordLoc = kn.recordLoc; + } + virtual BSONObj keyAt(DiskLoc thisLoc, int pos) { + return thisLoc.btree<V>()->keyAt(pos).toBson(); + } + virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, + int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) { + return thisLoc.btree<V>()->locate(idx, thisLoc, key, order, pos, found, recordLoc, direction); + } + virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) { + return thisLoc.btree<V>()->advance(thisLoc,keyOfs,direction,caller); + } + }; + + int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o); // key.cpp + + template <> + int IndexInterfaceImpl< V0 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) { + return oldCompare(l, r, ordering); + } + + template <> + int IndexInterfaceImpl< V1 >::keyCompare(const BSONObj& l, const BSONObj& r, const Ordering &ordering) { + return l.woCompare(r, ordering, /*considerfieldname*/false); + } + + IndexInterfaceImpl<V0> iii_v0; + IndexInterfaceImpl<V1> iii_v1; + + IndexInterface *IndexDetails::iis[] = { &iii_v0, &iii_v1 }; + + void IndexInterface::phasedBegin() { + iii_v0._phasedBegin(); + iii_v1._phasedBegin(); + } + void IndexInterface::phasedFinish() { + iii_v0._phasedFinish(); + iii_v1._phasedFinish(); + } + + int removeFromSysIndexes(const char *ns, const char *idxName) { + string system_indexes = cc().database()->name + ".system.indexes"; + BSONObjBuilder b; + b.append("ns", ns); + b.append("name", idxName); // e.g.: { name: "ts_1", ns: "foo.coll" } + BSONObj cond = b.done(); + return (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); + } + + /* this is just an attempt to clean up old orphaned stuff on a delete all indexes + call. repair database is the clean solution, but this gives one a lighter weight + partial option. 
see dropIndexes() + */ + void assureSysIndexesEmptied(const char *ns, IndexDetails *idIndex) { + string system_indexes = cc().database()->name + ".system.indexes"; + BSONObjBuilder b; + b.append("ns", ns); + if( idIndex ) { + b.append("name", BSON( "$ne" << idIndex->indexName().c_str() )); + } + BSONObj cond = b.done(); + int n = (int) deleteObjects(system_indexes.c_str(), cond, false, false, true); + if( n ) { + log() << "info: assureSysIndexesEmptied cleaned up " << n << " entries" << endl; + } + } + + int IndexDetails::keyPatternOffset( const string& key ) const { + BSONObjIterator i( keyPattern() ); + int n = 0; + while ( i.more() ) { + BSONElement e = i.next(); + if ( key == e.fieldName() ) + return n; + n++; + } + return -1; + } + + const IndexSpec& IndexDetails::getSpec() const { + SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex); + return NamespaceDetailsTransient::get_inlock( info.obj()["ns"].valuestr() ).getIndexSpec( this ); + } + + /* delete this index. does NOT clean up the system catalog + (system.indexes or system.namespaces) -- only NamespaceIndex. + */ + void IndexDetails::kill_idx() { + string ns = indexNamespace(); // e.g. foo.coll.$ts_1 + try { + + string pns = parentNS(); // note we need a copy, as parentNS() won't work after the drop() below + + // clean up parent namespace index cache + NamespaceDetailsTransient::get( pns.c_str() ).deletedIndex(); + + string name = indexName(); + + /* important to catch exception here so we can finish cleanup below. */ + try { + dropNS(ns.c_str()); + } + catch(DBException& ) { + log(2) << "IndexDetails::kill(): couldn't drop ns " << ns << endl; + } + head.setInvalid(); + info.setInvalid(); + + // clean up in system.indexes. we do this last on purpose. + int n = removeFromSysIndexes(pns.c_str(), name.c_str()); + wassert( n == 1 ); + + } + catch ( DBException &e ) { + log() << "exception in kill_idx: " << e << ", ns: " << ns << endl; + } + } + + void IndexDetails::getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const { + getSpec().getKeys( obj, keys ); + } + + void setDifference(BSONObjSet &l, BSONObjSet &r, vector<BSONObj*> &diff) { + // l and r must use the same ordering spec. + verify( 14819, l.key_comp().order() == r.key_comp().order() ); + BSONObjSet::iterator i = l.begin(); + BSONObjSet::iterator j = r.begin(); + while ( 1 ) { + if ( i == l.end() ) + break; + while ( j != r.end() && j->woCompare( *i ) < 0 ) + j++; + if ( j == r.end() || i->woCompare(*j) != 0 ) { + const BSONObj *jo = &*i; + diff.push_back( (BSONObj *) jo ); + } + i++; + } + } + + void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &changedId) { + int z = d.nIndexesBeingBuilt(); + v.resize(z); + for( int i = 0; i < z; i++ ) { + IndexDetails& idx = d.idx(i); + BSONObj idxKey = idx.info.obj().getObjectField("key"); // eg { ts : 1 } + IndexChanges& ch = v[i]; + idx.getKeysFromObject(oldObj, ch.oldkeys); + idx.getKeysFromObject(newObj, ch.newkeys); + if( ch.newkeys.size() > 1 ) + d.setIndexIsMultikey(i); + setDifference(ch.oldkeys, ch.newkeys, ch.removed); + setDifference(ch.newkeys, ch.oldkeys, ch.added); + if ( ch.removed.size() > 0 && ch.added.size() > 0 && idx.isIdIndex() ) { + changedId = true; + } + } + } + + void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc) { + int z = d.nIndexesBeingBuilt(); + for( int i = 0; i < z; i++ ) { + IndexDetails& idx = d.idx(i); + v[i].dupCheck(idx, curObjLoc); + } + } + + // should be { <something> : <simpletype[1|-1]>, .keyp.. 
} + static bool validKeyPattern(BSONObj kp) { + BSONObjIterator i(kp); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if( e.type() == Object || e.type() == Array ) + return false; + } + return true; + } + + /* Prepare to build an index. Does not actually build it (except for a special _id case). + - We validate that the params are good + - That the index does not already exist + - Creates the source collection if it DNE + + example of 'io': + { ns : 'test.foo', name : 'z', key : { z : 1 } } + + throws DBException + + @param sourceNS - source NS we are indexing + @param sourceCollection - its details ptr + @return true if ok to continue. when false we stop/fail silently (index already exists) + */ + bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ) { + sourceCollection = 0; + + // logical name of the index. todo: get rid of the name, we don't need it! + const char *name = io.getStringField("name"); + uassert(12523, "no index name specified", *name); + + // the collection for which we are building an index + sourceNS = io.getStringField("ns"); + uassert(10096, "invalid ns to index", sourceNS.find( '.' ) != string::npos); + uassert(10097, "bad table to index name on add index attempt", + cc().database()->name == nsToDatabase(sourceNS.c_str())); + + BSONObj key = io.getObjectField("key"); + uassert(12524, "index key pattern too large", key.objsize() <= 2048); + if( !validKeyPattern(key) ) { + string s = string("bad index key pattern ") + key.toString(); + uasserted(10098 , s.c_str()); + } + + if ( sourceNS.empty() || key.isEmpty() ) { + log(2) << "bad add index attempt name:" << (name?name:"") << "\n ns:" << + sourceNS << "\n idxobj:" << io.toString() << endl; + string s = "bad add index attempt " + sourceNS + " key:" + key.toString(); + uasserted(12504, s); + } + + sourceCollection = nsdetails(sourceNS.c_str()); + if( sourceCollection == 0 ) { + // try to create it + string err; + if ( !userCreateNS(sourceNS.c_str(), BSONObj(), err, false) ) { + problem() << "ERROR: failed to create collection while adding its index. " << sourceNS << endl; + return false; + } + sourceCollection = nsdetails(sourceNS.c_str()); + tlog() << "info: creating collection " << sourceNS << " on add index" << endl; + assert( sourceCollection ); + } + + if ( sourceCollection->findIndexByName(name) >= 0 ) { + // index already exists. + return false; + } + if( sourceCollection->findIndexByKeyPattern(key) >= 0 ) { + log(2) << "index already exists with diff name " << name << ' ' << key.toString() << endl; + return false; + } + + if ( sourceCollection->nIndexes >= NamespaceDetails::NIndexesMax ) { + stringstream ss; + ss << "add index fails, too many indexes for " << sourceNS << " key:" << key.toString(); + string s = ss.str(); + log() << s << '\n'; + uasserted(12505,s); + } + + /* we can't build a new index for the ns if a build is already in progress in the background - + EVEN IF this is a foreground build. + */ + uassert(12588, "cannot add index with a background operation in progress", + !BackgroundOperation::inProgForNs(sourceNS.c_str())); + + /* this is because we want key patterns like { _id : 1 } and { _id : <someobjid> } to + all be treated as the same pattern. + */ + if ( IndexDetails::isIdIndexPattern(key) ) { + if( !god ) { + ensureHaveIdIndex( sourceNS.c_str() ); + return false; + } + } + else { + /* is buildIndexes:false set for this replica set member? 
+ if so we don't build any indexes except _id + */ + if( theReplSet && !theReplSet->buildIndexes() ) + return false; + } + + string pluginName = IndexPlugin::findPluginName( key ); + IndexPlugin * plugin = pluginName.size() ? IndexPlugin::get( pluginName ) : 0; + + + { + BSONObj o = io; + if ( plugin ) { + o = plugin->adjustIndexSpec(o); + } + BSONObjBuilder b; + int v = DefaultIndexVersionNumber; + if( !o["v"].eoo() ) { + double vv = o["v"].Number(); + // note (one day) we may be able to fresh build less versions than we can use + // isASupportedIndexVersionNumber() is what we can use + uassert(14803, str::stream() << "this version of mongod cannot build new indexes of version number " << vv, + vv == 0 || vv == 1); + v = (int) vv; + } + // idea is to put things we use a lot earlier + b.append("v", v); + b.append(o["key"]); + if( o["unique"].trueValue() ) + b.appendBool("unique", true); // normalize to bool true in case was int 1 or something... + b.append(o["ns"]); + + { + // stripping _id + BSONObjIterator i(o); + while ( i.more() ) { + BSONElement e = i.next(); + string s = e.fieldName(); + if( s != "_id" && s != "v" && s != "ns" && s != "unique" && s != "key" ) + b.append(e); + } + } + + fixedIndexObject = b.obj(); + } + + return true; + } + + void IndexSpec::reset( const IndexDetails * details ) { + _details = details; + reset( details->info ); + } + + void IndexSpec::reset( const BSONObj& _info ) { + info = _info; + keyPattern = info["key"].embeddedObjectUserCheck(); + if ( keyPattern.objsize() == 0 ) { + out() << info.toString() << endl; + assert(false); + } + _init(); + } + +} diff --git a/src/mongo/db/index.h b/src/mongo/db/index.h new file mode 100644 index 00000000000..d297f8a4ca1 --- /dev/null +++ b/src/mongo/db/index.h @@ -0,0 +1,237 @@ +// index.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "../pch.h" +#include "diskloc.h" +#include "jsobj.h" +#include "indexkey.h" +#include "key.h" + +namespace mongo { + + class IndexInterface { + protected: + virtual ~IndexInterface() { } + public: + static void phasedBegin(); + virtual void phasedQueueItemToInsert( + int idxNo, + DiskLoc thisLoc, DiskLoc _recordLoc, const BSONObj &_key, + const Ordering& _order, IndexDetails& _idx, bool dupsAllowed) = 0; + static void phasedFinish(); + + virtual int keyCompare(const BSONObj& l,const BSONObj& r, const Ordering &ordering) = 0; + virtual long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order) = 0; + virtual DiskLoc findSingle(const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key) const = 0; + virtual bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const = 0; + virtual int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, + const BSONObj& key, const Ordering &order, bool dupsAllowed, + IndexDetails& idx, bool toplevel = true) const = 0; + virtual DiskLoc addBucket(const IndexDetails&) = 0; + virtual void uassertIfDups(IndexDetails& idx, vector<BSONObj*>& addedKeys, DiskLoc head, + DiskLoc self, const Ordering& ordering) = 0; + + // these are for geo + virtual bool isUsed(DiskLoc thisLoc, int pos) = 0; + virtual void keyAt(DiskLoc thisLoc, int pos, BSONObj&, DiskLoc& recordLoc) = 0; + virtual BSONObj keyAt(DiskLoc thisLoc, int pos) = 0; + virtual DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, + int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) = 0; + virtual DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0; + }; + + /* Details about a particular index. There is one of these effectively for each object in + system.namespaces (although this also includes the head pointer, which is not in that + collection). + + ** MemoryMapped Record ** (i.e., this is on disk data) + */ + class IndexDetails { + public: + /** + * btree head disk location + * TODO We should make this variable private, since btree operations + * may change its value and we don't want clients to rely on an old + * value. If we create a btree class, we can provide a btree object + * to clients instead of 'head'. + */ + DiskLoc head; + + /* Location of index info object. Format: + + { name:"nameofindex", ns:"parentnsname", key: {keypattobject} + [, unique: <bool>, background: <bool>, v:<version>] + } + + This object is in the system.indexes collection. Note that since we + have a pointer to the object here, the object in system.indexes MUST NEVER MOVE. + */ + DiskLoc info; + + /* extract key value from the query object + e.g., if key() == { x : 1 }, + { x : 70, y : 3 } -> { x : 70 } + */ + BSONObj getKeyFromQuery(const BSONObj& query) const { + BSONObj k = keyPattern(); + BSONObj res = query.extractFieldsUnDotted(k); + return res; + } + + /* pull out the relevant key objects from obj, so we + can index them. Note that the set is multiple elements + only when it's a "multikey" array. + keys will be left empty if key not found in the object. + */ + void getKeysFromObject( const BSONObj& obj, BSONObjSet& keys) const; + + /* get the key pattern for this object. 
+           e.g., { lastname:1, firstname:1 }
+        */
+        BSONObj keyPattern() const {
+            return info.obj().getObjectField("key");
+        }
+
+        /**
+         * @return offset into keyPattern for key
+         *         -1 if doesn't exist
+         */
+        int keyPatternOffset( const string& key ) const;
+        bool inKeyPattern( const string& key ) const { return keyPatternOffset( key ) >= 0; }
+
+        /* true if the specified key is in the index */
+        bool hasKey(const BSONObj& key);
+
+        // returns name of this index's storage area
+        // database.table.$index
+        string indexNamespace() const {
+            BSONObj io = info.obj();
+            string s;
+            s.reserve(Namespace::MaxNsLen);
+            s = io.getStringField("ns");
+            assert( !s.empty() );
+            s += ".$";
+            s += io.getStringField("name");
+            return s;
+        }
+
+        string indexName() const { // e.g. "ts_1"
+            BSONObj io = info.obj();
+            return io.getStringField("name");
+        }
+
+        static bool isIdIndexPattern( const BSONObj &pattern ) {
+            BSONObjIterator i(pattern);
+            BSONElement e = i.next();
+            if( strcmp(e.fieldName(), "_id") != 0 ) return false;
+            return i.next().eoo();
+        }
+
+        /* returns true if this is the _id index. */
+        bool isIdIndex() const {
+            return isIdIndexPattern( keyPattern() );
+        }
+
+        /* gets not our namespace name (indexNamespace for that),
+           but the collection we index, its name.
+        */
+        string parentNS() const {
+            BSONObj io = info.obj();
+            return io.getStringField("ns");
+        }
+
+        static int versionForIndexObj( const BSONObj &obj ) {
+            BSONElement e = obj["v"];
+            if( e.type() == NumberInt )
+                return e._numberInt();
+            // should normally be an int.  this is for backward compatibility
+            int v = e.numberInt();
+            uassert(14802, "index v field should be Integer type", v == 0);
+            return v;
+        }
+
+        int version() const {
+            return versionForIndexObj( info.obj() );
+        }
+
+        /** @return true if index has unique constraint */
+        bool unique() const {
+            BSONObj io = info.obj();
+            return io["unique"].trueValue() ||
+                   /* temp: can we just make unique:true always be there for _id and get rid of this? */
+                   isIdIndex();
+        }
+
+        /** return true if dropDups was set when building index (if any duplicates, dropdups drops the duplicating objects) */
+        bool dropDups() const {
+            return info.obj().getBoolField( "dropDups" );
+        }
+
+        /** delete this index.  does NOT clean up the system catalog
+            (system.indexes or system.namespaces) -- only NamespaceIndex.
+        */
+        void kill_idx();
+
+        const IndexSpec& getSpec() const;
+
+        string toString() const {
+            return info.obj().toString();
+        }
+
+        /** @return true if supported.  supported means we can use the index, including adding new keys.
+            it may not mean we can build the index version in question: we may not maintain building
+            of indexes in old formats in the future.
+        */
+        static bool isASupportedIndexVersionNumber(int v) { return (v&1)==v; } // v == 0 || v == 1
+
+        /** @return the interface for this index, which varies with the index version.
+            used for backward compatibility of index versions/formats.
+        */
+        IndexInterface& idxInterface() const {
+            int v = version();
+            dassert( isASupportedIndexVersionNumber(v) );
+            return *iis[v&1];
+        }
+
+        static IndexInterface *iis[];
+    };
+
+    struct IndexChanges { /*on an update*/
+        BSONObjSet oldkeys;
+        BSONObjSet newkeys;
+        vector<BSONObj*> removed; // these keys were removed as part of the change
+        vector<BSONObj*> added;   // these keys were added as part of the change
+
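+        /* Illustrative example (hypothetical data): with keyPattern { x : 1 },
+           updating { x : [ 1, 2 ] } to { x : [ 2, 3 ] } yields
+               oldkeys = { {"":1}, {"":2} }    newkeys = { {"":2}, {"":3} }
+               removed = [ {"":1} ]            added   = [ {"":3} ]
+           (see setDifference / getIndexChanges in index.cpp above)
+        */
+
+        /** @curObjLoc - the object we want to add's location.  if it is already in the
+            index, that is allowed here (for bg indexing case).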
+ */ + void dupCheck(IndexDetails& idx, DiskLoc curObjLoc) { + if( added.empty() || !idx.unique() ) + return; + const Ordering ordering = Ordering::make(idx.keyPattern()); + idx.idxInterface().uassertIfDups(idx, added, idx.head, curObjLoc, ordering); // "E11001 duplicate key on update" + } + }; + + class NamespaceDetails; + // changedId should be initialized to false + void getIndexChanges(vector<IndexChanges>& v, NamespaceDetails& d, BSONObj newObj, BSONObj oldObj, bool &cangedId); + void dupCheck(vector<IndexChanges>& v, NamespaceDetails& d, DiskLoc curObjLoc); +} // namespace mongo diff --git a/src/mongo/db/indexkey.cpp b/src/mongo/db/indexkey.cpp new file mode 100644 index 00000000000..18dfcb079b9 --- /dev/null +++ b/src/mongo/db/indexkey.cpp @@ -0,0 +1,462 @@ +// index_key.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "namespace-inl.h" +#include "index.h" +#include "btree.h" +#include "ops/query.h" +#include "background.h" +#include "../util/text.h" + +namespace mongo { + + /** old (<= v1.8) : 0 + 1 is new version + */ + const int DefaultIndexVersionNumber = 1; + + map<string,IndexPlugin*> * IndexPlugin::_plugins; + + IndexType::IndexType( const IndexPlugin * plugin , const IndexSpec * spec ) + : _plugin( plugin ) , _spec( spec ) { + + } + + IndexType::~IndexType() { + } + + const BSONObj& IndexType::keyPattern() const { + return _spec->keyPattern; + } + + IndexPlugin::IndexPlugin( const string& name ) + : _name( name ) { + if ( ! _plugins ) + _plugins = new map<string,IndexPlugin*>(); + (*_plugins)[name] = this; + } + + string IndexPlugin::findPluginName( const BSONObj& keyPattern ) { + string pluginName = ""; + + BSONObjIterator i( keyPattern ); + + while( i.more() ) { + BSONElement e = i.next(); + if ( e.type() != String ) + continue; + + uassert( 13007 , "can only have 1 index plugin / bad index key pattern" , pluginName.size() == 0 || pluginName == e.String() ); + pluginName = e.String(); + } + + return pluginName; + } + + int IndexType::compare( const BSONObj& l , const BSONObj& r ) const { + return l.woCompare( r , _spec->keyPattern ); + } + + void IndexSpec::_init() { + assert( keyPattern.objsize() ); + + // some basics + _nFields = keyPattern.nFields(); + _sparse = info["sparse"].trueValue(); + uassert( 13529 , "sparse only works for single field keys" , ! 
_sparse || _nFields ); + + + { + // build _nullKey + + BSONObjBuilder b; + BSONObjIterator i( keyPattern ); + + while( i.more() ) { + BSONElement e = i.next(); + _fieldNames.push_back( e.fieldName() ); + _fixed.push_back( BSONElement() ); + b.appendNull( "" ); + } + _nullKey = b.obj(); + } + + { + // _nullElt + BSONObjBuilder b; + b.appendNull( "" ); + _nullObj = b.obj(); + _nullElt = _nullObj.firstElement(); + } + + { + // _undefinedElt + BSONObjBuilder b; + b.appendUndefined( "" ); + _undefinedObj = b.obj(); + _undefinedElt = _undefinedObj.firstElement(); + } + + { + // handle plugins + string pluginName = IndexPlugin::findPluginName( keyPattern ); + if ( pluginName.size() ) { + IndexPlugin * plugin = IndexPlugin::get( pluginName ); + if ( ! plugin ) { + log() << "warning: can't find plugin [" << pluginName << "]" << endl; + } + else { + _indexType.reset( plugin->generate( this ) ); + } + } + } + + _finishedInit = true; + } + + void assertParallelArrays( const char *first, const char *second ) { + stringstream ss; + ss << "cannot index parallel arrays [" << first << "] [" << second << "]"; + uasserted( ParallelArraysCode , ss.str() ); + } + + class KeyGeneratorV0 { + public: + KeyGeneratorV0( const IndexSpec &spec ) : _spec( spec ) {} + + void getKeys( const BSONObj &obj, BSONObjSet &keys ) const { + if ( _spec._indexType.get() ) { //plugin (eg geo) + _spec._indexType->getKeys( obj , keys ); + return; + } + vector<const char*> fieldNames( _spec._fieldNames ); + vector<BSONElement> fixed( _spec._fixed ); + _getKeys( fieldNames , fixed , obj, keys ); + if ( keys.empty() && ! _spec._sparse ) + keys.insert( _spec._nullKey ); + } + + private: + void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const { + BSONElement arrElt; + unsigned arrIdx = ~0; + int numNotFound = 0; + + for( unsigned i = 0; i < fieldNames.size(); ++i ) { + if ( *fieldNames[ i ] == '\0' ) + continue; + + BSONElement e = obj.getFieldDottedOrArray( fieldNames[ i ] ); + + if ( e.eoo() ) { + e = _spec._nullElt; // no matching field + numNotFound++; + } + + if ( e.type() != Array ) + fieldNames[ i ] = ""; // no matching field or non-array match + + if ( *fieldNames[ i ] == '\0' ) + fixed[ i ] = e; // no need for further object expansion (though array expansion still possible) + + if ( e.type() == Array && arrElt.eoo() ) { // we only expand arrays on a single path -- track the path here + arrIdx = i; + arrElt = e; + } + + // enforce single array path here + if ( e.type() == Array && e.rawdata() != arrElt.rawdata() ) { + assertParallelArrays( e.fieldName(), arrElt.fieldName() ); + } + } + + bool allFound = true; // have we found elements for all field names in the key spec? 
+ for( vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end(); ++i ) { + if ( **i != '\0' ) { + allFound = false; + break; + } + } + + if ( _spec._sparse && numNotFound == _spec._nFields ) { + // we didn't find any fields + // so we're not going to index this document + return; + } + + bool insertArrayNull = false; + + if ( allFound ) { + if ( arrElt.eoo() ) { + // no terminal array element to expand + BSONObjBuilder b(_spec._sizeTracker); + for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) + b.appendAs( *i, "" ); + keys.insert( b.obj() ); + } + else { + // terminal array element to expand, so generate all keys + BSONObjIterator i( arrElt.embeddedObject() ); + if ( i.more() ) { + while( i.more() ) { + BSONObjBuilder b(_spec._sizeTracker); + for( unsigned j = 0; j < fixed.size(); ++j ) { + if ( j == arrIdx ) + b.appendAs( i.next(), "" ); + else + b.appendAs( fixed[ j ], "" ); + } + keys.insert( b.obj() ); + } + } + else if ( fixed.size() > 1 ) { + insertArrayNull = true; + } + } + } + else { + // nonterminal array element to expand, so recurse + assert( !arrElt.eoo() ); + BSONObjIterator i( arrElt.embeddedObject() ); + if ( i.more() ) { + while( i.more() ) { + BSONElement e = i.next(); + if ( e.type() == Object ) { + _getKeys( fieldNames, fixed, e.embeddedObject(), keys ); + } + } + } + else { + insertArrayNull = true; + } + } + + if ( insertArrayNull ) { + // x : [] - need to insert undefined + BSONObjBuilder b(_spec._sizeTracker); + for( unsigned j = 0; j < fixed.size(); ++j ) { + if ( j == arrIdx ) { + b.appendUndefined( "" ); + } + else { + BSONElement e = fixed[j]; + if ( e.eoo() ) + b.appendNull( "" ); + else + b.appendAs( e , "" ); + } + } + keys.insert( b.obj() ); + } + } + + const IndexSpec &_spec; + }; + + class KeyGeneratorV1 { + public: + KeyGeneratorV1( const IndexSpec &spec ) : _spec( spec ) {} + + void getKeys( const BSONObj &obj, BSONObjSet &keys ) const { + if ( _spec._indexType.get() ) { //plugin (eg geo) + _spec._indexType->getKeys( obj , keys ); + return; + } + vector<const char*> fieldNames( _spec._fieldNames ); + vector<BSONElement> fixed( _spec._fixed ); + _getKeys( fieldNames , fixed , obj, keys ); + if ( keys.empty() && ! _spec._sparse ) + keys.insert( _spec._nullKey ); + } + + private: + /** + * @param arrayNestedArray - set if the returned element is an array nested directly within arr. + */ + BSONElement extractNextElement( const BSONObj &obj, const BSONObj &arr, const char *&field, bool &arrayNestedArray ) const { + string firstField = mongoutils::str::before( field, '.' ); + bool haveObjField = !obj.getField( firstField ).eoo(); + BSONElement arrField = arr.getField( firstField ); + bool haveArrField = !arrField.eoo(); + + // An index component field name cannot exist in both a document array and one of that array's children. 
+ uassert( 15855 , str::stream() << "Ambiguous field name found in array (do not use numeric field names in embedded elements in an array), field: '" << arrField.fieldName() << "' for array: " << arr, !haveObjField || !haveArrField ); + + arrayNestedArray = false; + if ( haveObjField ) { + return obj.getFieldDottedOrArray( field ); + } + else if ( haveArrField ) { + if ( arrField.type() == Array ) { + arrayNestedArray = true; + } + return arr.getFieldDottedOrArray( field ); + } + return BSONElement(); + } + + void _getKeysArrEltFixed( vector<const char*> &fieldNames , vector<BSONElement> &fixed , const BSONElement &arrEntry, BSONObjSet &keys, int numNotFound, const BSONElement &arrObjElt, const set< unsigned > &arrIdxs, bool mayExpandArrayUnembedded ) const { + // set up any terminal array values + for( set<unsigned>::const_iterator j = arrIdxs.begin(); j != arrIdxs.end(); ++j ) { + if ( *fieldNames[ *j ] == '\0' ) { + fixed[ *j ] = mayExpandArrayUnembedded ? arrEntry : arrObjElt; + } + } + // recurse + _getKeys( fieldNames, fixed, ( arrEntry.type() == Object ) ? arrEntry.embeddedObject() : BSONObj(), keys, numNotFound, arrObjElt.embeddedObject() ); + } + + /** + * @param fieldNames - fields to index, may be postfixes in recursive calls + * @param fixed - values that have already been identified for their index fields + * @param obj - object from which keys should be extracted, based on names in fieldNames + * @param keys - set where index keys are written + * @param numNotFound - number of index fields that have already been identified as missing + * @param array - array from which keys should be extracted, based on names in fieldNames + * If obj and array are both nonempty, obj will be one of the elements of array. + */ + void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys, int numNotFound = 0, const BSONObj &array = BSONObj() ) const { + BSONElement arrElt; + set<unsigned> arrIdxs; + bool mayExpandArrayUnembedded = true; + for( unsigned i = 0; i < fieldNames.size(); ++i ) { + if ( *fieldNames[ i ] == '\0' ) { + continue; + } + + bool arrayNestedArray; + // Extract element matching fieldName[ i ] from object xor array. + BSONElement e = extractNextElement( obj, array, fieldNames[ i ], arrayNestedArray ); + + if ( e.eoo() ) { + // if field not present, set to null + fixed[ i ] = _spec._nullElt; + // done expanding this field name + fieldNames[ i ] = ""; + numNotFound++; + } + else if ( e.type() == Array ) { + arrIdxs.insert( i ); + if ( arrElt.eoo() ) { + // we only expand arrays on a single path -- track the path here + arrElt = e; + } + else if ( e.rawdata() != arrElt.rawdata() ) { + // enforce single array path here + assertParallelArrays( e.fieldName(), arrElt.fieldName() ); + } + if ( arrayNestedArray ) { + mayExpandArrayUnembedded = false; + } + } + else { + // not an array - no need for further expansion + fixed[ i ] = e; + } + } + + if ( arrElt.eoo() ) { + // No array, so generate a single key. + if ( _spec._sparse && numNotFound == _spec._nFields ) { + return; + } + BSONObjBuilder b(_spec._sizeTracker); + for( vector< BSONElement >::iterator i = fixed.begin(); i != fixed.end(); ++i ) { + b.appendAs( *i, "" ); + } + keys.insert( b.obj() ); + } + else if ( arrElt.embeddedObject().firstElement().eoo() ) { + // Empty array, so set matching fields to undefined. 
+                _getKeysArrEltFixed( fieldNames, fixed, _spec._undefinedElt, keys, numNotFound, arrElt, arrIdxs, true );
+            }
+            else {
+                // Non-empty array that can be expanded, so generate a key for each member.
+                BSONObj arrObj = arrElt.embeddedObject();
+                BSONObjIterator i( arrObj );
+                while( i.more() ) {
+                    _getKeysArrEltFixed( fieldNames, fixed, i.next(), keys, numNotFound, arrElt, arrIdxs, mayExpandArrayUnembedded );
+                }
+            }
+        }
+
+        const IndexSpec &_spec;
+    };
+
+    void IndexSpec::getKeys( const BSONObj &obj, BSONObjSet &keys ) const {
+        switch( indexVersion() ) {
+        case 0: {
+            KeyGeneratorV0 g( *this );
+            g.getKeys( obj, keys );
+            break;
+        }
+        case 1: {
+            KeyGeneratorV1 g( *this );
+            g.getKeys( obj, keys );
+            break;
+        }
+        default:
+            massert( 15869, "Invalid index version for key generation.", false );
+        }
+    }
+
+    bool anyElementNamesMatch( const BSONObj& a , const BSONObj& b ) {
+        BSONObjIterator x(a);
+        while ( x.more() ) {
+            BSONElement e = x.next();
+            BSONObjIterator y(b);
+            while ( y.more() ) {
+                BSONElement f = y.next();
+                FieldCompareResult res = compareDottedFieldNames( e.fieldName() , f.fieldName() );
+                if ( res == SAME || res == LEFT_SUBFIELD || res == RIGHT_SUBFIELD )
+                    return true;
+            }
+        }
+        return false;
+    }
+
+    IndexSuitability IndexSpec::suitability( const BSONObj& query , const BSONObj& order ) const {
+        if ( _indexType.get() )
+            return _indexType->suitability( query , order );
+        return _suitability( query , order );
+    }
+
+    IndexSuitability IndexSpec::_suitability( const BSONObj& query , const BSONObj& order ) const {
+        // TODO: optimize
+        if ( anyElementNamesMatch( keyPattern , query ) == 0 && anyElementNamesMatch( keyPattern , order ) == 0 )
+            return USELESS;
+        return HELPFUL;
+    }
+
+    IndexSuitability IndexType::suitability( const BSONObj& query , const BSONObj& order ) const {
+        return _spec->_suitability( query , order );
+    }
+
+    int IndexSpec::indexVersion() const {
+        if ( !info.hasField( "v" ) ) {
+            return DefaultIndexVersionNumber;
+        }
+        return IndexDetails::versionForIndexObj( info );
+    }
+
+    bool IndexType::scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const {
+        return ! order.isEmpty();
+    }
+
+}
diff --git a/src/mongo/db/indexkey.h b/src/mongo/db/indexkey.h
new file mode 100644
index 00000000000..12cd755e8a0
--- /dev/null
+++ b/src/mongo/db/indexkey.h
@@ -0,0 +1,198 @@
+// index_key.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "diskloc.h"
+#include "jsobj.h"
+#include <map>
+
+namespace mongo {
+
+    extern const int DefaultIndexVersionNumber;
+
+    const int ParallelArraysCode = 10088;
+
+    class Cursor;
+    class IndexSpec;
+    class IndexType; // TODO: this name sucks
+    class IndexPlugin;
+    class IndexDetails;
+
+    enum IndexSuitability { USELESS = 0 , HELPFUL = 1 , OPTIMAL = 2 };
+
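+    /* Illustrative reading of these values (based on IndexSpec::_suitability in
+       indexkey.cpp above): for an index on { a : 1 }, a query on { a : 5 } rates
+       HELPFUL, while a query on { b : 5 } with no sort rates USELESS; plugin
+       index types (e.g. geo) may report OPTIMAL for queries they fully answer. */
+
+    /**
+     * this represents an instance of an index plugin
+     * done this way so parsing, etc...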
can be cached + * so if there is a FTS IndexPlugin, for each index using FTS + * there will be 1 of these, and it can have things pre-parsed, etc... + */ + class IndexType : boost::noncopyable { + public: + IndexType( const IndexPlugin * plugin , const IndexSpec * spec ); + virtual ~IndexType(); + + virtual void getKeys( const BSONObj &obj, BSONObjSet &keys ) const = 0; + virtual shared_ptr<Cursor> newCursor( const BSONObj& query , const BSONObj& order , int numWanted ) const = 0; + + /** optional op : changes query to match what's in the index */ + virtual BSONObj fixKey( const BSONObj& in ) { return in; } + + /** optional op : compare 2 objects with regards to this index */ + virtual int compare( const BSONObj& l , const BSONObj& r ) const; + + /** @return plugin */ + const IndexPlugin * getPlugin() const { return _plugin; } + + const BSONObj& keyPattern() const; + + virtual IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ; + + virtual bool scanAndOrderRequired( const BSONObj& query , const BSONObj& order ) const ; + + protected: + const IndexPlugin * _plugin; + const IndexSpec * _spec; + }; + + /** + * this represents a plugin + * a plugin could be something like full text search, sparse index, etc... + * 1 of these exists per type of index per server + * 1 IndexType is created per index using this plugin + */ + class IndexPlugin : boost::noncopyable { + public: + IndexPlugin( const string& name ); + virtual ~IndexPlugin() {} + + virtual IndexType* generate( const IndexSpec * spec ) const = 0; + + string getName() const { return _name; } + + /** + * @return new keyPattern + * if nothing changes, should return keyPattern + */ + virtual BSONObj adjustIndexSpec( const BSONObj& spec ) const { return spec; } + + // ------- static below ------- + + static IndexPlugin* get( const string& name ) { + if ( ! 
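+            // illustrative usage, with a hypothetical plugin name and an existing
+            // const IndexSpec * spec:
+            //   IndexPlugin * p = IndexPlugin::get( "mytype" );
+            //   IndexType * t = p ? p->generate( spec ) : 0;  // one IndexType per index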
_plugins )
+                return 0;
+            map<string,IndexPlugin*>::iterator i = _plugins->find( name );
+            if ( i == _plugins->end() )
+                return 0;
+            return i->second;
+        }
+
+        /**
+         * @param keyPattern { x : "fts" }
+         * @return "" or the name
+         */
+        static string findPluginName( const BSONObj& keyPattern );
+
+    private:
+        string _name;
+        static map<string,IndexPlugin*> * _plugins;
+    };
+
+    /* precomputed details about an index, used for inserting keys on updates
+       stored/cached in NamespaceDetailsTransient, or can be used standalone
+    */
+    class IndexSpec {
+    public:
+        BSONObj keyPattern; // e.g., { name : 1 }
+        BSONObj info; // this is the same as IndexDetails::info.obj()
+
+        IndexSpec()
+            : _details(0) , _finishedInit(false) {
+        }
+
+        explicit IndexSpec( const BSONObj& k , const BSONObj& m = BSONObj() )
+            : keyPattern(k) , info(m) , _details(0) , _finishedInit(false) {
+            _init();
+        }
+
+        /**
+           this takes the DiskLoc of an IndexDetails info object;
+           it should have a "key" field
+        */
+        explicit IndexSpec( const DiskLoc& loc ) {
+            reset( loc );
+        }
+
+        void reset( const BSONObj& info );
+        void reset( const DiskLoc& infoLoc ) { reset(infoLoc.obj()); }
+        void reset( const IndexDetails * details );
+
+        void getKeys( const BSONObj &obj, BSONObjSet &keys ) const;
+
+        BSONElement missingField() const { return _nullElt; }
+
+        string getTypeName() const {
+            if ( _indexType.get() )
+                return _indexType->getPlugin()->getName();
+            return "";
+        }
+
+        IndexType* getType() const {
+            return _indexType.get();
+        }
+
+        const IndexDetails * getDetails() const {
+            return _details;
+        }
+
+        IndexSuitability suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+    protected:
+
+        int indexVersion() const;
+
+        IndexSuitability _suitability( const BSONObj& query , const BSONObj& order ) const ;
+
+        BSONSizeTracker _sizeTracker;
+        vector<const char*> _fieldNames;
+        vector<BSONElement> _fixed;
+
+        BSONObj _nullKey; // a full key with all fields null
+        BSONObj _nullObj; // only used for _nullElt
+        BSONElement _nullElt; // jstNull
+
+        BSONObj _undefinedObj; // only used for _undefinedElt
+        BSONElement _undefinedElt; // undefined
+
+        int _nFields; // number of fields in the index
+        bool _sparse; // if the index is sparse
+        shared_ptr<IndexType> _indexType;
+        const IndexDetails * _details;
+
+        void _init();
+
+        friend class IndexType;
+        friend class KeyGeneratorV0;
+        friend class KeyGeneratorV1;
+    public:
+        bool _finishedInit;
+    };
+
+
+} // namespace mongo
diff --git a/src/mongo/db/instance.cpp b/src/mongo/db/instance.cpp
new file mode 100644
index 00000000000..c8f8c6ea85b
--- /dev/null
+++ b/src/mongo/db/instance.cpp
@@ -0,0 +1,1148 @@
+// instance.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "db.h"
+#include "../bson/util/atomic_int.h"
+#include "introspect.h"
+#include "repl.h"
+#include "dbmessage.h"
+#include "instance.h"
+#include "lasterror.h"
+#include "security.h"
+#include "json.h"
+#include "replutil.h"
+#include "../s/d_logic.h"
+#include "../util/file_allocator.h"
+#include "../util/goodies.h"
+#include "cmdline.h"
+#if !defined(_WIN32)
+#include <sys/file.h>
+#endif
+#include "stats/counters.h"
+#include "background.h"
+#include "dur_journal.h"
+#include "dur_recover.h"
+#include "d_concurrency.h"
+#include "ops/count.h"
+#include "ops/delete.h"
+#include "ops/query.h"
+#include "ops/update.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+    // "diaglog"
+    inline void opread(Message& m) { if( _diaglog.getLevel() & 2 ) _diaglog.readop((char *) m.singleData(), m.header()->len); }
+    inline void opwrite(Message& m) { if( _diaglog.getLevel() & 1 ) _diaglog.write((char *) m.singleData(), m.header()->len); }
+
+    void receivedKillCursors(Message& m);
+    void receivedUpdate(Message& m, CurOp& op);
+    void receivedDelete(Message& m, CurOp& op);
+    void receivedInsert(Message& m, CurOp& op);
+    bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop );
+
+    int nloggedsome = 0;
+#define LOGWITHRATELIMIT if( ++nloggedsome < 1000 || nloggedsome % 100 == 0 )
+
+    string dbExecCommand;
+
+    DiagLog _diaglog;
+
+    bool useCursors = true;
+    bool useHints = true;
+
+    KillCurrentOp killCurrentOp;
+
+    int lockFile = 0;
+#ifdef _WIN32
+    HANDLE lockFileHandle;
+#endif
+
+    // see FSyncCommand:
+    extern bool lockedForWriting;
+
+    OpTime OpTime::now() {
+        DEV d.dbMutex.assertWriteLocked();
+        return now_inlock();
+    }
+    OpTime OpTime::last_inlock(){
+        DEV d.dbMutex.assertAtLeastReadLocked();
+        return last;
+    }
+
+    // OpTime::now() uses dbMutex, thus it is in this file not in the cpp files used by drivers and such
+    void BSONElementManipulator::initTimestamp() {
+        massert( 10332 , "Expected CurrentTime type", _element.type() == Timestamp );
+        unsigned long long &timestamp = *( reinterpret_cast< unsigned long long* >( value() ) );
+        if ( timestamp == 0 )
+            timestamp = OpTime::now().asDate();
+    }
+    void BSONElementManipulator::SetNumber(double d) {
+        if ( _element.type() == NumberDouble )
+            *getDur().writing( reinterpret_cast< double * >( value() ) ) = d;
+        else if ( _element.type() == NumberInt )
+            *getDur().writing( reinterpret_cast< int * >( value() ) ) = (int) d;
+        else assert(0);
+    }
+    void BSONElementManipulator::SetLong(long long n) {
+        assert( _element.type() == NumberLong );
+        *getDur().writing( reinterpret_cast< long long * >(value()) ) = n;
+    }
+    void BSONElementManipulator::SetInt(int n) {
+        assert( _element.type() == NumberInt );
+        getDur().writingInt( *reinterpret_cast< int * >( value() ) ) = n;
+    }
+    /* dur:: version */
+    void BSONElementManipulator::ReplaceTypeAndValue( const BSONElement &e ) {
+        char *d = data();
+        char *v = value();
+        int valsize = e.valuesize();
+        int ofs = (int) (v-d);
+        dassert( ofs > 0 );
+        char *p = (char *) getDur().writingPtr(d, valsize + ofs);
+        *p = e.type();
+        memcpy( p + ofs, e.value(), valsize );
+    }
+
+    void inProgCmd( Message &m, DbResponse &dbresponse ) {
+        BSONObjBuilder b;
+
+        if( !
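+        // illustrative: this path is reached via a pseudo-command query on
+        // "<db>.$cmd.sys.inprog" (see the dispatch in _assembleResponse below);
+        // a query of { $all : true } also reports operations that are not active.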
cc().isAdmin() ) { + b.append("err", "unauthorized"); + } + else { + DbMessage d(m); + QueryMessage q(d); + bool all = q.query["$all"].trueValue(); + vector<BSONObj> vals; + { + Client& me = cc(); + scoped_lock bl(Client::clientsMutex); + auto_ptr<Matcher> m(new Matcher(q.query)); + for( set<Client*>::iterator i = Client::clients.begin(); i != Client::clients.end(); i++ ) { + Client *c = *i; + assert( c ); + CurOp* co = c->curop(); + if ( c == &me && !co ) { + continue; + } + assert( co ); + if( all || co->active() ) { + BSONObj info = co->infoNoauth(); + if ( all || m->matches( info )) { + vals.push_back( info ); + } + } + } + } + b.append("inprog", vals); + unsigned x = lockedForWriting; + if( x ) { + b.append("fsyncLock", x); + b.append("info", "use db.fsyncUnlock() to terminate the fsync write/snapshot lock"); + } + } + + replyToQuery(0, m, dbresponse, b.obj()); + } + + void killOp( Message &m, DbResponse &dbresponse ) { + BSONObj obj; + if( ! cc().isAdmin() ) { + obj = fromjson("{\"err\":\"unauthorized\"}"); + } + /*else if( !dbMutexInfo.isLocked() ) + obj = fromjson("{\"info\":\"no op in progress/not locked\"}"); + */ + else { + DbMessage d(m); + QueryMessage q(d); + BSONElement e = q.query.getField("op"); + if( !e.isNumber() ) { + obj = fromjson("{\"err\":\"no op number field specified?\"}"); + } + else { + log() << "going to kill op: " << e << endl; + obj = fromjson("{\"info\":\"attempting to kill op\"}"); + killCurrentOp.kill( (unsigned) e.number() ); + } + } + replyToQuery(0, m, dbresponse, obj); + } + + void unlockFsyncAndWait(); + void unlockFsync(const char *ns, Message& m, DbResponse &dbresponse) { + BSONObj obj; + if ( ! cc().isAdmin() ) { // checks auth + obj = fromjson("{\"err\":\"unauthorized\"}"); + } + else if (strncmp(ns, "admin.", 6) != 0 ) { + obj = fromjson("{\"err\":\"unauthorized - this command must be run against the admin DB\"}"); + } + else { + if( lockedForWriting ) { + log() << "command: unlock requested" << endl; + obj = fromjson("{ok:1,\"info\":\"unlock completed\"}"); + unlockFsyncAndWait(); + } + else { + obj = fromjson("{ok:0,\"errmsg\":\"not locked\"}"); + } + } + replyToQuery(0, m, dbresponse, obj); + } + + static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { + bool ok = true; + MSGID responseTo = m.header()->id; + + DbMessage d(m); + QueryMessage q(d); + auto_ptr< Message > resp( new Message() ); + + CurOp& op = *(c.curop()); + + shared_ptr<AssertionException> ex; + + try { + dbresponse.exhaust = runQuery(m, q, op, *resp); + assert( !resp->empty() ); + } + catch ( SendStaleConfigException& e ){ + ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) ); + ok = false; + } + catch ( AssertionException& e ) { + ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); + ok = false; + } + + if( ex ){ + + op.debug().exceptionInfo = ex->getInfo(); + LOGWITHRATELIMIT { + log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" << + (q.query.valid() ? 
q.query.toString() : "query object is corrupt") << endl; + if( q.ntoskip || q.ntoreturn ) + log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; + } + + SendStaleConfigException* scex = NULL; + if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() ); + + BSONObjBuilder err; + ex->getInfo().append( err ); + if( scex ) err.append( "ns", scex->getns() ); + BSONObj errObj = err.done(); + + log() << errObj << endl; + + BufBuilder b; + b.skip(sizeof(QueryResult)); + b.appendBuf((void*) errObj.objdata(), errObj.objsize()); + + // todo: call replyToQuery() from here instead of this!!! see dbmessage.h + QueryResult * msgdata = (QueryResult *) b.buf(); + b.decouple(); + QueryResult *qr = msgdata; + qr->_resultFlags() = ResultFlag_ErrSet; + if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; + qr->len = b.len(); + qr->setOperation(opReply); + qr->cursorId = 0; + qr->startingFrom = 0; + qr->nReturned = 1; + resp.reset( new Message() ); + resp->setData( msgdata, true ); + + } + + op.debug().responseLength = resp->header()->dataLen(); + + dbresponse.response = resp.release(); + dbresponse.responseTo = responseTo; + + return ok; + } + + void (*reportEventToSystem)(const char *msg) = 0; + + void mongoAbort(const char *msg) { + if( reportEventToSystem ) + reportEventToSystem(msg); + rawOut(msg); + ::abort(); + } + + // Returns false when request includes 'end' + void _assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) { + + // before we lock... + int op = m.operation(); + bool isCommand = false; + const char *ns = m.singleData()->_data + 4; + if ( op == dbQuery ) { + if( strstr(ns, ".$cmd") ) { + isCommand = true; + opwrite(m); + if( strstr(ns, ".$cmd.sys.") ) { + if( strstr(ns, "$cmd.sys.inprog") ) { + inProgCmd(m, dbresponse); + return; + } + if( strstr(ns, "$cmd.sys.killop") ) { + killOp(m, dbresponse); + return; + } + if( strstr(ns, "$cmd.sys.unlock") ) { + unlockFsync(ns, m, dbresponse); + return; + } + } + } + else { + opread(m); + } + } + else if( op == dbGetMore ) { + opread(m); + } + else { + opwrite(m); + } + + globalOpCounters.gotOp( op , isCommand ); + + Client& c = cc(); + + auto_ptr<CurOp> nestedOp; + CurOp* currentOpP = c.curop(); + if ( currentOpP->active() ) { + nestedOp.reset( new CurOp( &c , currentOpP ) ); + currentOpP = nestedOp.get(); + } + CurOp& currentOp = *currentOpP; + currentOp.reset(remote,op); + + OpDebug& debug = currentOp.debug(); + debug.op = op; + + int logThreshold = cmdLine.slowMS; + bool log = logLevel >= 1; + + if ( op == dbQuery ) { + if ( handlePossibleShardedMessage( m , &dbresponse ) ) + return; + receivedQuery(c , dbresponse, m ); + } + else if ( op == dbGetMore ) { + if ( ! receivedGetMore(dbresponse, m, currentOp) ) + log = true; + } + else if ( op == dbMsg ) { + // deprecated - replaced by commands + char *p = m.singleData()->_data; + int len = strlen(p); + if ( len > 400 ) + out() << curTimeMillis64() % 10000 << + " long msg received, len:" << len << endl; + + Message *resp = new Message(); + if ( strcmp( "end" , p ) == 0 ) + resp->setData( opReply , "dbMsg end no longer supported" ); + else + resp->setData( opReply , "i am fine - dbMsg deprecated"); + + dbresponse.response = resp; + dbresponse.responseTo = m.header()->id; + } + else { + const char *ns = m.singleData()->_data + 4; + char cl[256]; + nsToDatabase(ns, cl); + if( ! 
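+            // writes land here: dbInsert, dbUpdate, dbDelete and dbKillCursors are
+            // dispatched below, and only after this auth check on the database name
+            // extracted into cl above succeeds.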
c.getAuthenticationInfo()->isAuthorized(cl) ) { + uassert_nothrow("unauthorized"); + } + else { + try { + if ( op == dbInsert ) { + receivedInsert(m, currentOp); + } + else if ( op == dbUpdate ) { + receivedUpdate(m, currentOp); + } + else if ( op == dbDelete ) { + receivedDelete(m, currentOp); + } + else if ( op == dbKillCursors ) { + currentOp.ensureStarted(); + logThreshold = 10; + receivedKillCursors(m); + } + else { + mongo::log() << " operation isn't supported: " << op << endl; + currentOp.done(); + log = true; + } + } + catch ( UserException& ue ) { + tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl; + debug.exceptionInfo = ue.getInfo(); + } + catch ( AssertionException& e ) { + tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl; + debug.exceptionInfo = e.getInfo(); + log = true; + } + } + } + currentOp.ensureStarted(); + currentOp.done(); + debug.executionTime = currentOp.totalTimeMillis(); + + //DEV log = true; + if ( log || debug.executionTime > logThreshold ) { + if( logLevel < 3 && op == dbGetMore && strstr(ns, ".oplog.") && debug.executionTime < 4300 && !log ) { + /* it's normal for getMore on the oplog to be slow because of use of awaitdata flag. */ + } + else { + mongo::tlog() << debug << endl; + } + } + + if ( currentOp.shouldDBProfile( debug.executionTime ) ) { + // performance profiling is on + if ( d.dbMutex.getState() < 0 ) { + mongo::log(1) << "note: not profiling because recursive read lock" << endl; + } + else { + writelock lk; + if ( dbHolder()._isLoaded( nsToDatabase( currentOp.getNS() ) , dbpath ) ) { + Client::Context cx( currentOp.getNS() ); + profile(c , currentOp ); + } + else { + mongo::log() << "note: not profiling because db went away - probably a close on: " << currentOp.getNS() << endl; + } + } + } + + debug.reset(); + } /* _assembleResponse() */ + + void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) { + PageFaultRetryableSection s; + while( 1 ) { + try { + _assembleResponse( m, dbresponse, remote ); + break; + } + catch( PageFaultException& e ) { + DEV log() << "TEMP PageFaultException touch and retry" << endl; + e.touch(); + } + } + } + + void receivedKillCursors(Message& m) { + int *x = (int *) m.singleData()->_data; + x++; // reserved + int n = *x++; + + uassert( 13659 , "sent 0 cursors to kill" , n != 0 ); + massert( 13658 , str::stream() << "bad kill cursors size: " << m.dataSize() , m.dataSize() == 8 + ( 8 * n ) ); + uassert( 13004 , str::stream() << "sent negative cursors to kill: " << n , n >= 1 ); + + if ( n > 2000 ) { + log( n < 30000 ? LL_WARNING : LL_ERROR ) << "receivedKillCursors, n=" << n << endl; + assert( n < 30000 ); + } + + int found = ClientCursor::erase(n, (long long *) x); + + if ( logLevel > 0 || found != n ) { + log( found == n ) << "killcursors: found " << found << " of " << n << endl; + } + + } + + /* db - database name + path - db directory + */ + /*static*/ void Database::closeDatabase( const char *db, const string& path ) { + assertInWriteLock(); + + Client::Context * ctx = cc().getContext(); + assert( ctx ); + assert( ctx->inDB( db , path ) ); + Database *database = ctx->db(); + assert( database->name == db ); + + oplogCheckCloseDatabase( database ); // oplog caches some things, dirty its caches + + if( BackgroundOperation::inProgForDb(db) ) { + log() << "warning: bg op in prog during close db? 
" << db << endl; + } + + /* important: kill all open cursors on the database */ + string prefix(db); + prefix += '.'; + ClientCursor::invalidate(prefix.c_str()); + + NamespaceDetailsTransient::clearForPrefix( prefix.c_str() ); + + dbHolderW().erase( db, path ); + ctx->_clear(); + delete database; // closes files + } + + void receivedUpdate(Message& m, CurOp& op) { + DbMessage d(m); + const char *ns = d.getns(); + op.debug().ns = ns; + int flags = d.pullInt(); + BSONObj query = d.nextJsObj(); + + assert( d.moreJSObjs() ); + assert( query.objsize() < m.header()->dataLen() ); + BSONObj toupdate = d.nextJsObj(); + uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize); + assert( toupdate.objsize() < m.header()->dataLen() ); + assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() ); + bool upsert = flags & UpdateOption_Upsert; + bool multi = flags & UpdateOption_Multi; + bool broadcast = flags & UpdateOption_Broadcast; + + op.debug().query = query; + op.setQuery(query); + + writelock lk; + + // void ReplSetImpl::relinquish() uses big write lock so + // this is thus synchronized given our lock above. + uassert( 10054 , "not master", isMasterNs( ns ) ); + + // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit + if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) ) + return; + + Client::Context ctx( ns ); + + UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() ); + lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror + } + + void receivedDelete(Message& m, CurOp& op) { + DbMessage d(m); + const char *ns = d.getns(); + op.debug().ns = ns; + int flags = d.pullInt(); + bool justOne = flags & RemoveOption_JustOne; + bool broadcast = flags & RemoveOption_Broadcast; + assert( d.moreJSObjs() ); + BSONObj pattern = d.nextJsObj(); + + op.debug().query = pattern; + op.setQuery(pattern); + + writelock lk(ns); + + // writelock is used to synchronize stepdowns w/ writes + uassert( 10056 , "not master", isMasterNs( ns ) ); + + // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit + if ( ! 
broadcast && handlePossibleShardedMessage( m , 0 ) ) + return; + + Client::Context ctx(ns); + + long long n = deleteObjects(ns, pattern, justOne, true); + lastError.getSafe()->recordDelete( n ); + } + + QueryResult* emptyMoreResult(long long); + + void OpTime::waitForDifferent(unsigned millis){ + DEV d.dbMutex.assertAtLeastReadLocked(); + + if (*this != last) return; // check early + + boost::xtime timeout; + boost::xtime_get(&timeout, boost::TIME_UTC); + + timeout.nsec += millis * 1000*1000; + if (timeout.nsec >= 1000*1000*1000){ + timeout.nsec -= 1000*1000*1000; + timeout.sec += 1; + } + + do { + dbtemprelease tmp; + boost::mutex::scoped_lock lk(notifyMutex()); + if (!notifier().timed_wait(lk, timeout)) + return; // timed out + } while (*this != last); + } + + bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { + bool ok = true; + + DbMessage d(m); + + const char *ns = d.getns(); + int ntoreturn = d.pullInt(); + long long cursorid = d.pullInt64(); + + curop.debug().ns = ns; + curop.debug().ntoreturn = ntoreturn; + curop.debug().cursorid = cursorid; + + time_t start = 0; + int pass = 0; + bool exhaust = false; + QueryResult* msgdata; + OpTime last; + while( 1 ) { + try { + Client::ReadContext ctx(ns); + if (str::startsWith(ns, "local.oplog.")){ + if (pass == 0) + last = OpTime::last_inlock(); + else + last.waitForDifferent(1000/*ms*/); + } + msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); + } + catch ( AssertionException& e ) { + exhaust = false; + curop.debug().exceptionInfo = e.getInfo(); + msgdata = emptyMoreResult(cursorid); + ok = false; + } + if (msgdata == 0) { + exhaust = false; + massert(13073, "shutting down", !inShutdown() ); + if( pass == 0 ) { + start = time(0); + } + else { + if( time(0) - start >= 4 ) { + // after about 4 seconds, return. pass stops at 1000 normally. + // we want to return occasionally so slave can checkpoint. + pass = 10000; + } + } + pass++; + if (debug) + sleepmillis(20); + else + sleepmillis(2); + continue; + } + break; + }; + + Message *resp = new Message(); + resp->setData(msgdata, true); + curop.debug().responseLength = resp->header()->dataLen(); + curop.debug().nreturned = msgdata->nReturned; + + dbresponse.response = resp; + dbresponse.responseTo = m.header()->id; + + if( exhaust ) { + curop.debug().exhaust = true; + dbresponse.exhaust = ns; + } + + return ok; + } + + void checkAndInsert(const char *ns, /*modifies*/BSONObj& js) { + uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize); + { + // check no $ modifiers. note we only check top level. (scanning deep would be quite expensive) + BSONObjIterator i( js ); + while ( i.more() ) { + BSONElement e = i.next(); + uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' ); + } + } + theDataFileMgr.insertWithObjMod(ns, js, false); // js may be modified in the call to add an _id field. + logOp("i", ns, js); + } + + NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs) { + size_t i; + for (i=0; i<objs.size(); i++){ + try { + checkAndInsert(ns, objs[i]); + getDur().commitIfNeeded(); + } catch (const UserException&) { + if (!keepGoing || i == objs.size()-1){ + globalOpCounters.incInsertInWriteLock(i); + throw; + } + // otherwise ignore and keep going + } + } + + globalOpCounters.incInsertInWriteLock(i); + } + + void receivedInsert(Message& m, CurOp& op) { + DbMessage d(m); + const char *ns = d.getns(); + op.debug().ns = ns; + + if( !d.moreJSObjs() ) { + // strange. 
should we complain? + return; + } + BSONObj first = d.nextJsObj(); + + vector<BSONObj> multi; + while (d.moreJSObjs()){ + if (multi.empty()) // first pass + multi.push_back(first); + multi.push_back( d.nextJsObj() ); + } + + writelock lk(ns); + //LockCollectionExclusively lk(ns); + + // CONCURRENCY TODO: is being read locked in big log sufficient here? + // writelock is used to synchronize stepdowns w/ writes + uassert( 10058 , "not master", isMasterNs(ns) ); + + if ( handlePossibleShardedMessage( m , 0 ) ) + return; + + Client::Context ctx(ns); + + if( !multi.empty() ) { + const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError; + insertMulti(keepGoing, ns, multi); + return; + } + + checkAndInsert(ns, first); + globalOpCounters.incInsertInWriteLock(1); + } + + void getDatabaseNames( vector< string > &names , const string& usePath ) { + boost::filesystem::path path( usePath ); + for ( boost::filesystem::directory_iterator i( path ); + i != boost::filesystem::directory_iterator(); ++i ) { + if ( directoryperdb ) { + boost::filesystem::path p = *i; + string dbName = p.leaf(); + p /= ( dbName + ".ns" ); + if ( MMF::exists( p ) ) + names.push_back( dbName ); + } + else { + string fileName = boost::filesystem::path(*i).leaf(); + if ( fileName.length() > 3 && fileName.substr( fileName.length() - 3, 3 ) == ".ns" ) + names.push_back( fileName.substr( 0, fileName.length() - 3 ) ); + } + } + } + + /* returns true if there is data on this server. useful when starting replication. + local database does NOT count except for rsoplog collection. + used to set the hasData field on replset heartbeat command response + */ + bool replHasDatabases() { + vector<string> names; + getDatabaseNames(names); + if( names.size() >= 2 ) return true; + if( names.size() == 1 ) { + if( names[0] != "local" ) + return true; + // we have a local database. return true if oplog isn't empty + { + readlock lk(rsoplog); + BSONObj o; + if( Helpers::getFirst(rsoplog, o) ) + return true; + } + } + return false; + } + + bool DBDirectClient::call( Message &toSend, Message &response, bool assertOk , string * actualServer ) { + if ( lastError._get() ) + lastError.startRequest( toSend, lastError._get() ); + DbResponse dbResponse; + assembleResponse( toSend, dbResponse , _clientHost ); + assert( dbResponse.response ); + dbResponse.response->concat(); // can get rid of this if we make response handling smarter + response = *dbResponse.response; + getDur().commitIfNeeded(); + return true; + } + + void DBDirectClient::say( Message &toSend, bool isRetry ) { + if ( lastError._get() ) + lastError.startRequest( toSend, lastError._get() ); + DbResponse dbResponse; + assembleResponse( toSend, dbResponse , _clientHost ); + getDur().commitIfNeeded(); + } + + auto_ptr<DBClientCursor> DBDirectClient::query(const string &ns, Query query, int nToReturn , int nToSkip , + const BSONObj *fieldsToReturn , int queryOptions ) { + + //if ( ! 
query.obj.isEmpty() || nToReturn != 0 || nToSkip != 0 || fieldsToReturn || queryOptions ) + return DBClientBase::query( ns , query , nToReturn , nToSkip , fieldsToReturn , queryOptions ); + // + //assert( query.obj.isEmpty() ); + //throw UserException( (string)"yay:" + ns ); + } + + void DBDirectClient::killCursor( long long id ) { + ClientCursor::erase( id ); + } + + HostAndPort DBDirectClient::_clientHost = HostAndPort( "0.0.0.0" , 0 ); + + unsigned long long DBDirectClient::count(const string &ns, const BSONObj& query, int options, int limit, int skip ) { + LockCollectionForReading lk( ns ); + string errmsg; + long long res = runCount( ns.c_str() , _countCmd( ns , query , options , limit , skip ) , errmsg ); + if ( res == -1 ) + return 0; + uassert( 13637 , str::stream() << "count failed in DBDirectClient: " << errmsg , res >= 0 ); + return (unsigned long long )res; + } + + DBClientBase * createDirectClient() { + return new DBDirectClient(); + } + + mongo::mutex exitMutex("exit"); + AtomicUInt numExitCalls = 0; + + bool inShutdown() { + return numExitCalls > 0; + } + + void tryToOutputFatal( const string& s ) { + try { + rawOut( s ); + return; + } + catch ( ... ) {} + + try { + cerr << s << endl; + return; + } + catch ( ... ) {} + + // uh - oh, not sure there is anything else we can do... + } + + /** also called by ntservice.cpp */ + void shutdownServer() { + + log() << "shutdown: going to close listening sockets..." << endl; + ListeningSockets::get()->closeAll(); + + log() << "shutdown: going to flush diaglog..." << endl; + _diaglog.flush(); + + /* must do this before unmapping mem or you may get a seg fault */ + log() << "shutdown: going to close sockets..." << endl; + boost::thread close_socket_thread( boost::bind(MessagingPort::closeAllSockets, 0) ); + + // wait until file preallocation finishes + // we would only hang here if the file_allocator code generates a + // synchronous signal, which we don't expect + log() << "shutdown: waiting for fs preallocator..." << endl; + FileAllocator::get()->waitUntilFinished(); + + if( cmdLine.dur ) { + log() << "shutdown: lock for final commit..." << endl; + { + int n = 10; + while( 1 ) { + // we may already be in a read lock from earlier in the call stack, so do read lock here + // to be consistent with that. + readlocktry w("", 20000); + if( w.got() ) { + log() << "shutdown: final commit..." << endl; + getDur().commitNow(); + break; + } + if( --n <= 0 ) { + log() << "shutdown: couldn't acquire write lock, aborting" << endl; + mongoAbort("couldn't acquire write lock"); + } + log() << "shutdown: waiting for write lock..." << endl; + } + } + MemoryMappedFile::flushAll(true); + } + + log() << "shutdown: closing all files..." << endl; + stringstream ss3; + MemoryMappedFile::closeAllFiles( ss3 ); + log() << ss3.str() << endl; + + if( cmdLine.dur ) { + dur::journalCleanup(true); + } + +#if !defined(__sunos__) + if ( lockFile ) { + log() << "shutdown: removing fs lock..." << endl; + /* This ought to be an unlink(), but Eliot says the last + time that was attempted, there was a race condition + with acquirePathLock(). 
*/ +#ifdef _WIN32 + if( _chsize( lockFile , 0 ) ) + log() << "couldn't remove fs lock " << WSAGetLastError() << endl; + CloseHandle(lockFileHandle); +#else + if( ftruncate( lockFile , 0 ) ) + log() << "couldn't remove fs lock " << errnoWithDescription() << endl; + flock( lockFile, LOCK_UN ); +#endif + } +#endif + } + + void exitCleanly( ExitCode code ) { + killCurrentOp.killAll(); + { + dblock lk; + log() << "now exiting" << endl; + dbexit( code ); + } + } + + + namespace dur { + extern mutex groupCommitMutex; + } + + /* not using log() herein in case we are already locked */ + NOINLINE_DECL void dbexit( ExitCode rc, const char *why, bool tryToGetLock ) { + + auto_ptr<writelocktry> wlt; + if ( tryToGetLock ) { + wlt.reset( new writelocktry( "" , 2 * 60 * 1000 ) ); + uassert( 13455 , "dbexit timed out getting lock" , wlt->got() ); + } + + Client * c = currentClient.get(); + { + scoped_lock lk( exitMutex ); + if ( numExitCalls++ > 0 ) { + if ( numExitCalls > 5 ) { + // this means something horrible has happened + ::_exit( rc ); + } + stringstream ss; + ss << "dbexit: " << why << "; exiting immediately"; + tryToOutputFatal( ss.str() ); + if ( c ) c->shutdown(); + ::exit( rc ); + } + } + + { + stringstream ss; + ss << "dbexit: " << why; + tryToOutputFatal( ss.str() ); + } + + try { + shutdownServer(); // gracefully shutdown instance + } + catch ( ... ) { + tryToOutputFatal( "shutdown failed with exception" ); + } + +#if defined(_DEBUG) + try { + mutexDebugger.programEnding(); + } + catch (...) { } +#endif + + // block the dur thread from doing any work for the rest of the run + log(2) << "shutdown: groupCommitMutex" << endl; + scoped_lock lk(dur::groupCommitMutex); + +#ifdef _WIN32 + // Windows Service Controller wants to be told when we are down, + // so don't call ::exit() yet, or say "really exiting now" + // + if ( rc == EXIT_WINDOWS_SERVICE_STOP ) { + if ( c ) c->shutdown(); + return; + } +#endif + tryToOutputFatal( "dbexit: really exiting now" ); + if ( c ) c->shutdown(); + ::exit(rc); + } + +#if !defined(__sunos__) + void writePid(int fd) { + stringstream ss; + ss << getpid() << endl; + string s = ss.str(); + const char * data = s.c_str(); +#ifdef _WIN32 + assert ( _write( fd, data, strlen( data ) ) ); +#else + assert ( write( fd, data, strlen( data ) ) ); +#endif + } + + void acquirePathLock(bool doingRepair) { + string name = ( boost::filesystem::path( dbpath ) / "mongod.lock" ).native_file_string(); + + bool oldFile = false; + + if ( boost::filesystem::exists( name ) && boost::filesystem::file_size( name ) > 0 ) { + oldFile = true; + } + +#ifdef _WIN32 + lockFileHandle = CreateFileA( name.c_str(), GENERIC_READ | GENERIC_WRITE, + 0 /* do not allow anyone else access */, NULL, + OPEN_ALWAYS /* success if fh can open */, 0, NULL ); + + if (lockFileHandle == INVALID_HANDLE_VALUE) { + DWORD code = GetLastError(); + char *msg; + FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, + NULL, code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR)&msg, 0, NULL); + string m = msg; + str::stripTrailing(m, "\r\n"); + uasserted( 13627 , str::stream() << "Unable to create/open lock file: " << name << ' ' << m << ". Is a mongod instance already running?" 
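+        // to summarize the lock protocol: create/open "mongod.lock", hold it
+        // exclusively for the life of the process, and (below) truncate it and
+        // write our pid; a pre-existing non-empty file suggests unclean shutdown.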
); + } + lockFile = _open_osfhandle((intptr_t)lockFileHandle, 0); +#else + lockFile = open( name.c_str(), O_RDWR | O_CREAT , S_IRWXU | S_IRWXG | S_IRWXO ); + if( lockFile <= 0 ) { + uasserted( 10309 , str::stream() << "Unable to create/open lock file: " << name << ' ' << errnoWithDescription() << " Is a mongod instance already running?" ); + } + if (flock( lockFile, LOCK_EX | LOCK_NB ) != 0) { + close ( lockFile ); + lockFile = 0; + uassert( 10310 , "Unable to lock file: " + name + ". Is a mongod instance already running?", 0 ); + } +#endif + + if ( oldFile ) { + // we check this here because we want to see if we can get the lock + // if we can't, then its probably just another mongod running + + string errmsg; + if (cmdLine.dur) { + if (!dur::haveJournalFiles()) { + + vector<string> dbnames; + getDatabaseNames( dbnames ); + + if ( dbnames.size() == 0 ) { + // this means that mongod crashed + // between initial startup and when journaling was initialized + // it is safe to continue + } + else { + errmsg = str::stream() + << "************** \n" + << "old lock file: " << name << ". probably means unclean shutdown,\n" + << "but there are no journal files to recover.\n" + << "this is likely human error or filesystem corruption.\n" + << "found " << dbnames.size() << " dbs.\n" + << "see: http://dochub.mongodb.org/core/repair for more information\n" + << "*************"; + } + + + } + } + else { + if (!dur::haveJournalFiles() && !doingRepair) { + errmsg = str::stream() + << "************** \n" + << "Unclean shutdown detected.\n" + << "Please visit http://dochub.mongodb.org/core/repair for recovery instructions.\n" + << "*************"; + } + } + + if (!errmsg.empty()) { + cout << errmsg << endl; +#ifdef _WIN32 + CloseHandle( lockFileHandle ); +#else + close ( lockFile ); +#endif + lockFile = 0; + uassert( 12596 , "old lock file" , 0 ); + } + } + + // Not related to lock file, but this is where we handle unclean shutdown + if( !cmdLine.dur && dur::haveJournalFiles() ) { + cout << "**************" << endl; + cout << "Error: journal files are present in journal directory, yet starting without journaling enabled." << endl; + cout << "It is recommended that you start with journaling enabled so that recovery may occur." << endl; + cout << "**************" << endl; + uasserted(13597, "can't start without --journal enabled when journal/ files are present"); + } + +#ifdef _WIN32 + uassert( 13625, "Unable to truncate lock file", _chsize(lockFile, 0) == 0); + writePid( lockFile ); + _commit( lockFile ); +#else + uassert( 13342, "Unable to truncate lock file", ftruncate(lockFile, 0) == 0); + writePid( lockFile ); + fsync( lockFile ); + flushMyDirectory(name); +#endif + } +#else + void acquirePathLock(bool) { + // TODO - this is very bad that the code above not running here. + + // Not related to lock file, but this is where we handle unclean shutdown + if( !cmdLine.dur && dur::haveJournalFiles() ) { + cout << "**************" << endl; + cout << "Error: journal files are present in journal directory, yet starting without --journal enabled." << endl; + cout << "It is recommended that you start with journaling enabled so that recovery may occur." 
<< endl; + cout << "Alternatively (not recommended), you can backup everything, then delete the journal files, and run --repair" << endl; + cout << "**************" << endl; + uasserted(13618, "can't start without --journal enabled when journal/ files are present"); + } + } +#endif + +} // namespace mongo diff --git a/src/mongo/db/instance.h b/src/mongo/db/instance.h new file mode 100644 index 00000000000..9dde729997d --- /dev/null +++ b/src/mongo/db/instance.h @@ -0,0 +1,174 @@ +// instance.h : Global state functions. +// + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + + +#include "../client/dbclient.h" +#include "curop-inl.h" +#include "security.h" +#include "cmdline.h" +#include "client.h" + +namespace mongo { + + extern string dbExecCommand; + + /** a high level recording of operations to the database - sometimes used for diagnostics + and debugging. + */ + class DiagLog { + ofstream *f; // note this is never freed + /* 0 = off; 1 = writes, 2 = reads, 3 = both + 7 = log a few reads, and all writes. + */ + int level; + mongo::mutex mutex; + void openFile() { + assert( f == 0 ); + stringstream ss; + ss << dbpath << "/diaglog." << hex << time(0); + string name = ss.str(); + f = new ofstream(name.c_str(), ios::out | ios::binary); + if ( ! f->good() ) { + problem() << "diagLogging couldn't open " << name << endl; + // todo what is this? : + throw 1717; + } + else { + log() << "diagLogging using file " << name << endl; + } + } + public: + DiagLog() : f(0) , level(0), mutex("DiagLog") { } + int getLevel() const { return level; } + /** + * @return old + */ + int setLevel( int newLevel ) { + scoped_lock lk(mutex); + int old = level; + log() << "diagLogging level=" << newLevel << endl; + if( f == 0 ) { + openFile(); + } + level = newLevel; // must be done AFTER f is set + return old; + } + void flush() { + if ( level ) { + log() << "flushing diag log" << endl; + scoped_lock lk(mutex); + f->flush(); + } + } + void write(char *data,int len) { + if ( level & 1 ) { + scoped_lock lk(mutex); + f->write(data,len); + } + } + void readop(char *data, int len) { + if ( level & 2 ) { + bool log = (level & 4) == 0; + OCCASIONALLY log = true; + if ( log ) { + scoped_lock lk(mutex); + assert( f ); + f->write(data,len); + } + } + } + }; + + extern DiagLog _diaglog; + + /* we defer response until we unlock. don't want a blocked socket to + keep things locked. + */ + struct DbResponse { + Message *response; + MSGID responseTo; + const char *exhaust; /* points to ns if exhaust mode. 0=normal mode*/ + DbResponse(Message *r, MSGID rt) : response(r), responseTo(rt), exhaust(0) { } + DbResponse() { + response = 0; + exhaust = 0; + } + ~DbResponse() { delete response; } + }; + + void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort &client ); + + void getDatabaseNames( vector< string > &names , const string& usePath = dbpath ); + + /* returns true if there is no data on this server. 
useful when starting replication. + local database does NOT count. + */ + bool replHasDatabases(); + + /** "embedded" calls to the local server directly. + Caller does not need to lock, that is handled within. + */ + class DBDirectClient : public DBClientBase { + public: + virtual auto_ptr<DBClientCursor> query(const string &ns, Query query, int nToReturn = 0, int nToSkip = 0, + const BSONObj *fieldsToReturn = 0, int queryOptions = 0); + + virtual bool isFailed() const { + return false; + } + virtual string toString() { + return "DBDirectClient"; + } + virtual string getServerAddress() const { + return "localhost"; // TODO: should this have the port? + } + virtual bool call( Message &toSend, Message &response, bool assertOk=true , string * actualServer = 0 ); + virtual void say( Message &toSend, bool isRetry = false ); + virtual void sayPiggyBack( Message &toSend ) { + // don't need to piggy back when connected locally + return say( toSend ); + } + + virtual void killCursor( long long cursorID ); + + virtual bool callRead( Message& toSend , Message& response ) { + return call( toSend , response ); + } + + virtual unsigned long long count(const string &ns, const BSONObj& query = BSONObj(), int options=0, int limit=0, int skip=0 ); + + virtual ConnectionString::ConnectionType type() const { return ConnectionString::MASTER; } + + double getSoTimeout() const { return 0; } + + virtual bool lazySupported() const { return true; } + private: + static HostAndPort _clientHost; + }; + + extern int lockFile; +#ifdef _WIN32 + extern HANDLE lockFileHandle; +#endif + void acquirePathLock(bool doingRepair=false); // if doingRepair=true don't consider unclean shutdown an error + void maybeCreatePidFile(); + +} // namespace mongo diff --git a/src/mongo/db/introspect.cpp b/src/mongo/db/introspect.cpp new file mode 100644 index 00000000000..7e1d19ce2f3 --- /dev/null +++ b/src/mongo/db/introspect.cpp @@ -0,0 +1,88 @@ +// introspect.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "introspect.h" +#include "../bson/util/builder.h" +#include "../util/goodies.h" +#include "pdfile.h" +#include "jsobj.h" +#include "pdfile.h" +#include "curop.h" + +namespace mongo { + + BufBuilder profileBufBuilder; // reused, instead of allocated every time - avoids a malloc/free cycle + + void profile( const Client& c , CurOp& currentOp ) { + assertInWriteLock(); + + Database *db = c.database(); + DEV assert( db ); + const char *ns = db->profileName.c_str(); + + // build object + profileBufBuilder.reset(); + BSONObjBuilder b(profileBufBuilder); + b.appendDate("ts", jsTime()); + currentOp.debug().append( currentOp , b ); + + b.append("client", c.clientAddress() ); + + if ( c.getAuthenticationInfo() ) + b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); + + BSONObj p = b.done(); + + if (p.objsize() > 100*1024){ + string small = p.toString(/*isArray*/false, /*full*/false); + + warning() << "can't add full line to system.profile: " << small; + + // rebuild with limited info + BSONObjBuilder b(profileBufBuilder); + b.appendDate("ts", jsTime()); + b.append("client", c.clientAddress() ); + if ( c.getAuthenticationInfo() ) + b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); + + b.append("err", "profile line too large (max is 100KB)"); + if (small.size() < 100*1024){ // should be much smaller but if not don't break anything + b.append("abbreviated", small); + } + + p = b.done(); + } + + // write: not replicated + NamespaceDetails *d = db->namespaceIndex.details(ns); + if( d ) { + int len = p.objsize(); + Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len); + memcpy(getDur().writingPtr(r->data, len), p.objdata(), len); + } + else { + static time_t last; + if( time(0) > last+10 ) { + log() << "profile: warning ns " << ns << " does not exist" << endl; + last = time(0); + } + } + } + +} // namespace mongo diff --git a/src/mongo/db/introspect.h b/src/mongo/db/introspect.h new file mode 100644 index 00000000000..209eeacab7c --- /dev/null +++ b/src/mongo/db/introspect.h @@ -0,0 +1,34 @@ +// introspect.h +// system management stuff. + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../pch.h" +#include "jsobj.h" +#include "pdfile.h" + +namespace mongo { + + /* --- profiling -------------------------------------------- + do when database->profile is set + */ + + void profile( const Client& c , CurOp& currentOp ); + +} // namespace mongo diff --git a/src/mongo/db/javatest.cpp b/src/mongo/db/javatest.cpp new file mode 100644 index 00000000000..22f2bdf8d3c --- /dev/null +++ b/src/mongo/db/javatest.cpp @@ -0,0 +1,24 @@ +// javatest.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "javajs.h" + +int main() { + JavaJS = new JavaJSImpl(); + javajstest(); +} diff --git a/src/mongo/db/jsobj.cpp b/src/mongo/db/jsobj.cpp new file mode 100644 index 00000000000..1e850982396 --- /dev/null +++ b/src/mongo/db/jsobj.cpp @@ -0,0 +1,1268 @@ +/** @file jsobj.cpp - BSON implementation + http://www.mongodb.org/display/DOCS/BSON +*/ + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pch.h" +#include "../bson/oid.h" +#include "jsobj.h" +#include "nonce.h" +#include "../bson/util/atomic_int.h" +#include "../util/base64.h" +#include "../util/md5.hpp" +#include <limits> +#include <cmath> +#include "../util/unittest.h" +#include "../util/embedded_builder.h" +#include "../util/stringutils.h" +#include "../util/mongoutils/str.h" +#include "json.h" +#include "jsobjmanipulator.h" +#include "../util/optime.h" +#include <boost/static_assert.hpp> +#undef assert +#define assert MONGO_assert + +// make sure our assumptions are valid +BOOST_STATIC_ASSERT( sizeof(short) == 2 ); +BOOST_STATIC_ASSERT( sizeof(int) == 4 ); +BOOST_STATIC_ASSERT( sizeof(long long) == 8 ); +BOOST_STATIC_ASSERT( sizeof(double) == 8 ); +BOOST_STATIC_ASSERT( sizeof(mongo::Date_t) == 8 ); +BOOST_STATIC_ASSERT( sizeof(mongo::OID) == 12 ); + +namespace mongo { + + BSONElement eooElement; + + GENOIDLabeler GENOID; + + DateNowLabeler DATENOW; + NullLabeler BSONNULL; + + MinKeyLabeler MINKEY; + MaxKeyLabeler MAXKEY; + + // need to move to bson/, but has dependency on base64 so move that to bson/util/ first. + inline string BSONElement::jsonString( JsonStringFormat format, bool includeFieldNames, int pretty ) const { + BSONType t = type(); + int sign; + if ( t == Undefined ) + return "undefined"; + + stringstream s; + if ( includeFieldNames ) + s << '"' << escape( fieldName() ) << "\" : "; + switch ( type() ) { + case mongo::String: + case Symbol: + s << '"' << escape( string(valuestr(), valuestrsize()-1) ) << '"'; + break; + case NumberLong: + s << _numberLong(); + break; + case NumberInt: + case NumberDouble: + if ( number() >= -numeric_limits< double >::max() && + number() <= numeric_limits< double >::max() ) { + s.precision( 16 ); + s << number(); + } + else if ( mongo::isNaN(number()) ) { + s << "NaN"; + } + else if ( mongo::isInf(number(), &sign) ) { + s << ( sign == 1 ? "Infinity" : "-Infinity"); + } + else { + StringBuilder ss; + ss << "Number " << number() << " cannot be represented in JSON"; + string message = ss.str(); + massert( 10311 , message.c_str(), false ); + } + break; + case mongo::Bool: + s << ( boolean() ? 
"true" : "false" ); + break; + case jstNULL: + s << "null"; + break; + case Object: + s << embeddedObject().jsonString( format, pretty ); + break; + case mongo::Array: { + if ( embeddedObject().isEmpty() ) { + s << "[]"; + break; + } + s << "[ "; + BSONObjIterator i( embeddedObject() ); + BSONElement e = i.next(); + if ( !e.eoo() ) { + int count = 0; + while ( 1 ) { + if( pretty ) { + s << '\n'; + for( int x = 0; x < pretty; x++ ) + s << " "; + } + + if (strtol(e.fieldName(), 0, 10) > count) { + s << "undefined"; + } + else { + s << e.jsonString( format, false, pretty?pretty+1:0 ); + e = i.next(); + } + count++; + if ( e.eoo() ) + break; + s << ", "; + } + } + s << " ]"; + break; + } + case DBRef: { + mongo::OID *x = (mongo::OID *) (valuestr() + valuestrsize()); + if ( format == TenGen ) + s << "Dbref( "; + else + s << "{ \"$ref\" : "; + s << '"' << valuestr() << "\", "; + if ( format != TenGen ) + s << "\"$id\" : "; + s << '"' << *x << "\" "; + if ( format == TenGen ) + s << ')'; + else + s << '}'; + break; + } + case jstOID: + if ( format == TenGen ) { + s << "ObjectId( "; + } + else { + s << "{ \"$oid\" : "; + } + s << '"' << __oid() << '"'; + if ( format == TenGen ) { + s << " )"; + } + else { + s << " }"; + } + break; + case BinData: { + int len = *(int *)( value() ); + BinDataType type = BinDataType( *(char *)( (int *)( value() ) + 1 ) ); + s << "{ \"$binary\" : \""; + char *start = ( char * )( value() ) + sizeof( int ) + 1; + base64::encode( s , start , len ); + s << "\", \"$type\" : \"" << hex; + s.width( 2 ); + s.fill( '0' ); + s << type << dec; + s << "\" }"; + break; + } + case mongo::Date: + if ( format == Strict ) + s << "{ \"$date\" : "; + else + s << "Date( "; + if( pretty ) { + Date_t d = date(); + if( d == 0 ) s << '0'; + else + s << '"' << date().toString() << '"'; + } + else + s << date(); + if ( format == Strict ) + s << " }"; + else + s << " )"; + break; + case RegEx: + if ( format == Strict ) { + s << "{ \"$regex\" : \"" << escape( regex() ); + s << "\", \"$options\" : \"" << regexFlags() << "\" }"; + } + else { + s << "/" << escape( regex() , true ) << "/"; + // FIXME Worry about alpha order? + for ( const char *f = regexFlags(); *f; ++f ) { + switch ( *f ) { + case 'g': + case 'i': + case 'm': + s << *f; + default: + break; + } + } + } + break; + + case CodeWScope: { + BSONObj scope = codeWScopeObject(); + if ( ! 
scope.isEmpty() ) { + s << "{ \"$code\" : " << _asCode() << " , " + << " \"$scope\" : " << scope.jsonString() << " }"; + break; + } + } + + case Code: + s << _asCode(); + break; + + case Timestamp: + s << "{ \"t\" : " << timestampTime() << " , \"i\" : " << timestampInc() << " }"; + break; + + case MinKey: + s << "{ \"$minKey\" : 1 }"; + break; + + case MaxKey: + s << "{ \"$maxKey\" : 1 }"; + break; + + default: + StringBuilder ss; + ss << "Cannot create a properly formatted JSON string with " + << "element: " << toString() << " of type: " << type(); + string message = ss.str(); + massert( 10312 , message.c_str(), false ); + } + return s.str(); + } + + int BSONElement::getGtLtOp( int def ) const { + const char *fn = fieldName(); + if ( fn[0] == '$' && fn[1] ) { + if ( fn[2] == 't' ) { + if ( fn[1] == 'g' ) { + if ( fn[3] == 0 ) return BSONObj::GT; + else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::GTE; + } + else if ( fn[1] == 'l' ) { + if ( fn[3] == 0 ) return BSONObj::LT; + else if ( fn[3] == 'e' && fn[4] == 0 ) return BSONObj::LTE; + } + } + else if ( fn[1] == 'n' && fn[2] == 'e' ) { + if ( fn[3] == 0 ) + return BSONObj::NE; + if ( fn[3] == 'a' && fn[4] == 'r') // matches anything with $near prefix + return BSONObj::opNEAR; + } + else if ( fn[1] == 'm' ) { + if ( fn[2] == 'o' && fn[3] == 'd' && fn[4] == 0 ) + return BSONObj::opMOD; + if ( fn[2] == 'a' && fn[3] == 'x' && fn[4] == 'D' && fn[5] == 'i' && fn[6] == 's' && fn[7] == 't' && fn[8] == 'a' && fn[9] == 'n' && fn[10] == 'c' && fn[11] == 'e' && fn[12] == 0 ) + return BSONObj::opMAX_DISTANCE; + } + else if ( fn[1] == 't' && fn[2] == 'y' && fn[3] == 'p' && fn[4] == 'e' && fn[5] == 0 ) + return BSONObj::opTYPE; + else if ( fn[1] == 'i' && fn[2] == 'n' && fn[3] == 0 ) + return BSONObj::opIN; + else if ( fn[1] == 'n' && fn[2] == 'i' && fn[3] == 'n' && fn[4] == 0 ) + return BSONObj::NIN; + else if ( fn[1] == 'a' && fn[2] == 'l' && fn[3] == 'l' && fn[4] == 0 ) + return BSONObj::opALL; + else if ( fn[1] == 's' && fn[2] == 'i' && fn[3] == 'z' && fn[4] == 'e' && fn[5] == 0 ) + return BSONObj::opSIZE; + else if ( fn[1] == 'e' ) { + if ( fn[2] == 'x' && fn[3] == 'i' && fn[4] == 's' && fn[5] == 't' && fn[6] == 's' && fn[7] == 0 ) + return BSONObj::opEXISTS; + if ( fn[2] == 'l' && fn[3] == 'e' && fn[4] == 'm' && fn[5] == 'M' && fn[6] == 'a' && fn[7] == 't' && fn[8] == 'c' && fn[9] == 'h' && fn[10] == 0 ) + return BSONObj::opELEM_MATCH; + } + else if ( fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'g' && fn[4] == 'e' && fn[5] == 'x' && fn[6] == 0 ) + return BSONObj::opREGEX; + else if ( fn[1] == 'o' && fn[2] == 'p' && fn[3] == 't' && fn[4] == 'i' && fn[5] == 'o' && fn[6] == 'n' && fn[7] == 's' && fn[8] == 0 ) + return BSONObj::opOPTIONS; + else if ( fn[1] == 'w' && fn[2] == 'i' && fn[3] == 't' && fn[4] == 'h' && fn[5] == 'i' && fn[6] == 'n' && fn[7] == 0 ) + return BSONObj::opWITHIN; + } + return def; + } + + /* Matcher --------------------------------------*/ + +// If the element is something like: +// a : { $gt : 3 } +// we append +// a : 3 +// else we just append the element. 
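+// (the fieldName check below unwraps any of $gt, $gte, $lt, $lte the same way)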
+// + void appendElementHandlingGtLt(BSONObjBuilder& b, const BSONElement& e) { + if ( e.type() == Object ) { + BSONElement fe = e.embeddedObject().firstElement(); + const char *fn = fe.fieldName(); + if ( fn[0] == '$' && fn[1] && fn[2] == 't' ) { + b.appendAs(fe, e.fieldName()); + return; + } + } + b.append(e); + } + + int getGtLtOp(const BSONElement& e) { + if ( e.type() != Object ) + return BSONObj::Equality; + + BSONElement fe = e.embeddedObject().firstElement(); + return fe.getGtLtOp(); + } + + FieldCompareResult compareDottedFieldNames( const string& l , const string& r ) { + static int maxLoops = 1024 * 1024; + + size_t lstart = 0; + size_t rstart = 0; + + for ( int i=0; i<maxLoops; i++ ) { + + size_t a = l.find( '.' , lstart ); + size_t b = r.find( '.' , rstart ); + + size_t lend = a == string::npos ? l.size() : a; + size_t rend = b == string::npos ? r.size() : b; + + const string& c = l.substr( lstart , lend - lstart ); + const string& d = r.substr( rstart , rend - rstart ); + + int x = lexNumCmp( c.c_str(), d.c_str() ); + + if ( x < 0 ) + return LEFT_BEFORE; + if ( x > 0 ) + return RIGHT_BEFORE; + + lstart = lend + 1; + rstart = rend + 1; + + if ( lstart >= l.size() ) { + if ( rstart >= r.size() ) + return SAME; + return RIGHT_SUBFIELD; + } + if ( rstart >= r.size() ) + return LEFT_SUBFIELD; + } + + log() << "compareDottedFieldNames ERROR l: " << l << " r: " << r << " TOO MANY LOOPS" << endl; + assert(0); + return SAME; // will never get here + } + + /* BSONObj ------------------------------------------------------------*/ + + string BSONObj::md5() const { + md5digest d; + md5_state_t st; + md5_init(&st); + md5_append( &st , (const md5_byte_t*)_objdata , objsize() ); + md5_finish(&st, d); + return digestToString( d ); + } + + string BSONObj::jsonString( JsonStringFormat format, int pretty ) const { + + if ( isEmpty() ) return "{}"; + + StringBuilder s; + s << "{ "; + BSONObjIterator i(*this); + BSONElement e = i.next(); + if ( !e.eoo() ) + while ( 1 ) { + s << e.jsonString( format, true, pretty?pretty+1:0 ); + e = i.next(); + if ( e.eoo() ) + break; + s << ","; + if ( pretty ) { + s << '\n'; + for( int x = 0; x < pretty; x++ ) + s << " "; + } + else { + s << " "; + } + } + s << " }"; + return s.str(); + } + + bool BSONObj::valid() const { + try { + BSONObjIterator it(*this); + while( it.moreWithEOO() ) { + // both throw exception on failure + BSONElement e = it.next(true); + e.validate(); + + if (e.eoo()) { + if (it.moreWithEOO()) + return false; + return true; + } + else if (e.isABSONObj()) { + if(!e.embeddedObject().valid()) + return false; + } + else if (e.type() == CodeWScope) { + if(!e.codeWScopeObject().valid()) + return false; + } + } + } + catch (...) { + } + return false; + } + + int BSONObj::woCompare(const BSONObj& r, const Ordering &o, bool considerFieldName) const { + if ( isEmpty() ) + return r.isEmpty() ? 0 : -1; + if ( r.isEmpty() ) + return 1; + + BSONObjIterator i(*this); + BSONObjIterator j(r); + unsigned mask = 1; + while ( 1 ) { + // so far, equal... + + BSONElement l = i.next(); + BSONElement r = j.next(); + if ( l.eoo() ) + return r.eoo() ? 0 : -1; + if ( r.eoo() ) + return 1; + + int x; + { + x = l.woCompare( r, considerFieldName ); + if( o.descending(mask) ) + x = -x; + } + if ( x != 0 ) + return x; + mask <<= 1; + } + return -1; + } + + /* well ordered compare */ + int BSONObj::woCompare(const BSONObj &r, const BSONObj &idxKey, + bool considerFieldName) const { + if ( isEmpty() ) + return r.isEmpty() ? 
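+        // illustrative: with an Ordering built from { a : 1, b : -1 }, the mask walk
+        // below flips the comparison result for the second (descending) field.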
0 : -1; + if ( r.isEmpty() ) + return 1; + + bool ordered = !idxKey.isEmpty(); + + BSONObjIterator i(*this); + BSONObjIterator j(r); + BSONObjIterator k(idxKey); + while ( 1 ) { + // so far, equal... + + BSONElement l = i.next(); + BSONElement r = j.next(); + BSONElement o; + if ( ordered ) + o = k.next(); + if ( l.eoo() ) + return r.eoo() ? 0 : -1; + if ( r.eoo() ) + return 1; + + int x; + /* + if( ordered && o.type() == String && strcmp(o.valuestr(), "ascii-proto") == 0 && + l.type() == String && r.type() == String ) { + // note: no negative support yet, as this is just sort of a POC + x = _stricmp(l.valuestr(), r.valuestr()); + } + else*/ { + x = l.woCompare( r, considerFieldName ); + if ( ordered && o.number() < 0 ) + x = -x; + } + if ( x != 0 ) + return x; + } + return -1; + } + + BSONObj staticNull = fromjson( "{'':null}" ); + BSONObj makeUndefined() { + BSONObjBuilder b; + b.appendUndefined( "" ); + return b.obj(); + } + BSONObj staticUndefined = makeUndefined(); + + /* well ordered compare */ + int BSONObj::woSortOrder(const BSONObj& other, const BSONObj& sortKey , bool useDotted ) const { + if ( isEmpty() ) + return other.isEmpty() ? 0 : -1; + if ( other.isEmpty() ) + return 1; + + uassert( 10060 , "woSortOrder needs a non-empty sortKey" , ! sortKey.isEmpty() ); + + BSONObjIterator i(sortKey); + while ( 1 ) { + BSONElement f = i.next(); + if ( f.eoo() ) + return 0; + + BSONElement l = useDotted ? getFieldDotted( f.fieldName() ) : getField( f.fieldName() ); + if ( l.eoo() ) + l = staticNull.firstElement(); + BSONElement r = useDotted ? other.getFieldDotted( f.fieldName() ) : other.getField( f.fieldName() ); + if ( r.eoo() ) + r = staticNull.firstElement(); + + int x = l.woCompare( r, false ); + if ( f.number() < 0 ) + x = -x; + if ( x != 0 ) + return x; + } + return -1; + } + + template <typename BSONElementColl> + void _getFieldsDotted( const BSONObj* obj, const StringData& name, BSONElementColl &ret, bool expandLastArray ) { + BSONElement e = obj->getField( name ); + + if ( e.eoo() ) { + const char *p = strchr(name.data(), '.'); + if ( p ) { + string left(name.data(), p-name.data()); + const char* next = p+1; + BSONElement e = obj->getField( left.c_str() ); + + if (e.type() == Object) { + e.embeddedObject().getFieldsDotted(next, ret, expandLastArray ); + } + else if (e.type() == Array) { + bool allDigits = false; + if ( isdigit( *next ) ) { + const char * temp = next + 1; + while ( isdigit( *temp ) ) + temp++; + allDigits = (*temp == '.' 
|| *temp == '\0'); + } + if (allDigits) { + e.embeddedObject().getFieldsDotted(next, ret, expandLastArray ); + } + else { + BSONObjIterator i(e.embeddedObject()); + while ( i.more() ) { + BSONElement e2 = i.next(); + if (e2.type() == Object || e2.type() == Array) + e2.embeddedObject().getFieldsDotted(next, ret, expandLastArray ); + } + } + } + else { + // do nothing: no match + } + } + } + else { + if (e.type() == Array && expandLastArray) { + BSONObjIterator i(e.embeddedObject()); + while ( i.more() ) + ret.insert(i.next()); + } + else { + ret.insert(e); + } + } + } + + void BSONObj::getFieldsDotted(const StringData& name, BSONElementSet &ret, bool expandLastArray ) const { + _getFieldsDotted( this, name, ret, expandLastArray ); + } + void BSONObj::getFieldsDotted(const StringData& name, BSONElementMSet &ret, bool expandLastArray ) const { + _getFieldsDotted( this, name, ret, expandLastArray ); + } + + BSONElement BSONObj::getFieldDottedOrArray(const char *&name) const { + const char *p = strchr(name, '.'); + + BSONElement sub; + + if ( p ) { + sub = getField( string(name, p-name) ); + name = p + 1; + } + else { + sub = getField( name ); + name = name + strlen(name); + } + + if ( sub.eoo() ) + return eooElement; + else if ( sub.type() == Array || name[0] == '\0' ) + return sub; + else if ( sub.type() == Object ) + return sub.embeddedObject().getFieldDottedOrArray( name ); + else + return eooElement; + } + + /** + sets element field names to empty string + If a field in pattern is missing, it is omitted from the returned + object. + */ + BSONObj BSONObj::extractFieldsUnDotted(BSONObj pattern) const { + BSONObjBuilder b; + BSONObjIterator i(pattern); + while ( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + BSONElement x = getField(e.fieldName()); + if ( !x.eoo() ) + b.appendAs(x, ""); + } + return b.obj(); + } + + BSONObj BSONObj::extractFields(const BSONObj& pattern , bool fillWithNull ) const { + BSONObjBuilder b(32); // scanandorder.h can make a zillion of these, so we start the allocation very small + BSONObjIterator i(pattern); + while ( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + BSONElement x = getFieldDotted(e.fieldName()); + if ( ! 
x.eoo() ) + b.appendAs( x, e.fieldName() ); + else if ( fillWithNull ) + b.appendNull( e.fieldName() ); + } + return b.obj(); + } + + BSONObj BSONObj::filterFieldsUndotted( const BSONObj &filter, bool inFilter ) const { + BSONObjBuilder b; + BSONObjIterator i( *this ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + BSONElement x = filter.getField( e.fieldName() ); + if ( ( x.eoo() && !inFilter ) || + ( !x.eoo() && inFilter ) ) + b.append( e ); + } + return b.obj(); + } + + BSONElement BSONObj::getFieldUsingIndexNames(const char *fieldName, const BSONObj &indexKey) const { + BSONObjIterator i( indexKey ); + int j = 0; + while( i.moreWithEOO() ) { + BSONElement f = i.next(); + if ( f.eoo() ) + return BSONElement(); + if ( strcmp( f.fieldName(), fieldName ) == 0 ) + break; + ++j; + } + BSONObjIterator k( *this ); + while( k.moreWithEOO() ) { + BSONElement g = k.next(); + if ( g.eoo() ) + return BSONElement(); + if ( j == 0 ) { + return g; + } + --j; + } + return BSONElement(); + } + + /* grab names of all the fields in this object */ + int BSONObj::getFieldNames(set<string>& fields) const { + int n = 0; + BSONObjIterator i(*this); + while ( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + fields.insert(e.fieldName()); + n++; + } + return n; + } + + /* note: addFields always adds _id even if not specified + returns n added not counting _id unless requested. + */ + int BSONObj::addFields(BSONObj& from, set<string>& fields) { + assert( isEmpty() && !isOwned() ); /* partial implementation for now... */ + + BSONObjBuilder b; + + int N = fields.size(); + int n = 0; + BSONObjIterator i(from); + bool gotId = false; + while ( i.moreWithEOO() ) { + BSONElement e = i.next(); + const char *fname = e.fieldName(); + if ( fields.count(fname) ) { + b.append(e); + ++n; + gotId = gotId || strcmp(fname, "_id")==0; + if ( n == N && gotId ) + break; + } + else if ( strcmp(fname, "_id")==0 ) { + b.append(e); + gotId = true; + if ( n == N && gotId ) + break; + } + } + + if ( n ) { + *this = b.obj(); + } + + return n; + } + + bool BSONObj::couldBeArray() const { + BSONObjIterator i( *this ); + int index = 0; + while( i.moreWithEOO() ){ + BSONElement e = i.next(); + if( e.eoo() ) break; + + // TODO: If actually important, may be able to do int->char* much faster + if( strcmp( e.fieldName(), ((string)( mongoutils::str::stream() << index )).c_str() ) != 0 ) + return false; + index++; + } + return true; + } + + BSONObj BSONObj::clientReadable() const { + BSONObjBuilder b; + BSONObjIterator i( *this ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + switch( e.type() ) { + case MinKey: { + BSONObjBuilder m; + m.append( "$minElement", 1 ); + b.append( e.fieldName(), m.done() ); + break; + } + case MaxKey: { + BSONObjBuilder m; + m.append( "$maxElement", 1 ); + b.append( e.fieldName(), m.done() ); + break; + } + default: + b.append( e ); + } + } + return b.obj(); + } + + BSONObj BSONObj::replaceFieldNames( const BSONObj &names ) const { + BSONObjBuilder b; + BSONObjIterator i( *this ); + BSONObjIterator j( names ); + BSONElement f = j.moreWithEOO() ? 
j.next() : BSONObj().firstElement(); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + if ( !f.eoo() ) { + b.appendAs( e, f.fieldName() ); + f = j.next(); + } + else { + b.append( e ); + } + } + return b.obj(); + } + + bool BSONObj::okForStorage() const { + BSONObjIterator i( *this ); + while ( i.more() ) { + BSONElement e = i.next(); + const char * name = e.fieldName(); + + if ( strchr( name , '.' ) || + strchr( name , '$' ) ) { + return + strcmp( name , "$ref" ) == 0 || + strcmp( name , "$id" ) == 0 + ; + } + + if ( e.mayEncapsulate() ) { + switch ( e.type() ) { + case Object: + case Array: + if ( ! e.embeddedObject().okForStorage() ) + return false; + break; + case CodeWScope: + if ( ! e.codeWScopeObject().okForStorage() ) + return false; + break; + default: + uassert( 12579, "unhandled cases in BSONObj okForStorage" , 0 ); + } + + } + } + return true; + } + + void BSONObj::dump() const { + out() << hex; + const char *p = objdata(); + for ( int i = 0; i < objsize(); i++ ) { + out() << i << '\t' << ( 0xff & ( (unsigned) *p ) ); + if ( *p >= 'A' && *p <= 'z' ) + out() << '\t' << *p; + out() << endl; + p++; + } + } + + void nested2dotted(BSONObjBuilder& b, const BSONObj& obj, const string& base) { + BSONObjIterator it(obj); + while (it.more()) { + BSONElement e = it.next(); + if (e.type() == Object) { + string newbase = base + e.fieldName() + "."; + nested2dotted(b, e.embeddedObject(), newbase); + } + else { + string newbase = base + e.fieldName(); + b.appendAs(e, newbase); + } + } + } + + void dotted2nested(BSONObjBuilder& b, const BSONObj& obj) { + //use map to sort fields + BSONMap sorted = bson2map(obj); + EmbeddedBuilder eb(&b); + for(BSONMap::const_iterator it=sorted.begin(); it!=sorted.end(); ++it) { + eb.appendAs(it->second, it->first); + } + eb.done(); + } + + /*-- test things ----------------------------------------------------*/ + +#pragma pack(1) + struct MaxKeyData { + MaxKeyData() { + totsize=7; + maxkey=MaxKey; + name=0; + eoo=EOO; + } + int totsize; + char maxkey; + char name; + char eoo; + } maxkeydata; + BSONObj maxKey((const char *) &maxkeydata); + + struct MinKeyData { + MinKeyData() { + totsize=7; + minkey=MinKey; + name=0; + eoo=EOO; + } + int totsize; + char minkey; + char name; + char eoo; + } minkeydata; + BSONObj minKey((const char *) &minkeydata); + + /* + struct JSObj0 { + JSObj0() { + totsize = 5; + eoo = EOO; + } + int totsize; + char eoo; + } js0; + */ +#pragma pack() + + struct BsonUnitTest : public UnitTest { + void testRegex() { + + BSONObjBuilder b; + b.appendRegex("x", "foo"); + BSONObj o = b.done(); + + BSONObjBuilder c; + c.appendRegex("x", "goo"); + BSONObj p = c.done(); + + assert( !o.binaryEqual( p ) ); + assert( o.woCompare( p ) < 0 ); + + } + void testoid() { + OID id; + id.init(); + // sleepsecs(3); + + OID b; + // goes with sleep above... 
+ // b.init(); + // assert( memcmp(id.getData(), b.getData(), 12) < 0 ); + + b.init( id.str() ); + assert( b == id ); + } + + void testbounds() { + BSONObj l , r; + { + BSONObjBuilder b; + b.append( "x" , numeric_limits<long long>::max() ); + l = b.obj(); + } + { + BSONObjBuilder b; + b.append( "x" , numeric_limits<double>::max() ); + r = b.obj(); + } + assert( l.woCompare( r ) < 0 ); + assert( r.woCompare( l ) > 0 ); + { + BSONObjBuilder b; + b.append( "x" , numeric_limits<int>::max() ); + l = b.obj(); + } + assert( l.woCompare( r ) < 0 ); + assert( r.woCompare( l ) > 0 ); + } + + void testorder() { + { + BSONObj x,y,z; + { BSONObjBuilder b; b.append( "x" , (long long)2 ); x = b.obj(); } + { BSONObjBuilder b; b.append( "x" , (int)3 ); y = b.obj(); } + { BSONObjBuilder b; b.append( "x" , (long long)4 ); z = b.obj(); } + assert( x.woCompare( y ) < 0 ); + assert( x.woCompare( z ) < 0 ); + assert( y.woCompare( x ) > 0 ); + assert( z.woCompare( x ) > 0 ); + assert( y.woCompare( z ) < 0 ); + assert( z.woCompare( y ) > 0 ); + } + + { + BSONObj ll,d,i,n,u; + { BSONObjBuilder b; b.append( "x" , (long long)2 ); ll = b.obj(); } + { BSONObjBuilder b; b.append( "x" , (double)2 ); d = b.obj(); } + { BSONObjBuilder b; b.append( "x" , (int)2 ); i = b.obj(); } + { BSONObjBuilder b; b.appendNull( "x" ); n = b.obj(); } + { BSONObjBuilder b; u = b.obj(); } + + assert( ll.woCompare( u ) == d.woCompare( u ) ); + assert( ll.woCompare( u ) == i.woCompare( u ) ); + BSONObj k = BSON( "x" << 1 ); + assert( ll.woCompare( u , k ) == d.woCompare( u , k ) ); + assert( ll.woCompare( u , k ) == i.woCompare( u , k ) ); + + assert( u.woCompare( ll ) == u.woCompare( d ) ); + assert( u.woCompare( ll ) == u.woCompare( i ) ); + assert( u.woCompare( ll , k ) == u.woCompare( d , k ) ); + assert( u.woCompare( ll , k ) == u.woCompare( d , k ) ); + + assert( i.woCompare( n ) == d.woCompare( n ) ); + + assert( ll.woCompare( n ) == d.woCompare( n ) ); + assert( ll.woCompare( n ) == i.woCompare( n ) ); + assert( ll.woCompare( n , k ) == d.woCompare( n , k ) ); + assert( ll.woCompare( n , k ) == i.woCompare( n , k ) ); + + assert( n.woCompare( ll ) == n.woCompare( d ) ); + assert( n.woCompare( ll ) == n.woCompare( i ) ); + assert( n.woCompare( ll , k ) == n.woCompare( d , k ) ); + assert( n.woCompare( ll , k ) == n.woCompare( d , k ) ); + } + + { + BSONObj l,r; + { BSONObjBuilder b; b.append( "x" , "eliot" ); l = b.obj(); } + { BSONObjBuilder b; b.appendSymbol( "x" , "eliot" ); r = b.obj(); } + assert( l.woCompare( r ) == 0 ); + assert( r.woCompare( l ) == 0 ); + } + } + + void run() { + testRegex(); + BSONObjBuilder A,B,C; + A.append("x", 2); + B.append("x", 2.0); + C.append("x", 2.1); + BSONObj a = A.done(); + BSONObj b = B.done(); + BSONObj c = C.done(); + assert( !a.binaryEqual( b ) ); // comments on operator== + int cmp = a.woCompare(b); + assert( cmp == 0 ); + cmp = a.woCompare(c); + assert( cmp < 0 ); + testoid(); + testbounds(); + testorder(); + } + } bson_unittest; + + Labeler::Label GT( "$gt" ); + Labeler::Label GTE( "$gte" ); + Labeler::Label LT( "$lt" ); + Labeler::Label LTE( "$lte" ); + Labeler::Label NE( "$ne" ); + Labeler::Label SIZE( "$size" ); + + void BSONObjBuilder::appendMinForType( const StringData& fieldName , int t ) { + switch ( t ) { + + // Shared canonical types + case NumberInt: + case NumberDouble: + case NumberLong: + append( fieldName , - numeric_limits<double>::max() ); return; + case Symbol: + case String: + append( fieldName , "" ); return; + case Date: + // min varies with V0 and V1 indexes, so we go 
one type lower. + appendBool(fieldName, true); + //appendDate( fieldName , numeric_limits<long long>::min() ); + return; + case Timestamp: // TODO integrate with Date SERVER-3304 + appendTimestamp( fieldName , 0 ); return; + case Undefined: // shared with EOO + appendUndefined( fieldName ); return; + + // Separate canonical types + case MinKey: + appendMinKey( fieldName ); return; + case MaxKey: + appendMaxKey( fieldName ); return; + case jstOID: { + OID o; + memset(&o, 0, sizeof(o)); + appendOID( fieldName , &o); + return; + } + case Bool: + appendBool( fieldName , false); return; + case jstNULL: + appendNull( fieldName ); return; + case Object: + append( fieldName , BSONObj() ); return; + case Array: + appendArray( fieldName , BSONObj() ); return; + case BinData: + appendBinData( fieldName , 0 , BinDataGeneral , (const char *) 0 ); return; + case RegEx: + appendRegex( fieldName , "" ); return; + case DBRef: { + OID o; + memset(&o, 0, sizeof(o)); + appendDBRef( fieldName , "" , o ); + return; + } + case Code: + appendCode( fieldName , "" ); return; + case CodeWScope: + appendCodeWScope( fieldName , "" , BSONObj() ); return; + }; + log() << "type not supported for appendMinElementForType: " << t << endl; + uassert( 10061 , "type not supported for appendMinElementForType" , false ); + } + + void BSONObjBuilder::appendMaxForType( const StringData& fieldName , int t ) { + switch ( t ) { + + // Shared canonical types + case NumberInt: + case NumberDouble: + case NumberLong: + append( fieldName , numeric_limits<double>::max() ); return; + case Symbol: + case String: + appendMinForType( fieldName, Object ); return; + case Date: + appendDate( fieldName , numeric_limits<long long>::max() ); return; + case Timestamp: // TODO integrate with Date SERVER-3304 + appendTimestamp( fieldName , numeric_limits<unsigned long long>::max() ); return; + case Undefined: // shared with EOO + appendUndefined( fieldName ); return; + + // Separate canonical types + case MinKey: + appendMinKey( fieldName ); return; + case MaxKey: + appendMaxKey( fieldName ); return; + case jstOID: { + OID o; + memset(&o, 0xFF, sizeof(o)); + appendOID( fieldName , &o); + return; + } + case Bool: + appendBool( fieldName , true ); return; + case jstNULL: + appendNull( fieldName ); return; + case Object: + appendMinForType( fieldName, Array ); return; + case Array: + appendMinForType( fieldName, BinData ); return; + case BinData: + appendMinForType( fieldName, jstOID ); return; + case RegEx: + appendMinForType( fieldName, DBRef ); return; + case DBRef: + appendMinForType( fieldName, Code ); return; + case Code: + appendMinForType( fieldName, CodeWScope ); return; + case CodeWScope: + // This upper bound may change if a new bson type is added. 
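+            // (illustration of the chaining above: CodeWScope is the highest
+            // canonical type below MaxKey, so its upper bound is MaxKey's minimum)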
+ appendMinForType( fieldName , MaxKey ); return; + } + log() << "type not supported for appendMaxElementForType: " << t << endl; + uassert( 14853 , "type not supported for appendMaxElementForType" , false ); + } + + int BSONElementFieldSorter( const void * a , const void * b ) { + const char * x = *((const char**)a); + const char * y = *((const char**)b); + x++; y++; + return lexNumCmp( x , y ); + } + + bool fieldsMatch(const BSONObj& lhs, const BSONObj& rhs) { + BSONObjIterator l(lhs); + BSONObjIterator r(rhs); + + while (l.more() && r.more()){ + if (strcmp(l.next().fieldName(), r.next().fieldName())) { + return false; + } + } + + return !(l.more() || r.more()); // false if lhs and rhs have diff nFields() + } + + BSONObjIteratorSorted::BSONObjIteratorSorted( const BSONObj& o ) { + _nfields = o.nFields(); + _fields = new const char*[_nfields]; + int x = 0; + BSONObjIterator i( o ); + while ( i.more() ) { + _fields[x++] = i.next().rawdata(); + assert( _fields[x-1] ); + } + assert( x == _nfields ); + qsort( _fields , _nfields , sizeof(char*) , BSONElementFieldSorter ); + _cur = 0; + } + + bool BSONObjBuilder::appendAsNumber( const StringData& fieldName , const string& data ) { + if ( data.size() == 0 || data == "-" || data == ".") + return false; + + unsigned int pos=0; + if ( data[0] == '-' ) + pos++; + + bool hasDec = false; + + for ( ; pos<data.size(); pos++ ) { + if ( isdigit(data[pos]) ) + continue; + + if ( data[pos] == '.' ) { + if ( hasDec ) + return false; + hasDec = true; + continue; + } + + return false; + } + + if ( hasDec ) { + double d = atof( data.c_str() ); + append( fieldName , d ); + return true; + } + + if ( data.size() < 8 ) { + append( fieldName , atoi( data.c_str() ) ); + return true; + } + + try { + long long num = boost::lexical_cast<long long>( data ); + append( fieldName , num ); + return true; + } + catch(bad_lexical_cast &) { + return false; + } + } + +} // namespace mongo diff --git a/src/mongo/db/jsobj.h b/src/mongo/db/jsobj.h new file mode 100644 index 00000000000..ae039529fbf --- /dev/null +++ b/src/mongo/db/jsobj.h @@ -0,0 +1,47 @@ +/** @file jsobj.h + BSON classes +*/ + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + BSONObj and its helpers + + "BSON" stands for "binary JSON" -- ie a binary way to represent objects that would be + represented in JSON (plus a few extensions useful for databases & other languages). 
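+
+   As a quick illustration, the document { "hello" : "world" } encodes to the
+   22 byte sequence
+       \x16\x00\x00\x00  \x02 hello\x00  \x06\x00\x00\x00 world\x00  \x00
+   (total size, element type, field name, string size, string data, and a
+   terminating null for the whole object).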
+
+   http://www.bsonspec.org/
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "../bson/util/builder.h"
+#include "../util/optime.h"
+//#include "boost/utility.hpp"
+//#include <set>
+#include "../bson/bsontypes.h"
+#include "../bson/oid.h"
+#include "../bson/bsonelement.h"
+#include "../bson/bsonobj.h"
+#include "../bson/bsonmisc.h"
+#include "../bson/bsonobjbuilder.h"
+#include "../bson/bsonobjiterator.h"
+#include "../bson/bson-inl.h"
+#include "../bson/ordering.h"
+#include "../bson/stringdata.h"
+#include "../bson/bson_db.h"
+
diff --git a/src/mongo/db/jsobjmanipulator.h b/src/mongo/db/jsobjmanipulator.h
new file mode 100644
index 00000000000..860e575940e
--- /dev/null
+++ b/src/mongo/db/jsobjmanipulator.h
@@ -0,0 +1,94 @@
+/** jsobjManipulator.h */
+
+/**
+ * Copyright (C) 2009 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+//#include "dur.h"
+
+namespace mongo {
+
+    /** Manipulate the binary representation of a BSONElement in-place.
+        Careful, this casts away const.
+    */
+    class BSONElementManipulator {
+    public:
+        BSONElementManipulator( const BSONElement &element ) :
+            _element( element ) {
+            assert( !_element.eoo() );
+        }
+        /** Replace a Timestamp type with a Date type initialized to
+            OpTime::now().asDate()
+        */
+        void initTimestamp();
+
+        // Note the ones with a capital letter call getDur().writing and journal
+
+        /** Change the value, in place, of the number. */
+        void setNumber(double d) {
+            if ( _element.type() == NumberDouble ) *reinterpret_cast< double * >( value() ) = d;
+            else if ( _element.type() == NumberInt ) *reinterpret_cast< int * >( value() ) = (int) d;
+            else assert(0);
+        }
+        void SetNumber(double d);
+        void setLong(long long n) {
+            assert( _element.type() == NumberLong );
+            *reinterpret_cast< long long * >( value() ) = n;
+        }
+        void SetLong(long long n);
+        void setInt(int n) {
+            assert( _element.type() == NumberInt );
+            *reinterpret_cast< int * >( value() ) = n;
+        }
+        void SetInt(int n);
+
+        /** Replace the type and value of the element with the type and value of e,
+            preserving the original fieldName */
+        void replaceTypeAndValue( const BSONElement &e ) {
+            *data() = e.type();
+            memcpy( value(), e.value(), e.valuesize() );
+        }
+
+        /* dur:: version */
+        void ReplaceTypeAndValue( const BSONElement &e );
+
+        static void lookForTimestamps( const BSONObj& obj ) {
+            // If we have a Timestamp field as the first or second element,
+            // update it to a Date field set to OpTime::now().asDate(). The
+            // replacement policy is a work in progress.
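+            // e.g. for { ts : Timestamp(0, 0), h : 1 } the ts element is located
+            // by the scan below and rewritten in place via initTimestamp().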
+ + BSONObjIterator i( obj ); + for( int j = 0; i.moreWithEOO() && j < 2; ++j ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + if ( e.type() == Timestamp ) { + BSONElementManipulator( e ).initTimestamp(); + break; + } + } + } + private: + char *data() { return nonConst( _element.rawdata() ); } + char *value() { return nonConst( _element.value() ); } + static char *nonConst( const char *s ) { return const_cast< char * >( s ); } + + const BSONElement _element; + }; + +} // namespace mongo diff --git a/src/mongo/db/json.cpp b/src/mongo/db/json.cpp new file mode 100644 index 00000000000..73457a2bfbb --- /dev/null +++ b/src/mongo/db/json.cpp @@ -0,0 +1,651 @@ +// json.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" + +#define BOOST_SPIRIT_THREADSAFE +#if BOOST_VERSION >= 103800 +#define BOOST_SPIRIT_USE_OLD_NAMESPACE +#include <boost/spirit/include/classic_core.hpp> +#include <boost/spirit/include/classic_loops.hpp> +#include <boost/spirit/include/classic_lists.hpp> +#else +#include <boost/spirit/core.hpp> +#include <boost/spirit/utility/loops.hpp> +#include <boost/spirit/utility/lists.hpp> +#endif +#undef assert +#define assert MONGO_assert + +#include "json.h" +#include "../bson/util/builder.h" +#include "../util/base64.h" +#include "../util/hex.h" + + +using namespace boost::spirit; + +namespace mongo { + + struct ObjectBuilder : boost::noncopyable { + ~ObjectBuilder() { + unsigned i = builders.size(); + if ( i ) { + i--; + for ( ; i>=1; i-- ) { + if ( builders[i] ) { + builders[i]->done(); + } + } + } + } + BSONObjBuilder *back() { + return builders.back().get(); + } + // Storage for field names of elements within builders.back(). + const char *fieldName() { + return fieldNames.back().c_str(); + } + bool empty() const { + return builders.size() == 0; + } + void init() { + boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() ); + builders.push_back( b ); + fieldNames.push_back( "" ); + indexes.push_back( 0 ); + } + void pushObject( const char *fieldName ) { + boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subobjStart( fieldName ) ) ); + builders.push_back( b ); + fieldNames.push_back( "" ); + indexes.push_back( 0 ); + } + void pushArray( const char *fieldName ) { + boost::shared_ptr< BSONObjBuilder > b( new BSONObjBuilder( builders.back()->subarrayStart( fieldName ) ) ); + builders.push_back( b ); + fieldNames.push_back( "" ); + indexes.push_back( 0 ); + } + BSONObj pop() { + BSONObj ret; + if ( back()->owned() ) + ret = back()->obj(); + else + ret = back()->done(); + builders.pop_back(); + fieldNames.pop_back(); + indexes.pop_back(); + return ret; + } + void nameFromIndex() { + fieldNames.back() = BSONObjBuilder::numStr( indexes.back() ); + } + string popString() { + string ret = ss.str(); + ss.str( "" ); + return ret; + } + // Cannot use auto_ptr because its copy constructor takes a non const reference. 
+ vector< boost::shared_ptr< BSONObjBuilder > > builders; + vector< string > fieldNames; + vector< int > indexes; + stringstream ss; + string ns; + OID oid; + string binData; + BinDataType binDataType; + string regex; + string regexOptions; + Date_t date; + OpTime timestamp; + }; + + struct objectStart { + objectStart( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char &c ) const { + if ( b.empty() ) + b.init(); + else + b.pushObject( b.fieldName() ); + } + ObjectBuilder &b; + }; + + struct arrayStart { + arrayStart( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char &c ) const { + b.pushArray( b.fieldName() ); + b.nameFromIndex(); + } + ObjectBuilder &b; + }; + + struct arrayNext { + arrayNext( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char &c ) const { + ++b.indexes.back(); + b.nameFromIndex(); + } + ObjectBuilder &b; + }; + + struct ch { + ch( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char c ) const { + b.ss << c; + } + ObjectBuilder &b; + }; + + struct chE { + chE( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char c ) const { + char o = '\0'; + switch ( c ) { + case '\"': + o = '\"'; + break; + case '\'': + o = '\''; + break; + case '\\': + o = '\\'; + break; + case '/': + o = '/'; + break; + case 'b': + o = '\b'; + break; + case 'f': + o = '\f'; + break; + case 'n': + o = '\n'; + break; + case 'r': + o = '\r'; + break; + case 't': + o = '\t'; + break; + case 'v': + o = '\v'; + break; + default: + assert( false ); + } + b.ss << o; + } + ObjectBuilder &b; + }; + + struct chU { + chU( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + unsigned char first = fromHex( start ); + unsigned char second = fromHex( start + 2 ); + if ( first == 0 && second < 0x80 ) + b.ss << second; + else if ( first < 0x08 ) { + b.ss << char( 0xc0 | ( ( first << 2 ) | ( second >> 6 ) ) ); + b.ss << char( 0x80 | ( ~0xc0 & second ) ); + } + else { + b.ss << char( 0xe0 | ( first >> 4 ) ); + b.ss << char( 0x80 | ( ~0xc0 & ( ( first << 2 ) | ( second >> 6 ) ) ) ); + b.ss << char( 0x80 | ( ~0xc0 & second ) ); + } + } + ObjectBuilder &b; + }; + + struct chClear { + chClear( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char c ) const { + b.popString(); + } + ObjectBuilder &b; + }; + + struct fieldNameEnd { + fieldNameEnd( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + string name = b.popString(); + massert( 10338 , "Invalid use of reserved field name: " + name, + name != "$oid" && + name != "$binary" && + name != "$type" && + name != "$date" && + name != "$timestamp" && + name != "$regex" && + name != "$options" ); + b.fieldNames.back() = name; + } + ObjectBuilder &b; + }; + + struct unquotedFieldNameEnd { + unquotedFieldNameEnd( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + string name( start, end ); + b.fieldNames.back() = name; + } + ObjectBuilder &b; + }; + + struct stringEnd { + stringEnd( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.back()->append( b.fieldName(), b.popString() ); + } + ObjectBuilder &b; + }; + + struct numberValue { + numberValue( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + string raw(start); + double val; + + // strtod isn't able to deal with NaN and inf in a portable way. + // Correspondingly, we perform the conversions explicitly. + + if ( ! 
raw.compare(0, 3, "NaN" ) ) { + val = std::numeric_limits<double>::quiet_NaN(); + } + else if ( ! raw.compare(0, 8, "Infinity" ) ) { + val = std::numeric_limits<double>::infinity(); + } + else if ( ! raw.compare(0, 9, "-Infinity" ) ) { + val = -std::numeric_limits<double>::infinity(); + } + else { + // We re-parse the numeric string here because spirit parsing of strings + // to doubles produces different results from strtod in some cases and + // we want to use strtod to ensure consistency with other string to + // double conversions in our code. + + val = strtod( start, 0 ); + } + + b.back()->append( b.fieldName(), val ); + } + ObjectBuilder &b; + }; + + struct intValue { + intValue( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( long long num ) const { + if (num >= numeric_limits<int>::min() && num <= numeric_limits<int>::max()) + b.back()->append( b.fieldName(), (int)num ); + else + b.back()->append( b.fieldName(), num ); + } + ObjectBuilder &b; + }; + + struct subobjectEnd { + subobjectEnd( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.pop(); + } + ObjectBuilder &b; + }; + + struct arrayEnd { + arrayEnd( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.pop(); + } + ObjectBuilder &b; + }; + + struct trueValue { + trueValue( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.back()->appendBool( b.fieldName(), true ); + } + ObjectBuilder &b; + }; + + struct falseValue { + falseValue( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.back()->appendBool( b.fieldName(), false ); + } + ObjectBuilder &b; + }; + + struct nullValue { + nullValue( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.back()->appendNull( b.fieldName() ); + } + ObjectBuilder &b; + }; + + struct undefinedValue { + undefinedValue( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.back()->appendUndefined( b.fieldName() ); + } + ObjectBuilder &b; + }; + + struct dbrefNS { + dbrefNS( ObjectBuilder &_b ) : b( _b ) {} + void operator() ( const char *start, const char *end ) const { + b.ns = b.popString(); + } + ObjectBuilder &b; + }; + +// NOTE s must be 24 characters. 
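+// e.g. stringToOid( "4f0e5d2c891a2b0001000000" ) reads exactly 12 two-digit
+// hex bytes; no length or character validation is performed here.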
+    OID stringToOid( const char *s ) {
+        OID oid;
+        char *oidP = (char *)( &oid );
+        for ( int i = 0; i < 12; ++i )
+            oidP[ i ] = fromHex( s + ( i * 2 ) );
+        return oid;
+    }
+
+    struct oidValue {
+        oidValue( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.oid = stringToOid( start );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct dbrefEnd {
+        dbrefEnd( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.back()->appendDBRef( b.fieldName(), b.ns, b.oid );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct oidEnd {
+        oidEnd( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.back()->appendOID( b.fieldName(), &b.oid );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct timestampEnd {
+        timestampEnd( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.back()->appendTimestamp( b.fieldName(), b.timestamp.asDate() );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct binDataBinary {
+        binDataBinary( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            massert( 10339 , "Badly formatted bindata", ( end - start ) % 4 == 0 );
+            string encoded( start, end );
+            b.binData = base64::decode( encoded );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct binDataType {
+        binDataType( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.binDataType = BinDataType( fromHex( start ) );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct binDataEnd {
+        binDataEnd( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.back()->appendBinData( b.fieldName(), b.binData.length(),
+                                     b.binDataType, b.binData.data() );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct timestampSecs {
+        timestampSecs( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( unsigned long long x) const {
+            b.timestamp = OpTime( (unsigned) (x/1000) , 0);
+        }
+        ObjectBuilder &b;
+    };
+
+    struct timestampInc {
+        timestampInc( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( unsigned x) const {
+            b.timestamp = OpTime(b.timestamp.getSecs(), x);
+        }
+        ObjectBuilder &b;
+    };
+
+    struct dateValue {
+        dateValue( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( Date_t v ) const {
+            b.date = v;
+        }
+        ObjectBuilder &b;
+    };
+
+    struct dateEnd {
+        dateEnd( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.back()->appendDate( b.fieldName(), b.date );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct regexValue {
+        regexValue( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.regex = b.popString();
+        }
+        ObjectBuilder &b;
+    };
+
+    struct regexOptions {
+        regexOptions( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.regexOptions = string( start, end );
+        }
+        ObjectBuilder &b;
+    };
+
+    struct regexEnd {
+        regexEnd( ObjectBuilder &_b ) : b( _b ) {}
+        void operator() ( const char *start, const char *end ) const {
+            b.back()->appendRegex( b.fieldName(), b.regex, b.regexOptions );
+        }
+        ObjectBuilder &b;
+    };
+
+// One gotcha with this parsing library is probably best illustrated with an
+// example.
Say we have a production like this: +// z = ( ch_p( 'a' )[ foo ] >> ch_p( 'b' ) ) | ( ch_p( 'a' )[ foo ] >> ch_p( 'c' ) ); +// On input "ac", action foo() will be called twice -- once as the parser tries +// to match "ab", again as the parser successfully matches "ac". Sometimes +// the grammar can be modified to eliminate these situations. Here, for example: +// z = ch_p( 'a' )[ foo ] >> ( ch_p( 'b' ) | ch_p( 'c' ) ); +// However, this is not always possible. In my implementation I've tried to +// stick to the following pattern: store fields fed to action callbacks +// temporarily as ObjectBuilder members, then append to a BSONObjBuilder once +// the parser has completely matched a nonterminal and won't backtrack. It's +// worth noting here that this parser follows a short-circuit convention. So, +// in the original z example on line 3, if the input was "ab", foo() would only +// be called once. + struct JsonGrammar : public grammar< JsonGrammar > { + public: + JsonGrammar( ObjectBuilder &_b ) : b( _b ) {} + + template < typename ScannerT > + struct definition { + definition( JsonGrammar const &self ) { + object = ch_p( '{' )[ objectStart( self.b ) ] >> !members >> '}'; + members = list_p((fieldName >> ':' >> value) , ','); + fieldName = + str[ fieldNameEnd( self.b ) ] | + singleQuoteStr[ fieldNameEnd( self.b ) ] | + unquotedFieldName[ unquotedFieldNameEnd( self.b ) ]; + array = ch_p( '[' )[ arrayStart( self.b ) ] >> !elements >> ']'; + elements = list_p(value, ch_p(',')[arrayNext( self.b )]); + value = + str[ stringEnd( self.b ) ] | + number[ numberValue( self.b ) ] | + integer | + array[ arrayEnd( self.b ) ] | + lexeme_d[ str_p( "true" ) ][ trueValue( self.b ) ] | + lexeme_d[ str_p( "false" ) ][ falseValue( self.b ) ] | + lexeme_d[ str_p( "null" ) ][ nullValue( self.b ) ] | + lexeme_d[ str_p( "undefined" ) ][ undefinedValue( self.b ) ] | + singleQuoteStr[ stringEnd( self.b ) ] | + date[ dateEnd( self.b ) ] | + oid[ oidEnd( self.b ) ] | + bindata[ binDataEnd( self.b ) ] | + dbref[ dbrefEnd( self.b ) ] | + timestamp[ timestampEnd( self.b ) ] | + regex[ regexEnd( self.b ) ] | + object[ subobjectEnd( self.b ) ] ; + // NOTE lexeme_d and rules don't mix well, so we have this mess. + // NOTE We use range_p rather than cntrl_p, because the latter is locale dependent. + str = lexeme_d[ ch_p( '"' )[ chClear( self.b ) ] >> + *( ( ch_p( '\\' ) >> + ( + ch_p( 'b' )[ chE( self.b ) ] | + ch_p( 'f' )[ chE( self.b ) ] | + ch_p( 'n' )[ chE( self.b ) ] | + ch_p( 'r' )[ chE( self.b ) ] | + ch_p( 't' )[ chE( self.b ) ] | + ch_p( 'v' )[ chE( self.b ) ] | + ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) | + ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported + ) + ) | + ( ~range_p( 0x00, 0x1f ) & ~ch_p( '"' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '"' ]; + + singleQuoteStr = lexeme_d[ ch_p( '\'' )[ chClear( self.b ) ] >> + *( ( ch_p( '\\' ) >> + ( + ch_p( 'b' )[ chE( self.b ) ] | + ch_p( 'f' )[ chE( self.b ) ] | + ch_p( 'n' )[ chE( self.b ) ] | + ch_p( 'r' )[ chE( self.b ) ] | + ch_p( 't' )[ chE( self.b ) ] | + ch_p( 'v' )[ chE( self.b ) ] | + ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) | + ( ~ch_p('x') & (~range_p('0','9'))[ ch( self.b ) ] ) // hex and octal aren't supported + ) + ) | + ( ~range_p( 0x00, 0x1f ) & ~ch_p( '\'' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> '\'' ]; + + // real_p accepts numbers with nonsignificant zero prefixes, which + // aren't allowed in JSON. Oh well. 
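+                // (so e.g. "007.5" is accepted here and parsed as 7.5)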
+ number = strict_real_p | str_p( "NaN" ) | str_p( "Infinity" ) | str_p( "-Infinity" ); + + static int_parser<long long, 10, 1, numeric_limits<long long>::digits10 + 1> long_long_p; + integer = long_long_p[ intValue(self.b) ]; + + // We allow a subset of valid js identifier names here. + unquotedFieldName = lexeme_d[ ( alpha_p | ch_p( '$' ) | ch_p( '_' ) ) >> *( ( alnum_p | ch_p( '$' ) | ch_p( '_' )) ) ]; + + dbref = dbrefS | dbrefT; + dbrefS = ch_p( '{' ) >> "\"$ref\"" >> ':' >> + str[ dbrefNS( self.b ) ] >> ',' >> "\"$id\"" >> ':' >> quotedOid >> '}'; + dbrefT = str_p( "Dbref" ) >> '(' >> str[ dbrefNS( self.b ) ] >> ',' >> + quotedOid >> ')'; + + timestamp = ch_p( '{' ) >> "\"$timestamp\"" >> ':' >> '{' >> + "\"t\"" >> ':' >> uint_parser<unsigned long long, 10, 1, -1>()[ timestampSecs(self.b) ] >> ',' >> + "\"i\"" >> ':' >> uint_parser<unsigned int, 10, 1, -1>()[ timestampInc(self.b) ] >> '}' >>'}'; + + oid = oidS | oidT; + oidS = ch_p( '{' ) >> "\"$oid\"" >> ':' >> quotedOid >> '}'; + oidT = str_p( "ObjectId" ) >> '(' >> quotedOid >> ')'; + + quotedOid = lexeme_d[ '"' >> ( repeat_p( 24 )[ xdigit_p ] )[ oidValue( self.b ) ] >> '"' ]; + + bindata = ch_p( '{' ) >> "\"$binary\"" >> ':' >> + lexeme_d[ '"' >> ( *( range_p( 'A', 'Z' ) | range_p( 'a', 'z' ) | range_p( '0', '9' ) | ch_p( '+' ) | ch_p( '/' ) ) >> *ch_p( '=' ) )[ binDataBinary( self.b ) ] >> '"' ] >> ',' >> "\"$type\"" >> ':' >> + lexeme_d[ '"' >> ( repeat_p( 2 )[ xdigit_p ] )[ binDataType( self.b ) ] >> '"' ] >> '}'; + + // TODO: this will need to use a signed parser at some point + date = dateS | dateT; + dateS = ch_p( '{' ) >> "\"$date\"" >> ':' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> '}'; + dateT = !str_p("new") >> str_p( "Date" ) >> '(' >> uint_parser< Date_t >()[ dateValue( self.b ) ] >> ')'; + + regex = regexS | regexT; + regexS = ch_p( '{' ) >> "\"$regex\"" >> ':' >> str[ regexValue( self.b ) ] >> ',' >> "\"$options\"" >> ':' >> lexeme_d[ '"' >> ( *( alpha_p ) )[ regexOptions( self.b ) ] >> '"' ] >> '}'; + // FIXME Obviously it would be nice to unify this with str. 
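+                // regexT matches javascript-style literals such as /^a.b$/i ;
+                // chClear resets the scratch buffer when the opening '/' is seen.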
+                regexT = lexeme_d[ ch_p( '/' )[ chClear( self.b ) ]
+                    >> *( ( ch_p( '\\' ) >>
+                            ( ch_p( '"' )[ chE( self.b ) ] |
+                              ch_p( '\\' )[ chE( self.b ) ] |
+                              ch_p( '/' )[ chE( self.b ) ] |
+                              ch_p( 'b' )[ chE( self.b ) ] |
+                              ch_p( 'f' )[ chE( self.b ) ] |
+                              ch_p( 'n' )[ chE( self.b ) ] |
+                              ch_p( 'r' )[ chE( self.b ) ] |
+                              ch_p( 't' )[ chE( self.b ) ] |
+                              ( ch_p( 'u' ) >> ( repeat_p( 4 )[ xdigit_p ][ chU( self.b ) ] ) ) ) ) |
+                          ( ~range_p( 0x00, 0x1f ) & ~ch_p( '/' ) & ( ~ch_p( '\\' ) )[ ch( self.b ) ] ) ) >> str_p( "/" )[ regexValue( self.b ) ]
+                    >> ( *( ch_p( 'i' ) | ch_p( 'g' ) | ch_p( 'm' ) ) )[ regexOptions( self.b ) ] ];
+            }
+            rule< ScannerT > object, members, array, elements, value, str, number, integer,
+                 dbref, dbrefS, dbrefT, timestamp, timestampS, timestampT, oid, oidS, oidT,
+                 bindata, date, dateS, dateT, regex, regexS, regexT, quotedOid, fieldName,
+                 unquotedFieldName, singleQuoteStr;
+            const rule< ScannerT > &start() const {
+                return object;
+            }
+        };
+        ObjectBuilder &b;
+    };
+
+    BSONObj fromjson( const char *str , int* len) {
+        if ( str[0] == '\0' ) {
+            if (len) *len = 0;
+            return BSONObj();
+        }
+
+        ObjectBuilder b;
+        JsonGrammar parser( b );
+        parse_info<> result = parse( str, parser, space_p );
+        if (len) {
+            *len = result.stop - str;
+        }
+        else if ( !result.full ) {
+            int limit = strnlen(result.stop , 10);
+            if (limit == -1) limit = 10;
+            msgasserted(10340, "Failure parsing JSON string near: " + string( result.stop, limit ));
+        }
+        BSONObj ret = b.pop();
+        assert( b.empty() );
+        return ret;
+    }
+
+    BSONObj fromjson( const string &str ) {
+        return fromjson( str.c_str() );
+    }
+
+} // namespace mongo
diff --git a/src/mongo/db/json.h b/src/mongo/db/json.h
new file mode 100644
index 00000000000..68dae042574
--- /dev/null
+++ b/src/mongo/db/json.h
@@ -0,0 +1,41 @@
+/** @file json.h */
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+    /** Create a BSONObj from a JSON <http://www.json.org> string. In addition
+        to the JSON extensions described here
+        <http://mongodb.onconfluence.com/display/DOCS/Mongo+Extended+JSON>,
+        this function accepts certain unquoted field names and allows single quotes
+        to optionally be used when specifying field names and string values instead
+        of double quotes. JSON unicode escape sequences (of the form \uXXXX) are
+        converted to utf8.
+        \throws MsgAssertionException if parsing fails. The message included with
+        this assertion includes a rough indication of where parsing failed.
+    */
+    BSONObj fromjson(const string &str);
+
+    /** len will be size of JSON object in text chars. */
+    BSONObj fromjson(const char *str, int* len=NULL);
+
+} // namespace mongo
diff --git a/src/mongo/db/key.cpp b/src/mongo/db/key.cpp
new file mode 100644
index 00000000000..47449986d21
--- /dev/null
+++ b/src/mongo/db/key.cpp
@@ -0,0 +1,678 @@
+// @file key.cpp
+
+/**
+* Copyright (C) 2011 10gen Inc.
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "key.h" +#include "../util/unittest.h" + +namespace mongo { + + extern const Ordering nullOrdering = Ordering::make(BSONObj()); + + // KeyBson is for V0 (version #0) indexes + + int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o); + + // "old" = pre signed dates & such; i.e. btree V0 + /* must be same canon type when called */ + int oldCompareElementValues(const BSONElement& l, const BSONElement& r) { + dassert( l.canonicalType() == r.canonicalType() ); + int f; + double x; + + switch ( l.type() ) { + case EOO: + case Undefined: // EOO and Undefined are same canonicalType + case jstNULL: + case MaxKey: + case MinKey: + return 0; + case Bool: + return *l.value() - *r.value(); + case Timestamp: + case Date: + // unsigned dates for old version + if ( l.date() < r.date() ) + return -1; + return l.date() == r.date() ? 0 : 1; + case NumberLong: + if( r.type() == NumberLong ) { + long long L = l._numberLong(); + long long R = r._numberLong(); + if( L < R ) return -1; + if( L == R ) return 0; + return 1; + } + // else fall through + case NumberInt: + case NumberDouble: { + double left = l.number(); + double right = r.number(); + bool lNan = !( left <= numeric_limits< double >::max() && + left >= -numeric_limits< double >::max() ); + bool rNan = !( right <= numeric_limits< double >::max() && + right >= -numeric_limits< double >::max() ); + if ( lNan ) { + if ( rNan ) { + return 0; + } + else { + return -1; + } + } + else if ( rNan ) { + return 1; + } + x = left - right; + if ( x < 0 ) return -1; + return x == 0 ? 
0 : 1; + } + case jstOID: + return memcmp(l.value(), r.value(), 12); + case Code: + case Symbol: + case String: + // nulls not allowed in the middle of strings in the old version + return strcmp(l.valuestr(), r.valuestr()); + case Object: + case Array: + return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering); + case DBRef: { + int lsz = l.valuesize(); + int rsz = r.valuesize(); + if ( lsz - rsz != 0 ) return lsz - rsz; + return memcmp(l.value(), r.value(), lsz); + } + case BinData: { + int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte + int rsz = r.objsize(); + if ( lsz - rsz != 0 ) return lsz - rsz; + return memcmp(l.value()+4, r.value()+4, lsz+1); + } + case RegEx: { + int c = strcmp(l.regex(), r.regex()); + if ( c ) + return c; + return strcmp(l.regexFlags(), r.regexFlags()); + } + case CodeWScope : { + f = l.canonicalType() - r.canonicalType(); + if ( f ) + return f; + f = strcmp( l.codeWScopeCode() , r.codeWScopeCode() ); + if ( f ) + return f; + f = strcmp( l.codeWScopeScopeData() , r.codeWScopeScopeData() ); + if ( f ) + return f; + return 0; + } + default: + out() << "oldCompareElementValues: bad type " << (int) l.type() << endl; + assert(false); + } + return -1; + } + + int oldElemCompare(const BSONElement&l , const BSONElement& r) { + int lt = (int) l.canonicalType(); + int rt = (int) r.canonicalType(); + int x = lt - rt; + if( x ) + return x; + return oldCompareElementValues(l, r); + } + + // pre signed dates & such + int oldCompare(const BSONObj& l,const BSONObj& r, const Ordering &o) { + BSONObjIterator i(l); + BSONObjIterator j(r); + unsigned mask = 1; + while ( 1 ) { + // so far, equal... + + BSONElement l = i.next(); + BSONElement r = j.next(); + if ( l.eoo() ) + return r.eoo() ? 0 : -1; + if ( r.eoo() ) + return 1; + + int x; + { + x = oldElemCompare(l, r); + if( o.descending(mask) ) + x = -x; + } + if ( x != 0 ) + return x; + mask <<= 1; + } + return -1; + } + + /* old style compares: + - dates are unsigned + - strings no nulls + */ + int KeyBson::woCompare(const KeyBson& r, const Ordering &o) const { + return oldCompare(_o, r._o, o); + } + + // woEqual could be made faster than woCompare but this is for backward compatibility so not worth a big effort + bool KeyBson::woEqual(const KeyBson& r) const { + return oldCompare(_o, r._o, nullOrdering) == 0; + } + + // [ ][HASMORE][x][y][canontype_4bits] + enum CanonicalsEtc { + cminkey=1, + cnull=2, + cdouble=4, + cstring=6, + cbindata=7, + coid=8, + cfalse=10, + ctrue=11, + cdate=12, + cmaxkey=14, + cCANONTYPEMASK = 0xf, + cY = 0x10, + cint = cY | cdouble, + cX = 0x20, + clong = cX | cdouble, + cHASMORE = 0x40, + cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care + }; + + // bindata bson type + const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value + const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType. + const int BinDataLenMax = 32; + const int BinDataLengthToCode[] = { + 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/, + 0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1, + 0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1, + 0xf0/*32*/ + }; + const int BinDataCodeToLength[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32 + }; + + int binDataCodeToLength(int codeByte) { + return BinDataCodeToLength[codeByte >> 4]; + } + + /** object cannot be represented in compact format. 
so store in traditional bson format + with a leading sentinel byte IsBSON to indicate it's in that format. + + Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here + so that we don't have to do an extra malloc. + */ + void KeyV1Owned::traditional(const BSONObj& obj) { + b.reset(); + b.appendUChar(IsBSON); + b.appendBuf(obj.objdata(), obj.objsize()); + _keyData = (const unsigned char *) b.buf(); + } + + KeyV1Owned::KeyV1Owned(const KeyV1& rhs) { + b.appendBuf( rhs.data(), rhs.dataSize() ); + _keyData = (const unsigned char *) b.buf(); + dassert( b.len() == dataSize() ); // check datasize method is correct + dassert( (*_keyData & cNOTUSED) == 0 ); + } + + // fromBSON to Key format + KeyV1Owned::KeyV1Owned(const BSONObj& obj) { + BSONObj::iterator i(obj); + unsigned char bits = 0; + while( 1 ) { + BSONElement e = i.next(); + if( i.more() ) + bits |= cHASMORE; + switch( e.type() ) { + case MinKey: + b.appendUChar(cminkey|bits); + break; + case jstNULL: + b.appendUChar(cnull|bits); + break; + case MaxKey: + b.appendUChar(cmaxkey|bits); + break; + case Bool: + b.appendUChar( (e.boolean()?ctrue:cfalse) | bits ); + break; + case jstOID: + b.appendUChar(coid|bits); + b.appendBuf(&e.__oid(), sizeof(OID)); + break; + case BinData: + { + int t = e.binDataType(); + // 0-7 and 0x80 to 0x87 are supported by Key + if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) { + int len; + const char * d = e.binData(len); + if( len <= BinDataLenMax ) { + int code = BinDataLengthToCode[len]; + if( code >= 0 ) { + if( t >= 128 ) + t = (t-128) | 0x08; + dassert( (code&t) == 0 ); + b.appendUChar( cbindata|bits ); + b.appendUChar( code | t ); + b.appendBuf(d, len); + break; + } + } + } + traditional(obj); + return; + } + case Date: + b.appendUChar(cdate|bits); + b.appendStruct(e.date()); + break; + case String: + { + b.appendUChar(cstring|bits); + // note we do not store the terminating null, to save space. 
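+                // e.g. the value "abc" is stored as the type byte (cstring|bits),
+                // a length byte 0x03, then 'a' 'b' 'c' -- five bytes in all.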
+ unsigned x = (unsigned) e.valuestrsize() - 1; + if( x > 255 ) { + traditional(obj); + return; + } + b.appendUChar(x); + b.appendBuf(e.valuestr(), x); + break; + } + case NumberInt: + b.appendUChar(cint|bits); + b.appendNum((double) e._numberInt()); + break; + case NumberLong: + { + long long n = e._numberLong(); + long long m = 2LL << 52; + DEV { + long long d = m-1; + assert( ((long long) ((double) -d)) == -d ); + } + if( n >= m || n <= -m ) { + // can't represent exactly as a double + traditional(obj); + return; + } + b.appendUChar(clong|bits); + b.appendNum((double) n); + break; + } + case NumberDouble: + { + double d = e._numberDouble(); + if( isNaN(d) ) { + traditional(obj); + return; + } + b.appendUChar(cdouble|bits); + b.appendNum(d); + break; + } + default: + // if other types involved, store as traditional BSON + traditional(obj); + return; + } + if( !i.more() ) + break; + bits = 0; + } + _keyData = (const unsigned char *) b.buf(); + dassert( b.len() == dataSize() ); // check datasize method is correct + dassert( (*_keyData & cNOTUSED) == 0 ); + } + + BSONObj KeyV1::toBson() const { + assert( _keyData != 0 ); + if( !isCompactFormat() ) + return bson(); + + BSONObjBuilder b(512); + const unsigned char *p = _keyData; + while( 1 ) { + unsigned bits = *p++; + + switch( bits & 0x3f ) { + case cminkey: b.appendMinKey(""); break; + case cnull: b.appendNull(""); break; + case cfalse: b.appendBool("", false); break; + case ctrue: b.appendBool("", true); break; + case cmaxkey: + b.appendMaxKey(""); + break; + case cstring: + { + unsigned sz = *p++; + // we build the element ourself as we have to null terminate it + BufBuilder &bb = b.bb(); + bb.appendNum((char) String); + bb.appendUChar(0); // fieldname "" + bb.appendNum(sz+1); + bb.appendBuf(p, sz); + bb.appendUChar(0); // null char at end of string + p += sz; + break; + } + case coid: + b.appendOID("", (OID *) p); + p += sizeof(OID); + break; + case cbindata: + { + int len = binDataCodeToLength(*p); + int subtype = (*p) & BinDataTypeMask; + if( subtype & 0x8 ) { + subtype = (subtype & 0x7) | 0x80; + } + b.appendBinData("", len, (BinDataType) subtype, ++p); + p += len; + break; + } + case cdate: + b.appendDate("", (Date_t&) *p); + p += 8; + break; + case cdouble: + b.append("", (double&) *p); + p += sizeof(double); + break; + case cint: + b.append("", (int) ((double&) *p)); + p += sizeof(double); + break; + case clong: + b.append("", (long long) ((double&) *p)); + p += sizeof(double); + break; + default: + assert(false); + } + + if( (bits & cHASMORE) == 0 ) + break; + } + return b.obj(); + } + + static int compare(const unsigned char *&l, const unsigned char *&r) { + int lt = (*l & cCANONTYPEMASK); + int rt = (*r & cCANONTYPEMASK); + int x = lt - rt; + if( x ) + return x; + + l++; r++; + + // same type + switch( lt ) { + case cdouble: + { + double L = *((double *) l); + double R = *((double *) r); + if( L < R ) + return -1; + if( L != R ) + return 1; + l += 8; r += 8; + break; + } + case cstring: + { + int lsz = *l; + int rsz = *r; + int common = min(lsz, rsz); + l++; r++; // skip the size byte + // use memcmp as we (will) allow zeros in UTF8 strings + int res = memcmp(l, r, common); + if( res ) + return res; + // longer string is the greater one + int diff = lsz-rsz; + if( diff ) + return diff; + l += lsz; r += lsz; + break; + } + case cbindata: + { + int L = *l; + int R = *r; + int llen = binDataCodeToLength(L); + int diff = L-R; // checks length and subtype simultaneously + if( diff ) { + // unfortunately nibbles are backwards to do 
subtype and len in one check (could bit swap...) + int rlen = binDataCodeToLength(R); + if( llen != rlen ) + return llen - rlen; + return diff; + } + // same length, same type + l++; r++; + int res = memcmp(l, r, llen); + if( res ) + return res; + l += llen; r += llen; + break; + } + case cdate: + { + long long L = *((long long *) l); + long long R = *((long long *) r); + if( L < R ) + return -1; + if( L > R ) + return 1; + l += 8; r += 8; + break; + } + case coid: + { + int res = memcmp(l, r, sizeof(OID)); + if( res ) + return res; + l += 12; r += 12; + break; + } + default: + // all the others are a match -- e.g. null == null + ; + } + + return 0; + } + + // at least one of this and right are traditional BSON format + int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const { + BSONObj L = toBson(); + BSONObj R = right.toBson(); + return L.woCompare(R, order, /*considerfieldname*/false); + } + + int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const { + const unsigned char *l = _keyData; + const unsigned char *r = right._keyData; + + if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained + return compareHybrid(right, order); + + unsigned mask = 1; + while( 1 ) { + char lval = *l; + char rval = *r; + { + int x = compare(l, r); // updates l and r pointers + if( x ) { + if( order.descending(mask) ) + x = -x; + return x; + } + } + + { + int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE)); + if( x ) + return x; + if( (lval & cHASMORE) == 0 ) + break; + } + + mask <<= 1; + } + + return 0; + } + + static unsigned sizes[] = { + 0, + 1, //cminkey=1, + 1, //cnull=2, + 0, + 9, //cdouble=4, + 0, + 0, //cstring=6, + 0, + 13, //coid=8, + 0, + 1, //cfalse=10, + 1, //ctrue=11, + 9, //cdate=12, + 0, + 1, //cmaxkey=14, + 0 + }; + + inline unsigned sizeOfElement(const unsigned char *p) { + unsigned type = *p & cCANONTYPEMASK; + unsigned sz = sizes[type]; + if( sz == 0 ) { + if( type == cstring ) { + sz = ((unsigned) p[1]) + 2; + } + else { + assert( type == cbindata ); + sz = binDataCodeToLength(p[1]) + 2; + } + } + return sz; + } + + int KeyV1::dataSize() const { + const unsigned char *p = _keyData; + if( !isCompactFormat() ) { + return bson().objsize() + 1; + } + + bool more; + do { + unsigned z = sizeOfElement(p); + more = (*p & cHASMORE) != 0; + p += z; + } while( more ); + return p - _keyData; + } + + bool KeyV1::woEqual(const KeyV1& right) const { + const unsigned char *l = _keyData; + const unsigned char *r = right._keyData; + + if( (*l|*r) == IsBSON ) { + return toBson().equal(right.toBson()); + } + + while( 1 ) { + char lval = *l; + char rval = *r; + if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) ) + return false; + l++; r++; + switch( lval&cCANONTYPEMASK ) { + case coid: + if( *((unsigned*) l) != *((unsigned*) r) ) + return false; + l += 4; r += 4; + case cdate: + if( *((unsigned long long *) l) != *((unsigned long long *) r) ) + return false; + l += 8; r += 8; + break; + case cdouble: + if( *((double *) l) != *((double *) r) ) + return false; + l += 8; r += 8; + break; + case cstring: + { + if( *l != *r ) + return false; // not same length + unsigned sz = ((unsigned) *l) + 1; + if( memcmp(l, r, sz) ) + return false; + l += sz; r += sz; + break; + } + case cbindata: + { + if( *l != *r ) + return false; // len or subtype mismatch + int len = binDataCodeToLength(*l) + 1; + if( memcmp(l, r, len) ) + return false; + l += len; r += len; + break; + } + case cminkey: + case cnull: + case cfalse: + case ctrue: + 
case cmaxkey: + break; + default: + assert(false); + } + if( (lval&cHASMORE) == 0 ) + break; + } + return true; + } + + struct CmpUnitTest : public UnitTest { + void run() { + char a[2]; + char b[2]; + a[0] = -3; + a[1] = 0; + b[0] = 3; + b[1] = 0; + assert( strcmp(a,b)>0 && memcmp(a,b,2)>0 ); + } + } cunittest; + +} diff --git a/src/mongo/db/key.h b/src/mongo/db/key.h new file mode 100644 index 00000000000..9284cdc7422 --- /dev/null +++ b/src/mongo/db/key.h @@ -0,0 +1,115 @@ +// @file key.h class(es) representing individual keys in a btree + +/** +* Copyright (C) 2011 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "jsobj.h" + +namespace mongo { + + /** Key class for precomputing a small format index key that is denser than a traditional BSONObj. + + KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes. + + KeyV1 is the new implementation. + */ + class KeyBson /* "KeyV0" */ { + public: + KeyBson() { } + explicit KeyBson(const char *keyData) : _o(keyData) { } + explicit KeyBson(const BSONObj& obj) : _o(obj) { } + int woCompare(const KeyBson& r, const Ordering &o) const; + BSONObj toBson() const { return _o; } + string toString() const { return _o.toString(); } + int dataSize() const { return _o.objsize(); } + const char * data() const { return _o.objdata(); } + BSONElement _firstElement() const { return _o.firstElement(); } + bool isCompactFormat() const { return false; } + bool woEqual(const KeyBson& r) const; + void assign(const KeyBson& rhs) { *this = rhs; } + private: + BSONObj _o; + }; + + class KeyV1Owned; + + // corresponding to BtreeData_V1 + class KeyV1 { + void operator=(const KeyV1&); // disallowed just to make people be careful as we don't own the buffer + KeyV1(const KeyV1Owned&); // disallowed as this is not a great idea as KeyV1Owned likely will go out of scope + public: + KeyV1() { _keyData = 0; } + ~KeyV1() { DEV _keyData = (const unsigned char *) 1; } + + KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) { + dassert( _keyData > (const unsigned char *) 1 ); + } + + // explicit version of operator= to be safe + void assign(const KeyV1& rhs) { + _keyData = rhs._keyData; + } + + /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format. 
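+ The first byte distinguishes the two encodings: compact KeyV1 data
+ never begins with the IsBSON (0xff) marker byte, while a wrapped
+ BSONObj is stored with that marker prepended (see isCompactFormat()
+ and bson() below). A hypothetical usage sketch, for illustration only:
+
+     KeyV1Owned owned( BSON( "" << 3 ) );  // compact form when representable
+     KeyV1 k( owned.data() );              // wraps the same buffer, no copy
+     assert( k.toBson().firstElement().numberInt() == 3 );
+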
+ when BSON, we are just a wrapper + */ + explicit KeyV1(const char *keyData) : _keyData((unsigned char *) keyData) { } + + int woCompare(const KeyV1& r, const Ordering &o) const; + bool woEqual(const KeyV1& r) const; + BSONObj toBson() const; + string toString() const { return toBson().toString(); } + + /** get the key data we want to store in the btree bucket */ + const char * data() const { return (const char *) _keyData; } + + /** @return size of data() */ + int dataSize() const; + + /** only used by geo, which always has bson keys */ + BSONElement _firstElement() const { return bson().firstElement(); } + bool isCompactFormat() const { return *_keyData != IsBSON; } + protected: + enum { IsBSON = 0xff }; + const unsigned char *_keyData; + BSONObj bson() const { + dassert( !isCompactFormat() ); + return BSONObj((const char *) _keyData+1); + } + private: + int compareHybrid(const KeyV1& right, const Ordering& order) const; + }; + + class KeyV1Owned : public KeyV1 { + void operator=(const KeyV1Owned&); + public: + /** @obj a BSON object to be translated to KeyV1 format. If the object isn't + representable in KeyV1 format (which happens, intentionally, at times) + it will stay as bson herein. + */ + KeyV1Owned(const BSONObj& obj); + + /** makes a copy (memcpy's the whole thing) */ + KeyV1Owned(const KeyV1& rhs); + + private: + StackBufBuilder b; + void traditional(const BSONObj& obj); // store as traditional bson not as compact format + }; + +}; diff --git a/src/mongo/db/lasterror.cpp b/src/mongo/db/lasterror.cpp new file mode 100644 index 00000000000..4ed4dfb0571 --- /dev/null +++ b/src/mongo/db/lasterror.cpp @@ -0,0 +1,142 @@ +// lasterror.cpp + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pch.h" + +#include "../util/unittest.h" +#include "../util/net/message.h" + + +#include "lasterror.h" +#include "jsobj.h" + +namespace mongo { + + LastError LastError::noError; + LastErrorHolder lastError; + + bool isShell = false; + void raiseError(int code , const char *msg) { + LastError *le = lastError.get(); + if ( le == 0 ) { + /* might be intentional (non-user thread) */ + DEV { + static unsigned n; + if( ++n < 4 && !isShell ) log() << "dev: lastError==0 won't report:" << msg << endl; + } + } + else if ( le->disabled ) { + log() << "lastError disabled, can't report: " << code << ":" << msg << endl; + } + else { + le->raiseError(code, msg); + } + } + + bool LastError::appendSelf( BSONObjBuilder &b , bool blankErr ) { + if ( !valid ) { + if ( blankErr ) + b.appendNull( "err" ); + b.append( "n", 0 ); + return false; + } + + if ( msg.empty() ) { + if ( blankErr ) { + b.appendNull( "err" ); + } + } + else { + b.append( "err", msg ); + } + + if ( code ) + b.append( "code" , code ); + if ( updatedExisting != NotUpdate ) + b.appendBool( "updatedExisting", updatedExisting == True ); + if ( upsertedId.isSet() ) + b.append( "upserted" , upsertedId ); + if ( writebackId.isSet() ) { + b.append( "writeback" , writebackId ); + b.append( "instanceIdent" , prettyHostName() ); // this can be any unique string + } + b.appendNumber( "n", nObjects ); + + return ! msg.empty(); + } + + LastErrorHolder::~LastErrorHolder() { + } + + + LastError * LastErrorHolder::disableForCommand() { + LastError *le = _get(); + uassert(13649, "no operation yet", le); + le->disabled = true; + le->nPrev--; // caller is a command that shouldn't count as an operation + return le; + } + + LastError * LastErrorHolder::get( bool create ) { + LastError *ret = _get( create ); + if ( ret && !ret->disabled ) + return ret; + return 0; + } + + LastError * LastErrorHolder::_get( bool create ) { + LastError * le = _tl.get(); + if ( ! le && create ) { + le = new LastError(); + _tl.reset( le ); + } + return le; + } + + void LastErrorHolder::release() { + _tl.release(); + } + + /** ok to call more than once. */ + void LastErrorHolder::initThread() { + if( ! _tl.get() ) + _tl.reset( new LastError() ); + } + + void LastErrorHolder::reset( LastError * le ) { + _tl.reset( le ); + } + + void prepareErrForNewRequest( Message &m, LastError * err ) { + // a killCursors message shouldn't affect last error + assert( err ); + if ( m.operation() == dbKillCursors ) { + err->disabled = true; + } + else { + err->disabled = false; + err->nPrev++; + } + } + + LastError * LastErrorHolder::startRequest( Message& m , LastError * le ) { + assert( le ); + prepareErrForNewRequest( m, le ); + return le; + } + +} // namespace mongo diff --git a/src/mongo/db/lasterror.h b/src/mongo/db/lasterror.h new file mode 100644 index 00000000000..86250e496a8 --- /dev/null +++ b/src/mongo/db/lasterror.h @@ -0,0 +1,146 @@ +// lasterror.h + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "../bson/oid.h" + +namespace mongo { + class BSONObjBuilder; + class Message; + + struct LastError { + int code; + string msg; + enum UpdatedExistingType { NotUpdate, True, False } updatedExisting; + OID upsertedId; + OID writebackId; + long long nObjects; + int nPrev; + bool valid; + bool disabled; + void writeback( OID& oid ) { + reset( true ); + writebackId = oid; + } + void raiseError(int _code , const char *_msg) { + reset( true ); + code = _code; + msg = _msg; + } + void recordUpdate( bool _updateObjects , long long _nObjects , OID _upsertedId ) { + reset( true ); + nObjects = _nObjects; + updatedExisting = _updateObjects ? True : False; + if ( _upsertedId.isSet() ) + upsertedId = _upsertedId; + + } + void recordDelete( long long nDeleted ) { + reset( true ); + nObjects = nDeleted; + } + LastError() { + reset(); + } + void reset( bool _valid = false ) { + code = 0; + msg.clear(); + updatedExisting = NotUpdate; + nObjects = 0; + nPrev = 1; + valid = _valid; + disabled = false; + upsertedId.clear(); + writebackId.clear(); + } + + /** + * @return if there is an err + */ + bool appendSelf( BSONObjBuilder &b , bool blankErr = true ); + + struct Disabled : boost::noncopyable { + Disabled( LastError * le ) { + _le = le; + if ( _le ) { + _prev = _le->disabled; + _le->disabled = true; + } + else { + _prev = false; + } + } + + ~Disabled() { + if ( _le ) + _le->disabled = _prev; + } + + LastError * _le; + bool _prev; + }; + + static LastError noError; + }; + + extern class LastErrorHolder { + public: + LastErrorHolder(){} + ~LastErrorHolder(); + + LastError * get( bool create = false ); + LastError * getSafe() { + LastError * le = get(false); + if ( ! le ) { + error() << " no LastError!" << endl; + assert( le ); + } + return le; + } + + LastError * _get( bool create = false ); // may return a disabled LastError + + void reset( LastError * le ); + + /** ok to call more than once. */ + void initThread(); + + int getID(); + + void release(); + + /** when db receives a message/request, call this */ + LastError * startRequest( Message& m , LastError * connectionOwned ); + + void disconnect( int clientId ); + + // used to disable lastError reporting while processing a killCursors message + // disable causes get() to return 0. + LastError *disableForCommand(); // only call once per command invocation! + private: + boost::thread_specific_ptr<LastError> _tl; + + struct Status { + time_t time; + LastError *lerr; + }; + } lastError; + + void raiseError(int code , const char *msg); + +} // namespace mongo diff --git a/src/mongo/db/matcher.cpp b/src/mongo/db/matcher.cpp new file mode 100755 index 00000000000..2631845a757 --- /dev/null +++ b/src/mongo/db/matcher.cpp @@ -0,0 +1,1128 @@ +// matcher.cpp + +/* Matcher is our boolean expression evaluator for "where" clauses */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "matcher.h" +#include "../util/goodies.h" +#include "../util/unittest.h" +#include "diskloc.h" +#include "../scripting/engine.h" +#include "db.h" +#include "queryutil.h" +#include "client.h" + +#include "pdfile.h" + +namespace { + inline pcrecpp::RE_Options flags2options(const char* flags) { + pcrecpp::RE_Options options; + options.set_utf8(true); + while ( flags && *flags ) { + if ( *flags == 'i' ) + options.set_caseless(true); + else if ( *flags == 'm' ) + options.set_multiline(true); + else if ( *flags == 'x' ) + options.set_extended(true); + else if ( *flags == 's' ) + options.set_dotall(true); + flags++; + } + return options; + } +} + +//#define DEBUGMATCHER(x) cout << x << endl; +#define DEBUGMATCHER(x) + +namespace mongo { + + extern BSONObj staticNull; + + class Where { + public: + Where() { + jsScope = 0; + func = 0; + } + ~Where() { + + if ( scope.get() ){ + try { + scope->execSetup( "_mongo.readOnly = false;" , "make not read only" ); + } + catch( DBException& e ){ + warning() << "javascript scope cleanup interrupted" << causedBy( e ) << endl; + } + } + + if ( jsScope ) { + delete jsScope; + jsScope = 0; + } + func = 0; + } + + auto_ptr<Scope> scope; + ScriptingFunction func; + BSONObj *jsScope; + + void setFunc(const char *code) { + massert( 10341 , "scope has to be created first!" , scope.get() ); + func = scope->createFunction( code ); + } + + }; + + Matcher::~Matcher() { + delete _where; + _where = 0; + } + + ElementMatcher::ElementMatcher( BSONElement e , int op, bool isNot ) + : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) { + if ( op == BSONObj::opMOD ) { + BSONObj o = e.embeddedObject(); + _mod = o["0"].numberInt(); + _modm = o["1"].numberInt(); + + uassert( 10073 , "mod can't be 0" , _mod ); + } + else if ( op == BSONObj::opTYPE ) { + _type = (BSONType)(e.numberInt()); + } + else if ( op == BSONObj::opELEM_MATCH ) { + BSONElement m = e; + uassert( 12517 , "$elemMatch needs an Object" , m.type() == Object ); + BSONObj x = m.embeddedObject(); + if ( x.firstElement().getGtLtOp() == 0 ) { + _subMatcher.reset( new Matcher( x ) ); + _subMatcherOnPrimitives = false; + } + else { + // meant to act on primitives + _subMatcher.reset( new Matcher( BSON( "" << x ) ) ); + _subMatcherOnPrimitives = true; + } + } + } + + ElementMatcher::ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot ) + : _toMatch( e ) , _compareOp( op ), _isNot( isNot ), _subMatcherOnPrimitives(false) { + + _myset.reset( new set<BSONElement,element_lt>() ); + + BSONObjIterator i( array ); + while ( i.more() ) { + BSONElement ie = i.next(); + if ( op == BSONObj::opALL && ie.type() == Object && ie.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { + shared_ptr<Matcher> s; + s.reset( new Matcher( ie.embeddedObject().firstElement().embeddedObjectUserCheck() ) ); + _allMatchers.push_back( s ); + } + else if ( ie.type() == RegEx ) { + if ( !_myregex.get() ) { + _myregex.reset( new vector< RegexMatcher >() ); + } + _myregex->push_back( RegexMatcher() ); + RegexMatcher &rm = _myregex->back(); + rm._re.reset( new pcrecpp::RE( ie.regex(), flags2options( ie.regexFlags() ) ) ); + rm._fieldName = 0; // no need for field name + rm._regex = ie.regex(); + rm._flags = ie.regexFlags(); + rm._isNot = false; + bool purePrefix; + string prefix = simpleRegex(rm._regex, rm._flags, &purePrefix); + if (purePrefix) + rm._prefix = prefix; + } + else { + uassert( 15882, "$elemMatch not allowed within $in", + ie.type() != 
Object ||
+ ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH );
+ _myset->insert(ie);
+ }
+ }
+
+ if ( _allMatchers.size() ) {
+ uassert( 13020 , "with $all, can't mix $elemMatch and others" , _myset->size() == 0 && !_myregex.get());
+ }
+
+ }
+
+ int ElementMatcher::inverseOfNegativeCompareOp() const {
+ verify( 15892, negativeCompareOp() );
+ return _compareOp == BSONObj::NE ? BSONObj::Equality : BSONObj::opIN;
+ }
+
+ bool ElementMatcher::negativeCompareOpContainsNull() const {
+ verify( 15893, negativeCompareOp() );
+ return (_compareOp == BSONObj::NE && _toMatch.type() != jstNULL) ||
+ (_compareOp == BSONObj::NIN && _myset->count( staticNull.firstElement()) == 0 );
+ }
+
+ void Matcher::addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot) {
+
+ RegexMatcher rm;
+ rm._re.reset( new pcrecpp::RE(regex, flags2options(flags)) );
+ rm._fieldName = fieldName;
+ rm._regex = regex;
+ rm._flags = flags;
+ rm._isNot = isNot;
+
+ if (!isNot) { //TODO something smarter
+ bool purePrefix;
+ string prefix = simpleRegex(regex, flags, &purePrefix);
+ if (purePrefix)
+ rm._prefix = prefix;
+ }
+ // store the matcher only after the prefix is computed, so the copy kept
+ // in _regexs retains the pure-prefix optimization
+ _regexs.push_back(rm);
+ }
+
+ bool Matcher::addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags ) {
+ const char *fn = fe.fieldName();
+ int op = fe.getGtLtOp( -1 );
+ if ( op == -1 ) {
+ if ( !isNot && fn[1] == 'r' && fn[2] == 'e' && fn[3] == 'f' && fn[4] == 0 ) {
+ return false; // { $ref : xxx } - treat as normal object
+ }
+ uassert( 10068 , (string)"invalid operator: " + fn , op != -1 );
+ }
+
+ switch ( op ) {
+ case BSONObj::GT:
+ case BSONObj::GTE:
+ case BSONObj::LT:
+ case BSONObj::LTE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), op, isNot);
+ break;
+ }
+ case BSONObj::NE: {
+ _haveNeg = true;
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::NE, isNot);
+ break;
+ }
+ case BSONObj::opALL:
+ _all = true;
+ // fall through: $all shares the $in setup below
+ case BSONObj::opIN: {
+ uassert( 13276 , "$in needs an array" , fe.isABSONObj() );
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ BSONObjIterator i( fe.embeddedObject() );
+ while( i.more() ) {
+ if ( i.next().type() == Array ) {
+ _hasArray = true;
+ }
+ }
+ break;
+ }
+ case BSONObj::NIN:
+ uassert( 13277 , "$nin needs an array" , fe.isABSONObj() );
+ _haveNeg = true;
+ _basics.push_back( ElementMatcher( e , op , fe.embeddedObject(), isNot ) );
+ break;
+ case BSONObj::opMOD:
+ case BSONObj::opTYPE:
+ case BSONObj::opELEM_MATCH: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ // these are types where ElementMatcher has all the info
+ _basics.push_back( ElementMatcher( b->done().firstElement() , op, isNot ) );
+ break;
+ }
+ case BSONObj::opSIZE: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opSIZE, isNot);
+ _haveSize = true;
+ break;
+ }
+ case BSONObj::opEXISTS: {
+ shared_ptr< BSONObjBuilder > b( new BSONObjBuilder() );
+ _builders.push_back( b );
+ b->appendAs(fe, e.fieldName());
+ addBasic(b->done().firstElement(), BSONObj::opEXISTS, isNot);
+ break;
+ }
+ case BSONObj::opREGEX: {
+ uassert( 13032, "can't use $not with $regex, use BSON
regex type instead", !isNot );
+ if ( fe.type() == RegEx ) {
+ regex = fe.regex();
+ flags = fe.regexFlags();
+ }
+ else {
+ regex = fe.valuestrsafe();
+ }
+ break;
+ }
+ case BSONObj::opOPTIONS: {
+ uassert( 13029, "can't use $not with $options, use BSON regex type instead", !isNot );
+ flags = fe.valuestrsafe();
+ break;
+ }
+ case BSONObj::opNEAR:
+ case BSONObj::opWITHIN:
+ case BSONObj::opMAX_DISTANCE:
+ break;
+ default:
+ uassert( 10069 , (string)"BUG - can't handle operator: " + fn , 0 );
+ }
+ return true;
+ }
+
+ void Matcher::parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers ) {
+ uassert( 13086, "$and/$or/$nor must be a nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+ BSONObjIterator j( e.embeddedObject() );
+ while( j.more() ) {
+ BSONElement f = j.next();
+ uassert( 13087, "$and/$or/$nor match element must be an object", f.type() == Object );
+ matchers.push_back( shared_ptr< Matcher >( new Matcher( f.embeddedObject(), true ) ) );
+ }
+ }
+
+ bool Matcher::parseClause( const BSONElement &e ) {
+ const char *ef = e.fieldName();
+
+ if ( ef[ 0 ] != '$' )
+ return false;
+
+ // $and
+ if ( ef[ 1 ] == 'a' && ef[ 2 ] == 'n' && ef[ 3 ] == 'd' ) {
+ parseExtractedClause( e, _andMatchers );
+ return true;
+ }
+
+ // $or
+ if ( ef[ 1 ] == 'o' && ef[ 2 ] == 'r' && ef[ 3 ] == 0 ) {
+ parseExtractedClause( e, _orMatchers );
+ return true;
+ }
+
+ // $nor
+ if ( ef[ 1 ] == 'n' && ef[ 2 ] == 'o' && ef[ 3 ] == 'r' && ef[ 4 ] == 0 ) {
+ parseExtractedClause( e, _norMatchers );
+ return true;
+ }
+
+ // $comment
+ if ( ef[ 1 ] == 'c' && ef[ 2 ] == 'o' && ef[ 3 ] == 'm' && str::equals( ef , "$comment" ) ) {
+ return true;
+ }
+
+ return false;
+ }
+
+ // $where: function()...
+ NOINLINE_DECL void Matcher::parseWhere( const BSONElement &e ) {
+ uassert( 15902 , "$where expression has an unexpected type", e.type() == String || e.type() == CodeWScope || e.type() == Code );
+ uassert( 10066 , "$where may only appear once in query", _where == 0 );
+ uassert( 10067 , "$where query, but no script engine", globalScriptEngine );
+ massert( 13089 , "no current client; one is needed for $where" , haveClient() );
+ _where = new Where();
+ _where->scope = globalScriptEngine->getPooledScope( cc().ns() );
+ _where->scope->localConnect( cc().database()->name.c_str() );
+
+ if ( e.type() == CodeWScope ) {
+ _where->setFunc( e.codeWScopeCode() );
+ _where->jsScope = new BSONObj( e.codeWScopeScopeData() );
+ }
+ else {
+ const char *code = e.valuestr();
+ _where->setFunc(code);
+ }
+
+ _where->scope->execSetup( "_mongo.readOnly = true;" , "make read only" );
+ }
+
+ void Matcher::parseMatchExpressionElement( const BSONElement &e, bool nested ) {
+
+ uassert( 13629 , "can't have undefined in a query expression" , e.type() != Undefined );
+
+ if ( parseClause( e ) ) {
+ return;
+ }
+
+ const char *fn = e.fieldName();
+ if ( str::equals(fn, "$where") ) {
+ parseWhere(e);
+ return;
+ }
+
+ if ( e.type() == RegEx ) {
+ addRegex( fn, e.regex(), e.regexFlags() );
+ return;
+ }
+
+ // greater than / less than...
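+ // An element whose value is an object of $-prefixed fields is parsed
+ // as a set of operators applied to that field (via addOp() above),
+ // not as a literal subobject to match. For instance (illustrative):
+ // { a : { $gt : 3, $lt : 7 } } yields two ElementMatcher entries for
+ // "a", one GT and one LT, while { a : { b : 3 } } has no leading $ and
+ // falls through to the exact match case at the end of this function.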
+ // e.g., e == { a : { $gt : 3 } } + // or + // { a : { $in : [1,2,3] } } + if ( e.type() == Object ) { + // support {$regex:"a|b", $options:"imx"} + const char* regex = NULL; + const char* flags = ""; + + // e.g., fe == { $gt : 3 } + BSONObjIterator j(e.embeddedObject()); + bool isOperator = false; + while ( j.more() ) { + BSONElement fe = j.next(); + const char *fn = fe.fieldName(); + + if ( fn[0] == '$' && fn[1] ) { + isOperator = true; + + if ( fn[1] == 'n' && fn[2] == 'o' && fn[3] == 't' && fn[4] == 0 ) { + _haveNeg = true; + switch( fe.type() ) { + case Object: { + BSONObjIterator k( fe.embeddedObject() ); + uassert( 13030, "$not cannot be empty", k.more() ); + while( k.more() ) { + addOp( e, k.next(), true, regex, flags ); + } + break; + } + case RegEx: + addRegex( e.fieldName(), fe.regex(), fe.regexFlags(), true ); + break; + default: + uassert( 13031, "invalid use of $not", false ); + } + } + else { + if ( !addOp( e, fe, false, regex, flags ) ) { + isOperator = false; + break; + } + } + } + else { + isOperator = false; + break; + } + } + if (regex) { + addRegex(e.fieldName(), regex, flags); + } + if ( isOperator ) + return; + } + + if ( e.type() == Array ) { + _hasArray = true; + } + else if( *fn == '$' ) { + if( str::equals(fn, "$atomic") || str::equals(fn, "$isolated") ) { + uassert( 14844, "$atomic specifier must be a top level field", !nested ); + _atomic = e.trueValue(); + return; + } + } + + // normal, simple case e.g. { a : "foo" } + addBasic(e, BSONObj::Equality, false); + } + + /* _jsobj - the query pattern + */ + Matcher::Matcher(const BSONObj &jsobj, bool nested) : + _where(0), _jsobj(jsobj), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) { + + BSONObjIterator i(_jsobj); + while ( i.more() ) { + parseMatchExpressionElement( i.next(), nested ); + } + } + + Matcher::Matcher( const Matcher &docMatcher, const BSONObj &key ) : + _where(0), _constrainIndexKey( key ), _haveSize(), _all(), _hasArray(0), _haveNeg(), _atomic(false) { + // Filter out match components that will provide an incorrect result + // given a key from a single key index. + for( vector< ElementMatcher >::const_iterator i = docMatcher._basics.begin(); i != docMatcher._basics.end(); ++i ) { + if ( key.hasField( i->_toMatch.fieldName() ) ) { + switch( i->_compareOp ) { + case BSONObj::opSIZE: + case BSONObj::opALL: + case BSONObj::NE: + case BSONObj::NIN: + case BSONObj::opEXISTS: // We can't match on index in this case. + case BSONObj::opTYPE: // For $type:10 (null), a null key could be a missing field or a null value field. + break; + case BSONObj::opIN: { + bool inContainsArray = false; + for( set<BSONElement,element_lt>::const_iterator j = i->_myset->begin(); j != i->_myset->end(); ++j ) { + if ( j->type() == Array ) { + inContainsArray = true; + break; + } + } + // Can't match an array to its first indexed element. + if ( !i->_isNot && !inContainsArray ) { + _basics.push_back( *i ); + } + break; + } + default: { + // Can't match an array to its first indexed element. + if ( !i->_isNot && i->_toMatch.type() != Array ) { + _basics.push_back( *i ); + } + } + } + } + } + for( vector<RegexMatcher>::const_iterator it = docMatcher._regexs.begin(); + it != docMatcher._regexs.end(); + ++it) { + if ( !it->_isNot && key.hasField( it->_fieldName ) ) { + _regexs.push_back(*it); + } + } + // Recursively filter match components for and and or matchers. 
+ for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._andMatchers.begin(); i != docMatcher._andMatchers.end(); ++i ) { + _andMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) ); + } + for( list< shared_ptr< Matcher > >::const_iterator i = docMatcher._orMatchers.begin(); i != docMatcher._orMatchers.end(); ++i ) { + _orMatchers.push_back( shared_ptr< Matcher >( new Matcher( **i, key ) ) ); + } + } + + inline bool regexMatches(const RegexMatcher& rm, const BSONElement& e) { + switch (e.type()) { + case String: + case Symbol: + if (rm._prefix.empty()) + return rm._re->PartialMatch(e.valuestr()); + else + return !strncmp(e.valuestr(), rm._prefix.c_str(), rm._prefix.size()); + case RegEx: + return !strcmp(rm._regex, e.regex()) && !strcmp(rm._flags, e.regexFlags()); + default: + return false; + } + } + + inline int Matcher::valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const { + assert( op != BSONObj::NE && op != BSONObj::NIN ); + + if ( op == BSONObj::Equality ) { + return l.valuesEqual(r); + } + + if ( op == BSONObj::opIN ) { + // { $in : [1,2,3] } + int count = bm._myset->count(l); + if ( count ) + return count; + if ( bm._myregex.get() ) { + for( vector<RegexMatcher>::const_iterator i = bm._myregex->begin(); i != bm._myregex->end(); ++i ) { + if ( regexMatches( *i, l ) ) { + return true; + } + } + } + } + + if ( op == BSONObj::opSIZE ) { + if ( l.type() != Array ) + return 0; + int count = 0; + BSONObjIterator i( l.embeddedObject() ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + ++count; + } + return count == r.number(); + } + + if ( op == BSONObj::opMOD ) { + if ( ! l.isNumber() ) + return false; + + return l.numberLong() % bm._mod == bm._modm; + } + + if ( op == BSONObj::opTYPE ) { + return bm._type == l.type(); + } + + /* check LT, GTE, ... */ + if ( l.canonicalType() != r.canonicalType() ) + return false; + int c = compareElementValues(l, r); + if ( c < -1 ) c = -1; + if ( c > 1 ) c = 1; + int z = 1 << (c+1); + return (op & z); + } + + int Matcher::inverseMatch(const char *fieldName, const BSONElement &toMatch, const BSONObj &obj, const ElementMatcher& bm , MatchDetails * details ) const { + int inverseRet = matchesDotted( fieldName, toMatch, obj, bm.inverseOfNegativeCompareOp(), bm , false , details ); + if ( bm.negativeCompareOpContainsNull() ) { + return ( inverseRet <= 0 ) ? 1 : 0; + } + return -inverseRet; + } + + int retExistsFound( const ElementMatcher &bm ) { + return bm._toMatch.trueValue() ? 1 : -1; + } + + /* Check if a particular field matches. + + fieldName - field to match "a.b" if we are reaching into an embedded object. + toMatch - element we want to match. + obj - database object to check against + compareOp - Equality, LT, GT, etc. This may be different than, and should supersede, the compare op in em. + isArr - + + Special forms: + + { "a.b" : 3 } means obj.a.b == 3 + { a : { $lt : 3 } } means obj.a < 3 + { a : { $in : [1,2] } } means [1,2].contains(obj.a) + + return value + -1 mismatch + 0 missing element + 1 match + */ + int Matcher::matchesDotted(const char *fieldName, const BSONElement& toMatch, const BSONObj& obj, int compareOp, const ElementMatcher& em , bool isArr, MatchDetails * details ) const { + DEBUGMATCHER( "\t matchesDotted : " << fieldName << " hasDetails: " << ( details ? 
"yes" : "no" ) ); + + if ( compareOp == BSONObj::opALL ) { + + if ( em._allMatchers.size() ) { + // $all query matching will not be performed against indexes, so the field + // to match is always extracted from the full document. + BSONElement e = obj.getFieldDotted( fieldName ); + // The $all/$elemMatch operator only matches arrays. + if ( e.type() != Array ) { + return -1; + } + + for ( unsigned i=0; i<em._allMatchers.size(); i++ ) { + bool found = false; + BSONObjIterator x( e.embeddedObject() ); + while ( x.more() ) { + BSONElement f = x.next(); + + if ( f.type() != Object ) + continue; + if ( em._allMatchers[i]->matches( f.embeddedObject() ) ) { + found = true; + break; + } + } + + if ( ! found ) + return -1; + } + + return 1; + } + + if ( em._myset->size() == 0 && !em._myregex.get() ) + return -1; // is this desired? + + BSONElementSet myValues; + obj.getFieldsDotted( fieldName , myValues ); + + for( set< BSONElement, element_lt >::const_iterator i = em._myset->begin(); i != em._myset->end(); ++i ) { + // ignore nulls + if ( i->type() == jstNULL ) + continue; + + if ( myValues.count( *i ) == 0 ) + return -1; + } + + if ( !em._myregex.get() ) + return 1; + + for( vector< RegexMatcher >::const_iterator i = em._myregex->begin(); i != em._myregex->end(); ++i ) { + bool match = false; + for( BSONElementSet::const_iterator j = myValues.begin(); j != myValues.end(); ++j ) { + if ( regexMatches( *i, *j ) ) { + match = true; + break; + } + } + if ( !match ) + return -1; + } + + return 1; + } // end opALL + + if ( compareOp == BSONObj::NE || compareOp == BSONObj::NIN ) { + return inverseMatch( fieldName, toMatch, obj, em , details ); + } + + BSONElement e; + bool indexed = !_constrainIndexKey.isEmpty(); + if ( indexed ) { + e = obj.getFieldUsingIndexNames(fieldName, _constrainIndexKey); + if( e.eoo() ) { + cout << "obj: " << obj << endl; + cout << "fieldName: " << fieldName << endl; + cout << "_constrainIndexKey: " << _constrainIndexKey << endl; + assert( !e.eoo() ); + } + } + else { + + const char *p = strchr(fieldName, '.'); + if ( p ) { + string left(fieldName, p-fieldName); + + BSONElement se = obj.getField(left.c_str()); + if ( se.eoo() ) + ; + else if ( se.type() != Object && se.type() != Array ) + ; + else { + BSONObj eo = se.embeddedObject(); + return matchesDotted(p+1, toMatch, eo, compareOp, em, se.type() == Array , details ); + } + } + + // An array was encountered while scanning for components of the field name. + if ( isArr ) { + DEBUGMATCHER( "\t\t isArr 1 : obj : " << obj ); + BSONObjIterator ai(obj); + bool found = false; + while ( ai.moreWithEOO() ) { + BSONElement z = ai.next(); + + if( strcmp(z.fieldName(),fieldName) == 0 ) { + if ( compareOp == BSONObj::opEXISTS ) { + return retExistsFound( em ); + } + if (valuesMatch(z, toMatch, compareOp, em) ) { + // "field.<n>" array notation was used + if ( details ) + details->_elemMatchKey = z.fieldName(); + return 1; + } + } + + if ( z.type() == Object ) { + BSONObj eo = z.embeddedObject(); + int cmp = matchesDotted(fieldName, toMatch, eo, compareOp, em, false, details ); + if ( cmp > 0 ) { + if ( details ) + details->_elemMatchKey = z.fieldName(); + return 1; + } + else if ( cmp < 0 ) { + found = true; + } + } + } + return found ? -1 : 0; + } + + if( p ) { + // Left portion of field name was not found or wrong type. 
+ return 0; + } + else { + e = obj.getField(fieldName); + } + } + + if ( compareOp == BSONObj::opEXISTS ) { + if( e.eoo() ) { + return 0; + } else { + return retExistsFound( em ); + } + } + else if ( ( e.type() != Array || indexed || compareOp == BSONObj::opSIZE ) && + valuesMatch(e, toMatch, compareOp, em ) ) { + return 1; + } + else if ( e.type() == Array && compareOp != BSONObj::opSIZE ) { + BSONObjIterator ai(e.embeddedObject()); + + while ( ai.moreWithEOO() ) { + BSONElement z = ai.next(); + + if ( compareOp == BSONObj::opELEM_MATCH ) { + if ( z.type() == Object ) { + if ( em._subMatcher->matches( z.embeddedObject() ) ) { + if ( details ) + details->_elemMatchKey = z.fieldName(); + return 1; + } + } + else if ( em._subMatcherOnPrimitives ) { + if ( z.type() && em._subMatcher->matches( z.wrap( "" ) ) ) { + if ( details ) + details->_elemMatchKey = z.fieldName(); + return 1; + } + } + } + else { + if ( valuesMatch( z, toMatch, compareOp, em) ) { + if ( details ) + details->_elemMatchKey = z.fieldName(); + return 1; + } + } + + } + + // match an entire array to itself + if ( compareOp == BSONObj::Equality && e.woCompare( toMatch , false ) == 0 ) { + return 1; + } + if ( compareOp == BSONObj::opIN && valuesMatch( e, toMatch, compareOp, em ) ) { + return 1; + } + } + else if ( e.eoo() ) { + return 0; + } + return -1; + } + + extern int dump; + + /* See if an object matches the query. + */ + bool Matcher::matches(const BSONObj& jsobj , MatchDetails * details ) const { + LOG(5) << "Matcher::matches() " << jsobj.toString() << endl; + + /* assuming there is usually only one thing to match. if more this + could be slow sometimes. */ + + // check normal non-regex cases: + for ( unsigned i = 0; i < _basics.size(); i++ ) { + const ElementMatcher& bm = _basics[i]; + const BSONElement& m = bm._toMatch; + // -1=mismatch. 0=missing element. 1=match + int cmp = matchesDotted(m.fieldName(), m, jsobj, bm._compareOp, bm , false , details ); + if ( cmp == 0 && bm._compareOp == BSONObj::opEXISTS ) { + // If missing, match cmp is opposite of $exists spec. + cmp = -retExistsFound(bm); + } + if ( bm._isNot ) + cmp = -cmp; + if ( cmp < 0 ) + return false; + if ( cmp == 0 ) { + /* missing is ok iff we were looking for null */ + if ( m.type() == jstNULL || m.type() == Undefined || + ( ( bm._compareOp == BSONObj::opIN || bm._compareOp == BSONObj::NIN ) && bm._myset->count( staticNull.firstElement() ) > 0 ) ) { + if ( bm.negativeCompareOp() ^ bm._isNot ) { + return false; + } + } + else { + if ( !bm._isNot ) { + return false; + } + } + } + } + + for (vector<RegexMatcher>::const_iterator it = _regexs.begin(); + it != _regexs.end(); + ++it) { + BSONElementSet s; + if ( !_constrainIndexKey.isEmpty() ) { + BSONElement e = jsobj.getFieldUsingIndexNames(it->_fieldName, _constrainIndexKey); + + // Should only have keys nested one deep here, for geo-indices + // TODO: future indices may nest deeper? 
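+ // (when the key element is an array, each member is added to the
+ // candidate set below so the regex is tried against every nested
+ // value, one level deep, matching the geo-index assumption above)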
+ if( e.type() == Array ){ + BSONObjIterator i( e.Obj() ); + while( i.more() ){ + s.insert( i.next() ); + } + } + else if ( !e.eoo() ) + s.insert( e ); + + } + else { + jsobj.getFieldsDotted( it->_fieldName, s ); + } + bool match = false; + for( BSONElementSet::const_iterator i = s.begin(); i != s.end(); ++i ) + if ( regexMatches(*it, *i) ) + match = true; + if ( !match ^ it->_isNot ) + return false; + } + + if ( _orDedupConstraints.size() > 0 ) { + for( vector< shared_ptr< FieldRangeVector > >::const_iterator i = _orDedupConstraints.begin(); + i != _orDedupConstraints.end(); ++i ) { + if ( (*i)->matches( jsobj ) ) { + return false; + } + } + } + + if ( _andMatchers.size() > 0 ) { + for( list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin(); + i != _andMatchers.end(); ++i ) { + // SERVER-3192 Track field matched using details the same as for + // top level fields, at least for now. + if ( !(*i)->matches( jsobj, details ) ) { + return false; + } + } + } + + if ( _orMatchers.size() > 0 ) { + bool match = false; + for( list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin(); + i != _orMatchers.end(); ++i ) { + // SERVER-205 don't submit details - we don't want to track field + // matched within $or + if ( (*i)->matches( jsobj ) ) { + match = true; + break; + } + } + if ( !match ) { + return false; + } + } + + if ( _norMatchers.size() > 0 ) { + for( list< shared_ptr< Matcher > >::const_iterator i = _norMatchers.begin(); + i != _norMatchers.end(); ++i ) { + // SERVER-205 don't submit details - we don't want to track field + // matched within $nor + if ( (*i)->matches( jsobj ) ) { + return false; + } + } + } + + if ( _where ) { + if ( _where->func == 0 ) { + uassert( 10070 , "$where compile error", false); + return false; // didn't compile + } + + if ( _where->jsScope ) { + _where->scope->init( _where->jsScope ); + } + _where->scope->setObject( "obj", const_cast< BSONObj & >( jsobj ) ); + _where->scope->setBoolean( "fullObject" , true ); // this is a hack b/c fullObject used to be relevant + + int err = _where->scope->invoke( _where->func , 0, &jsobj , 1000 * 60 , false ); + if ( err == -3 ) { // INVOKE_ERROR + stringstream ss; + ss << "error on invocation of $where function:\n" + << _where->scope->getError(); + uassert( 10071 , ss.str(), false); + return false; + } + else if ( err != 0 ) { // ! INVOKE_SUCCESS + uassert( 10072 , "unknown error in invocation of $where function", false); + return false; + } + return _where->scope->getBoolean( "return" ) != 0; + + } + + return true; + } + + bool Matcher::keyMatch( const Matcher &docMatcher ) const { + // Quick check certain non key match cases. + if ( docMatcher._all + || docMatcher._haveSize + || docMatcher._hasArray // We can't match an array to its first indexed element using keymatch + || docMatcher._haveNeg ) { + return false; + } + + // Check that all match components are available in the index matcher. + if ( !( _basics.size() == docMatcher._basics.size() && _regexs.size() == docMatcher._regexs.size() && !docMatcher._where ) ) { + return false; + } + if ( _andMatchers.size() != docMatcher._andMatchers.size() ) { + return false; + } + if ( _orMatchers.size() != docMatcher._orMatchers.size() ) { + return false; + } + if ( docMatcher._norMatchers.size() > 0 ) { + return false; + } + if ( docMatcher._orDedupConstraints.size() > 0 ) { + return false; + } + + // Recursively check that all submatchers support key match. 
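+ // (the size checks above guarantee the two lists line up, so each key
+ // submatcher is walked pairwise with the document submatcher it was
+ // derived from)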
+ { + list< shared_ptr< Matcher > >::const_iterator i = _andMatchers.begin(); + list< shared_ptr< Matcher > >::const_iterator j = docMatcher._andMatchers.begin(); + while( i != _andMatchers.end() ) { + if ( !(*i)->keyMatch( **j ) ) { + return false; + } + ++i; ++j; + } + } + { + list< shared_ptr< Matcher > >::const_iterator i = _orMatchers.begin(); + list< shared_ptr< Matcher > >::const_iterator j = docMatcher._orMatchers.begin(); + while( i != _orMatchers.end() ) { + if ( !(*i)->keyMatch( **j ) ) { + return false; + } + ++i; ++j; + } + } + // Nor matchers and or dedup constraints aren't created for index matchers, + // so no need to check those here. + return true; + } + + + /*- just for testing -- */ +#pragma pack(1) + struct JSObj1 { + JSObj1() { + totsize=sizeof(JSObj1); + n = NumberDouble; + strcpy_s(nname, 5, "abcd"); + N = 3.1; + s = String; + strcpy_s(sname, 7, "abcdef"); + slen = 10; + strcpy_s(sval, 10, "123456789"); + eoo = EOO; + } + unsigned totsize; + + char n; + char nname[5]; + double N; + + char s; + char sname[7]; + unsigned slen; + char sval[10]; + + char eoo; + }; +#pragma pack() + + struct JSObj1 js1; + +#pragma pack(1) + struct JSObj2 { + JSObj2() { + totsize=sizeof(JSObj2); + s = String; + strcpy_s(sname, 7, "abcdef"); + slen = 10; + strcpy_s(sval, 10, "123456789"); + eoo = EOO; + } + unsigned totsize; + char s; + char sname[7]; + unsigned slen; + char sval[10]; + char eoo; + } js2; + + struct JSUnitTest : public UnitTest { + void run() { + + BSONObj j1((const char *) &js1); + BSONObj j2((const char *) &js2); + Matcher m(j2); + assert( m.matches(j1) ); + js2.sval[0] = 'z'; + assert( !m.matches(j1) ); + Matcher n(j1); + assert( n.matches(j1) ); + assert( !n.matches(j2) ); + + BSONObj j0 = BSONObj(); +// BSONObj j0((const char *) &js0); + Matcher p(j0); + assert( p.matches(j1) ); + assert( p.matches(j2) ); + } + } jsunittest; + +#pragma pack() + + struct RXTest : public UnitTest { + + RXTest() { + } + + void run() { + /* + static const boost::regex e("(\\d{4}[- ]){3}\\d{4}"); + static const boost::regex b("....."); + out() << "regex result: " << regex_match("hello", e) << endl; + out() << "regex result: " << regex_match("abcoo", b) << endl; + */ + + int ret = 0; + + pcre_config( PCRE_CONFIG_UTF8 , &ret ); + massert( 10342 , "pcre not compiled with utf8 support" , ret ); + + pcrecpp::RE re1(")({a}h.*o"); + pcrecpp::RE re("h.llo"); + assert( re.FullMatch("hello") ); + assert( !re1.FullMatch("hello") ); + + + pcrecpp::RE_Options options; + options.set_utf8(true); + pcrecpp::RE part("dwi", options); + assert( part.PartialMatch("dwight") ); + + pcre_config( PCRE_CONFIG_UNICODE_PROPERTIES , &ret ); + if ( ! ret ) + cout << "warning: some regex utf8 things will not work. pcre build doesn't have --enable-unicode-properties" << endl; + + } + } rxtest; + +} // namespace mongo diff --git a/src/mongo/db/matcher.h b/src/mongo/db/matcher.h new file mode 100644 index 00000000000..b6994a79229 --- /dev/null +++ b/src/mongo/db/matcher.h @@ -0,0 +1,276 @@ +// matcher.h + +/* Matcher is our boolean expression evaluator for "where" clauses */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "jsobj.h" +#include "pcrecpp.h" + +namespace mongo { + + class Cursor; + class CoveredIndexMatcher; + class Matcher; + class FieldRangeVector; + + class RegexMatcher { + public: + const char *_fieldName; + const char *_regex; + const char *_flags; + string _prefix; + shared_ptr< pcrecpp::RE > _re; + bool _isNot; + RegexMatcher() : _isNot() {} + }; + + struct element_lt { + bool operator()(const BSONElement& l, const BSONElement& r) const { + int x = (int) l.canonicalType() - (int) r.canonicalType(); + if ( x < 0 ) return true; + else if ( x > 0 ) return false; + return compareElementValues(l,r) < 0; + } + }; + + + class ElementMatcher { + public: + + ElementMatcher() { + } + + ElementMatcher( BSONElement e , int op, bool isNot ); + + ElementMatcher( BSONElement e , int op , const BSONObj& array, bool isNot ); + + ~ElementMatcher() { } + + bool negativeCompareOp() const { return _compareOp == BSONObj::NE || _compareOp == BSONObj::NIN; } + int inverseOfNegativeCompareOp() const; + bool negativeCompareOpContainsNull() const; + + BSONElement _toMatch; + int _compareOp; + bool _isNot; + shared_ptr< set<BSONElement,element_lt> > _myset; + shared_ptr< vector<RegexMatcher> > _myregex; + + // these are for specific operators + int _mod; + int _modm; + BSONType _type; + + shared_ptr<Matcher> _subMatcher; + bool _subMatcherOnPrimitives ; + + vector< shared_ptr<Matcher> > _allMatchers; + }; + + class Where; // used for $where javascript eval + class DiskLoc; + + struct MatchDetails { + MatchDetails() { + reset(); + } + + void reset() { + _loadedObject = false; + _elemMatchKey = 0; + } + + string toString() const { + stringstream ss; + ss << "loadedObject: " << _loadedObject << " "; + ss << "elemMatchKey: " << ( _elemMatchKey ? _elemMatchKey : "NULL" ) << " "; + return ss.str(); + } + + bool _loadedObject; + const char * _elemMatchKey; // warning, this may go out of scope if matched object does + }; + + /* Match BSON objects against a query pattern. + + e.g. + db.foo.find( { a : 3 } ); + + { a : 3 } is the pattern object. See wiki documentation for full info. + + GT/LT: + { a : { $gt : 3 } } + Not equal: + { a : { $ne : 3 } } + + TODO: we should rewrite the matcher to be more an AST style. + */ + class Matcher : boost::noncopyable { + int matchesDotted( + const char *fieldName, + const BSONElement& toMatch, const BSONObj& obj, + int compareOp, const ElementMatcher& bm, bool isArr , MatchDetails * details ) const; + + /** + * Perform a NE or NIN match by returning the inverse of the opposite matching operation. + * Missing values are considered matches unless the match must not equal null. + */ + int inverseMatch( + const char *fieldName, + const BSONElement &toMatch, const BSONObj &obj, + const ElementMatcher&bm, MatchDetails * details ) const; + + public: + static int opDirection(int op) { + return op <= BSONObj::LTE ? 
-1 : 1;
+ }
+
+ Matcher(const BSONObj &pattern, bool nested=false);
+
+ ~Matcher();
+
+ bool matches(const BSONObj& j, MatchDetails * details = 0 ) const;
+
+ bool atomic() const { return _atomic; }
+
+ string toString() const {
+ return _jsobj.toString();
+ }
+
+ void addOrDedupConstraint( const shared_ptr< FieldRangeVector > &frv ) {
+ _orDedupConstraints.push_back( frv );
+ }
+
+ void popOrClause() {
+ _orMatchers.pop_front();
+ }
+
+ /**
+ * @return true if this key matcher will return the same true/false
+ * value as the provided doc matcher.
+ */
+ bool keyMatch( const Matcher &docMatcher ) const;
+
+ bool singleSimpleCriterion() const {
+ return false; // TODO SERVER-958
+// // TODO Really check, especially if all basics are ok.
+// // $all, etc
+// // _orConstraints?
+// return ( ( basics.size() + nRegex ) < 2 ) && !where && !_orMatchers.size() && !_norMatchers.size();
+ }
+
+ const BSONObj *getQuery() const { return &_jsobj; }
+
+ private:
+ /**
+ * Generate a matcher for the provided index key format using the
+ * provided full doc matcher.
+ */
+ Matcher( const Matcher &docMatcher, const BSONObj &constrainIndexKey );
+
+ void addBasic(const BSONElement &e, int c, bool isNot) {
+ // TODO May want to selectively ignore these element types based on op type.
+ if ( e.type() == MinKey || e.type() == MaxKey )
+ return;
+ _basics.push_back( ElementMatcher( e , c, isNot ) );
+ }
+
+ void addRegex(const char *fieldName, const char *regex, const char *flags, bool isNot = false);
+ bool addOp( const BSONElement &e, const BSONElement &fe, bool isNot, const char *& regex, const char *&flags );
+
+ int valuesMatch(const BSONElement& l, const BSONElement& r, int op, const ElementMatcher& bm) const;
+
+ bool parseClause( const BSONElement &e );
+ void parseExtractedClause( const BSONElement &e, list< shared_ptr< Matcher > > &matchers );
+
+ void parseWhere( const BSONElement &e );
+ void parseMatchExpressionElement( const BSONElement &e, bool nested );
+
+ Where *_where; // set if query uses $where
+ BSONObj _jsobj; // the query pattern. e.g., { name: "joe" }
+ BSONObj _constrainIndexKey;
+ vector<ElementMatcher> _basics;
+ bool _haveSize;
+ bool _all;
+ bool _hasArray;
+ bool _haveNeg;
+
+ /* $atomic - if true, a multi document operation (some removes, updates)
+ should be done atomically. in that case, we do not yield -
+ i.e. we stay locked the whole time.
+ http://www.mongodb.org/display/DOCS/Removing
+ */
+ bool _atomic;
+
+ vector<RegexMatcher> _regexs;
+
+ // so we delete the mem when we're done:
+ vector< shared_ptr< BSONObjBuilder > > _builders;
+ list< shared_ptr< Matcher > > _andMatchers;
+ list< shared_ptr< Matcher > > _orMatchers;
+ list< shared_ptr< Matcher > > _norMatchers;
+ vector< shared_ptr< FieldRangeVector > > _orDedupConstraints;
+
+ friend class CoveredIndexMatcher;
+ };
+
+ // If match succeeds on index key, then attempt to match full document.
+ class CoveredIndexMatcher : boost::noncopyable {
+ public:
+ CoveredIndexMatcher(const BSONObj &pattern, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false );
+ bool matches(const BSONObj &o) { return _docMatcher->matches( o ); }
+ bool matchesWithSingleKeyIndex(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 ) {
+ return matches( key, recLoc, details, true );
+ }
+ /**
+ * This is the preferred method for matching against a cursor, as it
+ * can handle both multi and single key cursors.
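+ * For a multikey or unindexed cursor the current key alone cannot be
+ * trusted, so matchesCurrent() passes keyUsable=false to matches() and
+ * matching falls back to loading the full document (see the keyUsable
+ * computation in matcher_covered.cpp).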
+ */ + bool matchesCurrent( Cursor * cursor , MatchDetails * details = 0 ); + bool needRecord() { return _needRecord; } + + Matcher& docMatcher() { return *_docMatcher; } + + // once this is called, shouldn't use this matcher for matching any more + void advanceOrClause( const shared_ptr< FieldRangeVector > &frv ) { + _docMatcher->addOrDedupConstraint( frv ); + // TODO this is not yet optimal. Since we could skip an entire + // or clause (if a match is impossible) between calls to advanceOrClause() + // we may not pop all the clauses we can. + _docMatcher->popOrClause(); + } + + CoveredIndexMatcher *nextClauseMatcher( const BSONObj &indexKeyPattern, bool alwaysUseRecord=false ) { + return new CoveredIndexMatcher( _docMatcher, indexKeyPattern, alwaysUseRecord ); + } + + string toString() const; + + private: + bool matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details = 0 , bool keyUsable = true ); + CoveredIndexMatcher(const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord=false ); + void init( bool alwaysUseRecord ); + shared_ptr< Matcher > _docMatcher; + Matcher _keyMatcher; + + bool _needRecord; // if the key itself isn't good enough to determine a positive match + }; + +} // namespace mongo diff --git a/src/mongo/db/matcher_covered.cpp b/src/mongo/db/matcher_covered.cpp new file mode 100644 index 00000000000..c6c89d03007 --- /dev/null +++ b/src/mongo/db/matcher_covered.cpp @@ -0,0 +1,101 @@ +// matcher_covered.cpp + +/* Matcher is our boolean expression evaluator for "where" clauses */ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "matcher.h" +#include "../util/goodies.h" +#include "../util/unittest.h" +#include "diskloc.h" +#include "../scripting/engine.h" +#include "db.h" +#include "client.h" + +#include "pdfile.h" + +namespace mongo { + + CoveredIndexMatcher::CoveredIndexMatcher( const BSONObj &jsobj, const BSONObj &indexKeyPattern, bool alwaysUseRecord) : + _docMatcher( new Matcher( jsobj ) ), + _keyMatcher( *_docMatcher, indexKeyPattern ) { + init( alwaysUseRecord ); + } + + CoveredIndexMatcher::CoveredIndexMatcher( const shared_ptr< Matcher > &docMatcher, const BSONObj &indexKeyPattern , bool alwaysUseRecord ) : + _docMatcher( docMatcher ), + _keyMatcher( *_docMatcher, indexKeyPattern ) { + init( alwaysUseRecord ); + } + + void CoveredIndexMatcher::init( bool alwaysUseRecord ) { + _needRecord = + alwaysUseRecord || + !_keyMatcher.keyMatch( *_docMatcher ); + } + + bool CoveredIndexMatcher::matchesCurrent( Cursor * cursor , MatchDetails * details ) { + // bool keyUsable = ! 
cursor->isMultiKey() && check for $orish like conditions in matcher SERVER-1264 + return matches( cursor->currKey() , cursor->currLoc() , details , + !cursor->indexKeyPattern().isEmpty() // unindexed cursor + && !cursor->isMultiKey() // multikey cursor + ); + } + + bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details , bool keyUsable ) { + + LOG(5) << "CoveredIndexMatcher::matches() " << key.toString() << ' ' << recLoc.toString() << ' ' << keyUsable << endl; + + dassert( key.isValid() ); + + if ( details ) + details->reset(); + + if ( keyUsable ) { + if ( !_keyMatcher.matches(key, details ) ) { + return false; + } + if ( ! _needRecord ) { + return true; + } + } + + if ( details ) + details->_loadedObject = true; + + bool res = _docMatcher->matches(recLoc.obj() , details ); + LOG(5) << "CoveredIndexMatcher _docMatcher->matches() returns " << res << endl; + return res; + } + + string CoveredIndexMatcher::toString() const { + StringBuilder buf; + buf << "(CoveredIndexMatcher "; + + if ( _needRecord ) + buf << "needRecord "; + + buf << "keyMatcher: " << _keyMatcher.toString() << " "; + + if ( _docMatcher ) + buf << "docMatcher: " << _docMatcher->toString() << " "; + + buf << ")"; + return buf.str(); + } +} diff --git a/src/mongo/db/minilex.h b/src/mongo/db/minilex.h new file mode 100644 index 00000000000..677514aa47c --- /dev/null +++ b/src/mongo/db/minilex.h @@ -0,0 +1,164 @@ +// minilex.h +// mini js lexical analyzer. idea is to be dumb and fast. + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#error does anything use this? + +namespace mongo { + +#if defined(_WIN32) + +} // namespace mongo + +#include <hash_map> +using namespace stdext; + +namespace mongo { + + typedef const char * MyStr; + struct less_str { + bool operator()(const MyStr & x, const MyStr & y) const { + if ( strcmp(x, y) > 0) + return true; + + return false; + } + }; + + typedef hash_map<const char*, int, hash_compare<const char *, less_str> > strhashmap; + +#else + +} // namespace mongo + +#include <ext/hash_map> + +namespace mongo { + + using namespace __gnu_cxx; + + typedef const char * MyStr; + struct eq_str { + bool operator()(const MyStr & x, const MyStr & y) const { + if ( strcmp(x, y) == 0) + return true; + + return false; + } + }; + + typedef hash_map<const char*, int, hash<const char *>, eq_str > strhashmap; + +#endif + + /* + struct MiniLexNotUsed { + strhashmap reserved; + bool ic[256]; // ic=Identifier Character + bool starter[256]; + + // dm: very dumb about comments and escaped quotes -- but we are faster then at least, + // albeit returning too much (which is ok for jsbobj current usage). 
+ void grabVariables(char *code , strhashmap& vars) { // 'code' modified and must stay in scope*/ + char *p = code; + char last = 0; + while ( *p ) { + if ( starter[*p] ) { + char *q = p+1; + while ( *q && ic[*q] ) q++; + const char *identifier = p; + bool done = *q == 0; + *q = 0; + if ( !reserved.count(identifier) ) { + // we try to be smart about 'obj' but have to be careful as obj.obj + // can happen; this is so that nFields is right for simplistic where cases + // so we can stop scanning in jsobj when we find the field of interest. + if ( strcmp(identifier,"obj")==0 && p>code && p[-1] != '.' ) + ; + else + vars[identifier] = 1; + } + if ( done ) + break; + p = q + 1; + continue; + } + + if ( *p == '\'' ) { + p++; + while ( *p && *p != '\'' ) p++; + } + else if ( *p == '"' ) { + p++; + while ( *p && *p != '"' ) p++; + } + p++; + } +} + +MiniLex() { + strhashmap atest; + atest["foo"] = 3; + assert( atest.count("bar") == 0 ); + assert( atest.count("foo") == 1 ); + assert( atest["foo"] == 3 ); + + for ( int i = 0; i < 256; i++ ) { + ic[i] = starter[i] = false; + } + for ( int i = 'a'; i <= 'z'; i++ ) + ic[i] = starter[i] = true; + for ( int i = 'A'; i <= 'Z'; i++ ) + ic[i] = starter[i] = true; + for ( int i = '0'; i <= '9'; i++ ) + ic[i] = true; + for ( int i = 128; i < 256; i++ ) + ic[i] = starter[i] = true; + ic['$'] = starter['$'] = true; + ic['_'] = starter['_'] = true; + + reserved["break"] = true; + reserved["case"] = true; + reserved["catch"] = true; + reserved["continue"] = true; + reserved["default"] = true; + reserved["delete"] = true; + reserved["do"] = true; + reserved["else"] = true; + reserved["finally"] = true; + reserved["for"] = true; + reserved["function"] = true; + reserved["if"] = true; + reserved["in"] = true; + reserved["instanceof"] = true; + reserved["new"] = true; + reserved["return"] = true; + reserved["switch"] = true; + reserved["this"] = true; + reserved["throw"] = true; + reserved["try"] = true; + reserved["typeof"] = true; + reserved["var"] = true; + reserved["void"] = true; + reserved["while"] = true; + reserved["with "] = true; +} +}; +*/ + +} // namespace mongo diff --git a/src/mongo/db/module.cpp b/src/mongo/db/module.cpp new file mode 100644 index 00000000000..4269c5e99a0 --- /dev/null +++ b/src/mongo/db/module.cpp @@ -0,0 +1,68 @@ +// module.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#include "pch.h" +#include "module.h" + +namespace mongo { + + std::list<Module*> * Module::_all; + + Module::Module( const string& name ) + : _name( name ) , _options( (string)"Module " + name + " options" ) { + if ( ! _all ) + _all = new list<Module*>(); + _all->push_back( this ); + } + + Module::~Module() {} + + void Module::addOptions( boost::program_options::options_description& options ) { + if ( ! 
_all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ options.add( m->_options );
+ }
+ }
+
+ void Module::configAll( boost::program_options::variables_map& params ) {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->config( params );
+ }
+
+ }
+
+
+ void Module::initAll() {
+ if ( ! _all ) {
+ return;
+ }
+ for ( list<Module*>::iterator i=_all->begin(); i!=_all->end(); i++ ) {
+ Module* m = *i;
+ m->init();
+ }
+
+ }
+
+}
diff --git a/src/mongo/db/module.h b/src/mongo/db/module.h
new file mode 100644
index 00000000000..71f276e0585
--- /dev/null
+++ b/src/mongo/db/module.h
@@ -0,0 +1,70 @@
+// module.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../pch.h"
+#include <boost/program_options.hpp>
+#include <list>
+
+namespace mongo {
+
+ /**
+ * Module is the base class for adding modules to MongoDB.
+ * Modules allow adding hooks and features to mongo;
+ * the idea is to add hooks into the main code where module support is needed.
+ * Some ideas are: monitoring, indexes, full text search.
+ */
+ class Module {
+ public:
+ Module( const string& name );
+ virtual ~Module();
+
+ boost::program_options::options_description_easy_init add_options() {
+ return _options.add_options();
+ }
+
+ /**
+ * read config from command line
+ */
+ virtual void config( boost::program_options::variables_map& params ) = 0;
+
+ /**
+ * called after configuration when the server is ready to start
+ */
+ virtual void init() = 0;
+
+ /**
+ * called when the database is about to shut down
+ */
+ virtual void shutdown() = 0;
+
+ const string& getName() { return _name; }
+
+ // --- static things
+
+ static void addOptions( boost::program_options::options_description& options );
+ static void configAll( boost::program_options::variables_map& params );
+ static void initAll();
+
+ private:
+ static std::list<Module*> * _all;
+ string _name;
+ boost::program_options::options_description _options;
+ };
+}
diff --git a/src/mongo/db/modules/mms.cpp b/src/mongo/db/modules/mms.cpp
new file mode 100644
index 00000000000..418a553f283
--- /dev/null
+++ b/src/mongo/db/modules/mms.cpp
@@ -0,0 +1,170 @@
+// @file mms.cpp
+/*
+ * Copyright (C) 2010 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "pch.h"
+#include "../db.h"
+#include "../instance.h"
+#include "../module.h"
+#include "../../util/net/httpclient.h"
+#include "../../util/background.h"
+#include "../commands.h"
+
+namespace po = boost::program_options;
+
+namespace mongo {
+
+ /** Mongo Monitoring Service
+ if enabled, this runs in the background and pings mms
+ */
+ class MMS : public BackgroundJob , Module {
+ public:
+
+ MMS()
+ : Module( "mms" ) , _baseurl( "" ) ,
+ _secsToSleep(1) , _token( "" ) , _name( "" ) {
+
+ add_options()
+ ( "mms-url" , po::value<string>()->default_value("http://mms.10gen.com/ping") , "url for mongo monitoring server" )
+ ( "mms-token" , po::value<string>() , "account token for mongo monitoring server" )
+ ( "mms-name" , po::value<string>() , "server name for mongo monitoring server" )
+ ( "mms-interval" , po::value<int>()->default_value(30) , "ping interval (in seconds) for mongo monitoring server" )
+ ;
+ }
+
+ ~MMS() {}
+
+ void config( boost::program_options::variables_map& params ) {
+ _baseurl = params["mms-url"].as<string>();
+ if ( params.count( "mms-token" ) ) {
+ _token = params["mms-token"].as<string>();
+ }
+ if ( params.count( "mms-name" ) ) {
+ _name = params["mms-name"].as<string>();
+ }
+ _secsToSleep = params["mms-interval"].as<int>();
+ }
+
+ void run() {
+ if ( _token.size() == 0 && _name.size() == 0 ) {
+ log(1) << "mms not configured" << endl;
+ return;
+ }
+
+ if ( _token.size() == 0 ) {
+ log() << "no token for mms - not running" << endl;
+ return;
+ }
+
+ if ( _name.size() == 0 ) {
+ log() << "no name for mms - not running" << endl;
+ return;
+ }
+
+ log() << "mms monitor starting... token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
+ Client::initThread( "mms" );
+ Client& c = cc();
+
+
+ // TODO: using direct client is bad, but easy for now
+
+ while ( ! inShutdown() ) {
+ sleepsecs( _secsToSleep );
+
+ try {
+ stringstream url;
+ url << _baseurl << "?"
+ << "token=" << _token << "&"
+ << "name=" << _name << "&"
+ << "ts=" << time(0)
+ ;
+
+ BSONObjBuilder bb;
+ // duplicated so the post has everything
+ bb.append( "token" , _token );
+ bb.append( "name" , _name );
+ bb.appendDate( "ts" , jsTime() );
+
+ // any commands
+ _add( bb , "buildinfo" );
+ _add( bb , "serverStatus" );
+
+ BSONObj postData = bb.obj();
+
+ log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;
+
+ HttpClient hc; // distinct name: 'c' above is this thread's Client
+ HttpClient::Result r;
+ int rc = hc.post( url.str() , postData.jsonString() , &r );
+ log(1) << "\t response code: " << rc << endl;
+ if ( rc != 200 ) {
+ log() << "mms error response code:" << rc << endl;
+ log(1) << "mms error body:" << r.getEntireResponse() << endl;
+ }
+ }
+ catch ( std::exception& e ) {
+ log() << "mms exception: " << e.what() << endl;
+ }
+ }
+
+ c.shutdown();
+ }
+
+ void _add( BSONObjBuilder& postData , const char* cmd ) {
+ Command * c = Command::findCommand( cmd );
+ if ( ! c ) {
+ log() << "MMS can't find command: " << cmd << endl;
+ postData.append( cmd , "can't find command" );
+ return;
+ }
+
+ if ( c->locktype() ) {
+ log() << "MMS can only use noLocking commands, not: " << cmd << endl;
+ postData.append( cmd , "not noLocking" );
+ return;
+ }
+
+ BSONObj co = BSON( cmd << 1 );
+
+ string errmsg;
+ BSONObjBuilder sub;
+ if ( !
c->run( "admin.$cmd" , co , 0 , errmsg , sub , false ) ) + postData.append( cmd , errmsg ); + else + postData.append( cmd , sub.obj() ); + } + + + void init() { go(); } + + void shutdown() { + // TODO + } + + private: + string _baseurl; + int _secsToSleep; + + string _token; + string _name; + + } /*mms*/ ; + +} + + + diff --git a/src/mongo/db/mongo.ico b/src/mongo/db/mongo.ico Binary files differnew file mode 100755 index 00000000000..5258b6e0446 --- /dev/null +++ b/src/mongo/db/mongo.ico diff --git a/src/mongo/db/mongommf.cpp b/src/mongo/db/mongommf.cpp new file mode 100644 index 00000000000..af2e822404e --- /dev/null +++ b/src/mongo/db/mongommf.cpp @@ -0,0 +1,339 @@ +// @file mongommf.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* this module adds some of our layers atop memory mapped files - specifically our handling of private views & such + if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, not this. +*/ + +#include "pch.h" +#include "cmdline.h" +#include "mongommf.h" +#include "dur.h" +#include "dur_journalformat.h" +#include "../util/mongoutils/str.h" +#include "mongomutex.h" +#include "d_globals.h" + +using namespace mongoutils; + +namespace mongo { + +#if defined(_WIN32) + extern mutex mapViewMutex; + + __declspec(noinline) void makeChunkWritable(size_t chunkno) { + scoped_lock lk(mapViewMutex); + + if( writable.get(chunkno) ) // double check lock + return; + + // remap all maps in this chunk. 
common case is a single map, but could have more than one with smallfiles or .ns files + size_t chunkStart = chunkno * MemoryMappedFile::ChunkSize; + size_t chunkNext = chunkStart + MemoryMappedFile::ChunkSize; + + scoped_lock lk2(privateViews._mutex()); + map<void*,MongoMMF*>::iterator i = privateViews.finditer_inlock((void*) (chunkNext-1)); + while( 1 ) { + const pair<void*,MongoMMF*> x = *(--i); + MongoMMF *mmf = x.second; + if( mmf == 0 ) + break; + + size_t viewStart = (size_t) x.first; + size_t viewEnd = (size_t) (viewStart + mmf->length()); + if( viewEnd <= chunkStart ) + break; + + size_t protectStart = max(viewStart, chunkStart); + dassert(protectStart<chunkNext); + + size_t protectEnd = min(viewEnd, chunkNext); + size_t protectSize = protectEnd - protectStart; + dassert(protectSize>0&&protectSize<=MemoryMappedFile::ChunkSize); + + DWORD old; + bool ok = VirtualProtect((void*)protectStart, protectSize, PAGE_WRITECOPY, &old); + if( !ok ) { + DWORD e = GetLastError(); + log() << "VirtualProtect failed (mcw) " << mmf->filename() << ' ' << chunkno << hex << protectStart << ' ' << protectSize << ' ' << errnoWithDescription(e) << endl; + assert(false); + } + } + + writable.set(chunkno); + } + + void* MemoryMappedFile::createPrivateMap() { + assert( maphandle ); + scoped_lock lk(mapViewMutex); + void *p = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0); + if ( p == 0 ) { + DWORD e = GetLastError(); + log() << "createPrivateMap failed " << filename() << " " << + errnoWithDescription(e) << " filelen:" << len << + ((sizeof(void*) == 4 ) ? " (32 bit build)" : "") << + endl; + } + else { + clearWritableBits(p); + views.push_back(p); + } + return p; + } + + void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) { + d.dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive + + // the mapViewMutex is to assure we get the same address on the remap + scoped_lock lk(mapViewMutex); + + clearWritableBits(oldPrivateAddr); +#if 1 + // https://jira.mongodb.org/browse/SERVER-2942 + DWORD old; + bool ok = VirtualProtect(oldPrivateAddr, (SIZE_T) len, PAGE_READONLY, &old); + if( !ok ) { + DWORD e = GetLastError(); + log() << "VirtualProtect failed in remapPrivateView " << filename() << hex << oldPrivateAddr << ' ' << len << ' ' << errnoWithDescription(e) << endl; + assert(false); + } + return oldPrivateAddr; +#else + if( !UnmapViewOfFile(oldPrivateAddr) ) { + DWORD e = GetLastError(); + log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl; + assert(false); + } + + // we want the new address to be the same as the old address in case things keep pointers around (as namespaceindex does). + void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0, + /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/, + oldPrivateAddr); + + if ( p == 0 ) { + DWORD e = GetLastError(); + log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl; + assert(p); + } + assert(p == oldPrivateAddr); + return p; +#endif + } +#endif + + void MongoMMF::remapThePrivateView() { + assert( cmdLine.dur ); + + // todo 1.9 : it turns out we require that we always remap to the same address. + // so the remove / add isn't necessary and can be removed + privateViews.remove(_view_private); + _view_private = remapPrivateView(_view_private); + privateViews.add(_view_private, this); + } + + /** register view. 
threadsafe */ + void PointerToMMF::add(void *view, MongoMMF *f) { + assert(view); + assert(f); + mutex::scoped_lock lk(_m); + _views.insert( pair<void*,MongoMMF*>(view,f) ); + } + + /** de-register view. threadsafe */ + void PointerToMMF::remove(void *view) { + if( view ) { + mutex::scoped_lock lk(_m); + _views.erase(view); + } + } + + PointerToMMF::PointerToMMF() : _m("PointerToMMF") { +#if defined(SIZE_MAX) + size_t max = SIZE_MAX; +#else + size_t max = ~((size_t)0); +#endif + assert( max > (size_t) this ); // just checking that no one redef'd SIZE_MAX and that it is sane + + // this way we don't need any boundary checking in _find() + _views.insert( pair<void*,MongoMMF*>((void*)0,(MongoMMF*)0) ); + _views.insert( pair<void*,MongoMMF*>((void*)max,(MongoMMF*)0) ); + } + + /** underscore version of find is for when you are already locked + @param ofs out return our offset in the view + @return the MongoMMF to which this pointer belongs + */ + MongoMMF* PointerToMMF::find_inlock(void *p, /*out*/ size_t& ofs) { + // + // .................memory.......................... + // v1 p v2 + // [--------------------] [-------] + // + // e.g., _find(p) == v1 + // + const pair<void*,MongoMMF*> x = *(--_views.upper_bound(p)); + MongoMMF *mmf = x.second; + if( mmf ) { + size_t o = ((char *)p) - ((char*)x.first); + if( o < mmf->length() ) { + ofs = o; + return mmf; + } + } + return 0; + } + + /** find associated MMF object for a given pointer. + threadsafe + @param ofs out returns offset into the view of the pointer, if found. + @return the MongoMMF to which this pointer belongs. null if not found. + */ + MongoMMF* PointerToMMF::find(void *p, /*out*/ size_t& ofs) { + mutex::scoped_lock lk(_m); + return find_inlock(p, ofs); + } + + PointerToMMF privateViews; + + /* void* MongoMMF::switchToPrivateView(void *readonly_ptr) { + assert( cmdLine.dur ); + assert( testIntent ); + + void *p = readonly_ptr; + + { + size_t ofs=0; + MongoMMF *mmf = ourReadViews.find(p, ofs); + if( mmf ) { + void *res = ((char *)mmf->_view_private) + ofs; + return res; + } + } + + { + size_t ofs=0; + MongoMMF *mmf = privateViews.find(p, ofs); + if( mmf ) { + log() << "dur: perf warning p=" << p << " is already in the writable view of " << mmf->filename() << endl; + return p; + } + } + + // did you call writing() with a pointer that isn't into a datafile? + log() << "dur error switchToPrivateView " << p << endl; + return p; + }*/ + + /* switch to _view_write. normally, this is a bad idea since your changes will not + show up in _view_private if there have been changes there; thus the leading underscore + as a tad of a "warning". but useful when done with some care, such as during + initialization. + */ + void* MongoMMF::_switchToWritableView(void *p) { + size_t ofs; + MongoMMF *f = privateViews.find(p, ofs); + assert( f ); + return (((char *)f->_view_write)+ofs); + } + + extern string dbpath; + + // here so that it is precomputed... + void MongoMMF::setPath(string f) { + string suffix; + string prefix; + bool ok = str::rSplitOn(f, '.', prefix, suffix); + uassert(13520, str::stream() << "MongoMMF only supports filenames in a certain format " << f, ok); + if( suffix == "ns" ) + _fileSuffixNo = dur::JEntry::DotNsSuffix; + else + _fileSuffixNo = (int) str::toUnsigned(suffix); + + _p = RelativePath::fromFullPath(prefix); + } + + bool MongoMMF::open(string fname, bool sequentialHint) { + LOG(3) << "mmf open " << fname << endl; + setPath(fname); + _view_write = mapWithOptions(fname.c_str(), sequentialHint ? 
SEQUENTIAL : 0); + return finishOpening(); + } + + bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) { + LOG(3) << "mmf create " << fname << endl; + setPath(fname); + _view_write = map(fname.c_str(), len, sequentialHint ? SEQUENTIAL : 0); + return finishOpening(); + } + + bool MongoMMF::finishOpening() { + LOG(3) << "mmf finishOpening " << (void*) _view_write << ' ' << filename() << " len:" << length() << endl; + if( _view_write ) { + if( cmdLine.dur ) { + _view_private = createPrivateMap(); + if( _view_private == 0 ) { + msgasserted(13636, str::stream() << "file " << filename() << " open/create failed in createPrivateMap (look in log for more information)"); + } + privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then... + } + else { + _view_private = _view_write; + } + return true; + } + return false; + } + + MongoMMF::MongoMMF() : _willNeedRemap(false) { + _view_write = _view_private = 0; + } + + MongoMMF::~MongoMMF() { + try { + close(); + } + catch(...) { error() << "exception in ~MongoMMF" << endl; } + } + + namespace dur { + void closingFileNotification(); + } + + /*virtual*/ void MongoMMF::close() { + LOG(3) << "mmf close " << filename() << endl; + + if( view_write() /*actually was opened*/ ) { + if( cmdLine.dur ) { + dur::closingFileNotification(); + } + if( !d.dbMutex.isWriteLocked() ) { + assert( inShutdown() ); + DEV { + log() << "is it really ok to close a mongommf outside a write lock? dbmutex status:" << d.dbMutex.getState() << " file:" << filename() << endl; + } + } + } + + LockMongoFilesExclusive lk; + privateViews.remove(_view_private); + _view_write = _view_private = 0; + MemoryMappedFile::close(); + } + +} diff --git a/src/mongo/db/mongommf.h b/src/mongo/db/mongommf.h new file mode 100644 index 00000000000..62a6cdfd3fd --- /dev/null +++ b/src/mongo/db/mongommf.h @@ -0,0 +1,145 @@ +/** @file mongommf.h +* +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../util/mmap.h" +#include "../util/paths.h" + +namespace mongo { + + /** MongoMMF adds some layers atop memory mapped files - specifically our handling of private views & such. + if you don't care about journaling/durability (temp sort files & such) use MemoryMappedFile class, + not this. + */ + class MongoMMF : private MemoryMappedFile { + protected: + virtual void* viewForFlushing() { return _view_write; } + + public: + MongoMMF(); + virtual ~MongoMMF(); + virtual void close(); + + /** @return true if opened ok. 
*/ + bool open(string fname, bool sequentialHint /*typically we open with this false*/); + + /** @return file length */ + unsigned long long length() const { return MemoryMappedFile::length(); } + + string filename() const { return MemoryMappedFile::filename(); } + + void flush(bool sync) { MemoryMappedFile::flush(sync); } + + /* Creates with length if DNE, otherwise uses existing file length, + passed length. + @param sequentialHint if true will be sequentially accessed + @return true for ok + */ + bool create(string fname, unsigned long long& len, bool sequentialHint); + + /* Get the "standard" view (which is the private one). + @return the private view. + */ + void* getView() const { return _view_private; } + + /* Get the "write" view (which is required for writing). + @return the write view. + */ + void* view_write() const { return _view_write; } + + + /* switch to _view_write. normally, this is a bad idea since your changes will not + show up in _view_private if there have been changes there; thus the leading underscore + as a tad of a "warning". but useful when done with some care, such as during + initialization. + */ + static void* _switchToWritableView(void *private_ptr); + + /** for a filename a/b/c.3 + filePath() is "a/b/c" + fileSuffixNo() is 3 + if the suffix is "ns", fileSuffixNo -1 + */ + const RelativePath& relativePath() const { + DEV assert( !_p._p.empty() ); + return _p; + } + + int fileSuffixNo() const { return _fileSuffixNo; } + + /** true if we have written. + set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration. + reset to false in REMAPPRIVATEVIEW + */ + bool& willNeedRemap() { return _willNeedRemap; } + + void remapThePrivateView(); + + virtual bool isMongoMMF() { return true; } + + private: + + void *_view_write; + void *_view_private; + bool _willNeedRemap; + RelativePath _p; // e.g. "somepath/dbname" + int _fileSuffixNo; // e.g. 3. -1="ns" + + void setPath(string pathAndFileName); + bool finishOpening(); + }; + + /** for durability support we want to be able to map pointers to specific MongoMMF objects. + */ + class PointerToMMF : boost::noncopyable { + public: + PointerToMMF(); + + /** register view. + threadsafe + */ + void add(void *view, MongoMMF *f); + + /** de-register view. + threadsafe + */ + void remove(void *view); + + /** find associated MMF object for a given pointer. + threadsafe + @param ofs out returns offset into the view of the pointer, if found. + @return the MongoMMF to which this pointer belongs. null if not found. + */ + MongoMMF* find(void *p, /*out*/ size_t& ofs); + + /** for doing many finds in a row with one lock operation */ + mutex& _mutex() { return _m; } + MongoMMF* find_inlock(void *p, /*out*/ size_t& ofs); + + map<void*,MongoMMF*>::iterator finditer_inlock(void *p) { return _views.upper_bound(p); } + + unsigned numberOfViews_inlock() const { return _views.size(); } + + private: + mutex _m; + map<void*, MongoMMF*> _views; + }; + + // allows a pointer into any private view of a MongoMMF to be resolved to the MongoMMF object + extern PointerToMMF privateViews; +} diff --git a/src/mongo/db/mongomutex.h b/src/mongo/db/mongomutex.h new file mode 100644 index 00000000000..08b091cae9c --- /dev/null +++ b/src/mongo/db/mongomutex.h @@ -0,0 +1,388 @@ +// @file mongomutex.h + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Mutex hierarchy (1 = "leaf")
+ name level
+ Logstream::mutex 1
+ ClientCursor::ccmutex 2
+ dblock 3
+
+ End func name with _inlock to indicate "caller must lock before calling".
+*/
+
+#pragma once
+
+#include "../util/concurrency/rwlock.h"
+#include "../util/mmap.h"
+#include "../util/time_support.h"
+#include "d_globals.h"
+
+namespace mongo {
+
+ class Client;
+ Client* curopWaitingForLock( int type );
+ void curopGotLock(Client*);
+
+ /* mongomutex time stats */
+ class MutexInfo {
+ unsigned long long enter, timeLocked; // microseconds
+ int locked;
+ unsigned long long start; // last as we touch this least often
+ public:
+ MutexInfo() : timeLocked(0) , locked(0) {
+ start = curTimeMicros64();
+ }
+ void entered() {
+ if ( locked == 0 )
+ enter = curTimeMicros64();
+ locked++;
+ assert( locked >= 1 );
+ }
+ void leaving() {
+ locked--;
+ assert( locked >= 0 );
+ if ( locked == 0 )
+ timeLocked += curTimeMicros64() - enter;
+ }
+ int isLocked() const { return locked; }
+ void getTimingInfo(unsigned long long &s, unsigned long long &tl) const {
+ s = start;
+ tl = timeLocked;
+ }
+ unsigned long long getTimeLocked() const { return timeLocked; }
+ };
+
+ /** the 'big lock'. a read/write lock.
+ there is one of these, d.dbMutex.
+
+ generally if you need to declare a mutex use the right primitive class, not this.
+
+ use readlock and writelock classes for scoped locks on this rather than direct
+ manipulation.
+ */
+ class MongoMutex {
+ public:
+ MongoMutex(const char * name);
+
+ /** @return
+ * > 0 write lock
+ * = 0 no lock
+ * < 0 read lock
+ */
+ int getState() const { return _state.get(); }
+
+ bool atLeastReadLocked() const { return _state.get() != 0; }
+ void assertAtLeastReadLocked() const { assert(atLeastReadLocked()); }
+ bool isWriteLocked/*by our thread*/() const { return getState() > 0; }
+ void assertWriteLocked() const {
+ assert( getState() > 0 );
+ DEV assert( !_releasedEarly.get() );
+ }
+
+ // write lock. use the writelock scoped lock class, not this directly.
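+ /* aside: a sketch of the convention the methods below implement. _state is
+ per-thread: > 0 is the write lock recursion depth, < 0 the read lock depth,
+ 0 unlocked. the same bookkeeping in miniature, assuming C++17
+ std::shared_mutex instead of this file's RWLock/ThreadLocalValue
+ (names here are illustrative only, not this class's API):
+
+ std::shared_mutex m;
+ thread_local int state = 0;
+
+ void write_lock() {
+ if ( state > 0 ) { state++; return; } // recursive write lock
+ m.lock();
+ state = 1;
+ }
+ void read_lock() {
+ if ( state != 0 ) { state += ( state > 0 ) ? 1 : -1; return; } // nest in current mode
+ m.lock_shared();
+ state = -1;
+ }
+
+ a read lock taken while write locked just deepens the write count, exactly as
+ lock_shared() below does; upgrading read -> write is not supported (see massert 10293).
+ */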
+ void lock() {
+ if ( _writeLockedAlready() )
+ return;
+
+ _state.set(1);
+
+ curopWaitingForLock( 1 ); // stats
+ _m.lock();
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ // try write lock
+ bool lock_try( int millis ) {
+ if ( _writeLockedAlready() ) // adjusts _state
+ return true;
+
+ curopWaitingForLock( 1 );
+ bool got = _m.lock_try( millis );
+
+ if ( got ) {
+ _state.set(1);
+ MongoFile::markAllWritable(); // for _DEBUG validation -- a no op for release build
+ _acquiredWriteLock();
+ }
+
+ return got;
+ }
+
+ // unlock write lock
+ void unlock() {
+ int s = _state.get();
+ if( s > 1 ) {
+ _state.set(s-1); // recursive lock case
+ return;
+ }
+ if( s != 1 ) {
+ if( _releasedEarly.get() ) {
+ _releasedEarly.set(false);
+ return;
+ }
+ massert( 12599, "internal error: attempt to unlock when wasn't in a write lock", false);
+ }
+ _releasingWriteLock();
+ MongoFile::unmarkAllWritable(); // _DEBUG validation
+ _state.set(0);
+ _m.unlock();
+ }
+
+ /* unlock (write lock), and when unlock() is called later,
+ be smart then and don't unlock it again.
+ */
+ void releaseEarly() {
+ assert( getState() == 1 ); // must not be recursive
+ assert( !_releasedEarly.get() );
+ _releasedEarly.set(true);
+ unlock();
+ }
+
+ // read lock. don't call directly, use readlock.
+ void lock_shared() {
+ int s = _state.get();
+ if( s ) {
+ if( s > 0 ) {
+ // already in write lock - just be recursive and stay write locked
+ _state.set(s+1);
+ }
+ else {
+ // already in read lock - recurse
+ _state.set(s-1);
+ }
+ }
+ else {
+ _state.set(-1);
+ Client *c = curopWaitingForLock( -1 );
+ _m.lock_shared();
+ curopGotLock(c);
+ }
+ }
+
+ // try read lock
+ bool lock_shared_try( int millis ) {
+ int s = _state.get();
+ if ( s ) {
+ // we already have a lock, so no need to try
+ lock_shared();
+ return true;
+ }
+
+ /* [dm] should there be
+ Client *c = curopWaitingForLock( 1 );
+ here? i think so. seems to be missing.
+ */
+ bool got = _m.lock_shared_try( millis );
+ if ( got )
+ _state.set(-1);
+ return got;
+ }
+
+ void unlock_shared() {
+ int s = _state.get();
+ if( s > 0 ) {
+ wassert( s > 1 ); /* we must have done a lock write first to have s > 1 */
+ _state.set(s-1);
+ return;
+ }
+ if( s < -1 ) {
+ _state.set(s+1);
+ return;
+ }
+ wassert( s == -1 );
+ _state.set(0);
+ _m.unlock_shared();
+ }
+
+ MutexInfo& info() { return _minfo; }
+
+ private:
+ void lockedExclusively();
+ void unlockingExclusively();
+ void _acquiredWriteLock();
+ void _releasingWriteLock();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ bool _writeLockedAlready();
+
+ RWLock _m;
+
+ /* > 0 write lock with recurse count
+ < 0 read lock
+ */
+ ThreadLocalValue<int> _state;
+
+ MutexInfo _minfo;
+
+ public:
+ // indicates we need to call dur::REMAPPRIVATEVIEW on the next write lock
+ bool _remapPrivateViewRequested;
+
+ private:
+ /* See the releaseEarly() method.
+ we use a separate TLS value for releasedEarly - that is ok as on
+ our normal/common code path we never even touch it */
+ ThreadLocalValue<bool> _releasedEarly;
+
+ /* this is for fsyncAndLock command. otherwise write lock's greediness will
+ make us block on any attempted write lock while the fsync's lock is held.
+ */
+ //volatile bool _blockWrites;
+ };
+
+ namespace dur {
+ void REMAPPRIVATEVIEW();
+ void releasingWriteLock(); // because it's hard to include dur.h here
+ }
+
+ inline void MongoMutex::_releasingWriteLock() {
+ dur::releasingWriteLock();
+ unlockingExclusively();
+ }
+
+ inline void MongoMutex::_acquiredWriteLock() {
+ lockedExclusively();
+ if( _remapPrivateViewRequested ) {
+ dur::REMAPPRIVATEVIEW();
+ dassert( !_remapPrivateViewRequested );
+ }
+ }
+
+ string sayClientState();
+
+ /* @return true if was already write locked. increments recursive lock count. */
+ inline bool MongoMutex::_writeLockedAlready() {
+ int s = _state.get();
+ if( s > 0 ) {
+ _state.set(s+1);
+ return true;
+ }
+ massert( 10293 , string("internal error: locks are not upgradeable: ") + sayClientState() , s == 0 );
+ return false;
+ }
+
+ struct writelock {
+ writelock() { d.dbMutex.lock(); }
+ writelock(const string& ns) { d.dbMutex.lock(); }
+ ~writelock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock();
+ );
+ }
+ };
+
+ struct readlock {
+ readlock(const string& ns) {
+ d.dbMutex.lock_shared();
+ }
+ readlock() { d.dbMutex.lock_shared(); }
+ ~readlock() {
+ DESTRUCTOR_GUARD(
+ d.dbMutex.unlock_shared();
+ );
+ }
+ };
+ struct readlocktry {
+ readlocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_shared_try( tryms );
+ }
+ ~readlocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock_shared();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct writelocktry {
+ writelocktry( const string&ns , int tryms ) {
+ _got = d.dbMutex.lock_try( tryms );
+ }
+ ~writelocktry() {
+ if ( _got ) {
+ d.dbMutex.unlock();
+ }
+ }
+ bool got() const { return _got; }
+ private:
+ bool _got;
+ };
+
+ struct readlocktryassert : public readlocktry {
+ readlocktryassert(const string& ns, int tryms) :
+ readlocktry(ns,tryms) {
+ uassert(13142, "timeout getting readlock", got());
+ }
+ };
+
+ /** assure we have at least a read lock - the key with this being
+ if you have a write lock, that's ok too.
+ */
+ struct atleastreadlock {
+ atleastreadlock( const string& ns = "" ) {
+ _prev = d.dbMutex.getState();
+ if ( _prev == 0 )
+ d.dbMutex.lock_shared();
+ }
+ ~atleastreadlock() {
+ if ( _prev == 0 )
+ d.dbMutex.unlock_shared();
+ }
+ private:
+ int _prev;
+ };
+
+ /* parameterized choice of read or write locking
+ use readlock and writelock instead of this when statically known which you want
+ */
+ class mongolock {
+ bool _writelock;
+ public:
+ mongolock(bool write) : _writelock(write) {
+ if( _writelock ) {
+ d.dbMutex.lock();
+ }
+ else
+ d.dbMutex.lock_shared();
+ }
+ ~mongolock() {
+ DESTRUCTOR_GUARD(
+ if( _writelock ) {
+ d.dbMutex.unlock();
+ }
+ else {
+ d.dbMutex.unlock_shared();
+ }
+ );
+ }
+ /* this unlocks, does NOT upgrade. that works for our current usage */
+ //void releaseAndWriteLock();
+ };
+
+ /* deprecated - use writelock and readlock instead */
+ struct dblock : public writelock {
+ dblock() : writelock("") { }
+ };
+
+ // eliminate this - we should just type "d.dbMutex.assertWriteLocked();" instead
+ inline void assertInWriteLock() { d.dbMutex.assertWriteLocked(); }
+
+}
diff --git a/src/mongo/db/namespace-inl.h b/src/mongo/db/namespace-inl.h
new file mode 100644
index 00000000000..a621a229546
--- /dev/null
+++ b/src/mongo/db/namespace-inl.h
@@ -0,0 +1,132 @@
+// @file namespace-inl.h
+
+/**
+* Copyright (C) 2009 10gen Inc.
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "namespace.h" + +namespace mongo { + + inline Namespace& Namespace::operator=(const char *ns) { + // we fill the remaining space with all zeroes here. as the full Namespace struct is in + // the datafiles (the .ns files specifically), that is helpful as then they are deterministic + // in the bytes they have for a given sequence of operations. that makes testing and debugging + // the data files easier. + // + // if profiling indicates this method is a significant bottleneck, we could have a version we + // use for reads which does not fill with zeroes, and keep the zeroing behavior on writes. + // + unsigned len = strlen(ns); + uassert( 10080 , "ns name too long, max size is 128", len < MaxNsLen); + memset(buf, 0, MaxNsLen); + memcpy(buf, ns, len); + return *this; + } + + inline string Namespace::extraName(int i) const { + char ex[] = "$extra"; + ex[5] += i; + string s = string(buf) + ex; + massert( 10348 , "$extra: ns name too long", s.size() < MaxNsLen); + return s; + } + + inline bool Namespace::isExtra() const { + const char *p = strstr(buf, "$extr"); + return p && p[5] && p[6] == 0; //==0 important in case an index uses name "$extra_1" for example + } + + inline int Namespace::hash() const { + unsigned x = 0; + const char *p = buf; + while ( *p ) { + x = x * 131 + *p; + p++; + } + return (x & 0x7fffffff) | 0x8000000; // must be > 0 + } + + /* future : this doesn't need to be an inline. */ + inline string Namespace::getSisterNS( const char * local ) const { + assert( local && local[0] != '.' ); + string old(buf); + if ( old.find( "." ) != string::npos ) + old = old.substr( 0 , old.find( "." ) ); + return old + "." + local; + } + + inline IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected ) { + if( idxNo < NIndexesBase ) { + IndexDetails& id = _indexes[idxNo]; + return id; + } + Extra *e = extra(); + if ( ! e ) { + if ( missingExpected ) + throw MsgAssertionException( 13283 , "Missing Extra" ); + massert(14045, "missing Extra", e); + } + int i = idxNo - NIndexesBase; + if( i >= NIndexesExtra ) { + e = e->next(this); + if ( ! 
e ) { + if ( missingExpected ) + throw MsgAssertionException( 14823 , "missing extra" ); + massert(14824, "missing Extra", e); + } + i -= NIndexesExtra; + } + return e->details[i]; + } + + inline int NamespaceDetails::idxNo(IndexDetails& idx) { + IndexIterator i = ii(); + while( i.more() ) { + if( &i.next() == &idx ) + return i.pos()-1; + } + massert( 10349 , "E12000 idxNo fails", false); + return -1; + } + + inline int NamespaceDetails::findIndexByKeyPattern(const BSONObj& keyPattern) { + IndexIterator i = ii(); + while( i.more() ) { + if( i.next().keyPattern() == keyPattern ) + return i.pos()-1; + } + return -1; + } + + // @return offset in indexes[] + inline int NamespaceDetails::findIndexByName(const char *name) { + IndexIterator i = ii(); + while( i.more() ) { + if ( strcmp(i.next().info.obj().getStringField("name"),name) == 0 ) + return i.pos()-1; + } + return -1; + } + + inline NamespaceDetails::IndexIterator::IndexIterator(NamespaceDetails *_d) { + d = _d; + i = 0; + n = d->nIndexes; + } + +} diff --git a/src/mongo/db/namespace.cpp b/src/mongo/db/namespace.cpp new file mode 100644 index 00000000000..af8b5694248 --- /dev/null +++ b/src/mongo/db/namespace.cpp @@ -0,0 +1,800 @@ +// namespace.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "pdfile.h" +#include "db.h" +#include "mongommf.h" +#include "../util/hashtab.h" +#include "../scripting/engine.h" +#include "btree.h" +#include <algorithm> +#include <list> +#include "queryutil.h" +#include "json.h" +#include "ops/delete.h" +#include "ops/query.h" + +namespace mongo { + + BOOST_STATIC_ASSERT( sizeof(Namespace) == 128 ); + + BSONObj idKeyPattern = fromjson("{\"_id\":1}"); + + /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes + so you can look for a deleterecord about the right size. + */ + int bucketSizes[] = { + 32, 64, 128, 256, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, + 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, + 0x400000, 0x800000 + }; + + NamespaceDetails::NamespaceDetails( const DiskLoc &loc, bool _capped ) { + /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */ + firstExtent = lastExtent = capExtent = loc; + stats.datasize = stats.nrecords = 0; + lastExtentSize = 0; + nIndexes = 0; + capped = _capped; + max = 0x7fffffff; + paddingFactor = 1.0; + flags = 0; + capFirstNewRecord = DiskLoc(); + // Signal that we are on first allocation iteration through extents. + capFirstNewRecord.setInvalid(); + // For capped case, signal that we are doing initial extent allocation. 
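+ /* aside: DiskLoc serves as a tri-state in this constructor -- null (unset),
+ explicitly invalid (a sentinel meaning "not yet computed"), or a real location.
+ a minimal sketch of that convention, assuming the usual file-number/offset pair
+ (illustrative field names, not DiskLoc's actual definition):
+
+ struct Loc {
+ int a, ofs; // data file number, offset within the file
+ void Null() { a = -1; ofs = 0; }
+ bool isNull() const { return a == -1; }
+ void setInvalid() { a = -2; ofs = 0; } // distinct from null and from any real file
+ bool isValid() const { return a != -2; }
+ };
+
+ so capFirstNewRecord.setInvalid() above means "first allocation pass, value not
+ yet computed", which later code can tell apart from a plain null DiskLoc.
+ */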
+ if ( capped ) + cappedLastDelRecLastExtent().setInvalid(); + assert( sizeof(dataFileVersion) == 2 ); + dataFileVersion = 0; + indexFileVersion = 0; + multiKeyIndexBits = 0; + reservedA = 0; + extraOffset = 0; + indexBuildInProgress = 0; + reservedB = 0; + capped2.cc2_ptr = 0; + capped2.fileNumber = 0; + memset(reserved, 0, sizeof(reserved)); + } + + bool NamespaceIndex::exists() const { + return !MMF::exists(path()); + } + + boost::filesystem::path NamespaceIndex::path() const { + boost::filesystem::path ret( dir_ ); + if ( directoryperdb ) + ret /= database_; + ret /= ( database_ + ".ns" ); + return ret; + } + + void NamespaceIndex::maybeMkdir() const { + if ( !directoryperdb ) + return; + boost::filesystem::path dir( dir_ ); + dir /= database_; + if ( !boost::filesystem::exists( dir ) ) + MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::create_directory( dir ), "create dir for db " ); + } + + unsigned lenForNewNsFiles = 16 * 1024 * 1024; + +#if defined(_DEBUG) + void NamespaceDetails::dump(const Namespace& k) { + if( !cmdLine.dur ) + cout << "ns offsets which follow will not display correctly with --journal disabled" << endl; + + size_t ofs = 1; // 1 is sentinel that the find call below failed + privateViews.find(this, /*out*/ofs); + + cout << "ns" << hex << setw(8) << ofs << ' '; + cout << k.toString() << '\n'; + + if( k.isExtra() ) { + cout << "ns\t extra" << endl; + return; + } + + cout << "ns " << firstExtent.toString() << ' ' << lastExtent.toString() << " nidx:" << nIndexes << '\n'; + cout << "ns " << stats.datasize << ' ' << stats.nrecords << ' ' << nIndexes << '\n'; + cout << "ns " << capped << ' ' << paddingFactor << ' ' << flags << ' ' << dataFileVersion << '\n'; + cout << "ns " << multiKeyIndexBits << ' ' << indexBuildInProgress << '\n'; + cout << "ns " << (int) reserved[0] << ' ' << (int) reserved[59]; + cout << endl; + } +#endif + + void NamespaceDetails::onLoad(const Namespace& k) { + + if( k.isExtra() ) { + /* overflow storage for indexes - so don't treat as a NamespaceDetails object. */ + return; + } + + if( indexBuildInProgress || capped2.cc2_ptr ) { + assertInWriteLock(); + if( indexBuildInProgress ) { + log() << "indexBuildInProgress was " << indexBuildInProgress << " for " << k << ", indicating an abnormal db shutdown" << endl; + getDur().writingInt( indexBuildInProgress ) = 0; + } + if( capped2.cc2_ptr ) + *getDur().writing(&capped2.cc2_ptr) = 0; + } + } + + static void namespaceOnLoadCallback(const Namespace& k, NamespaceDetails& v) { + v.onLoad(k); + } + + bool checkNsFilesOnLoad = true; + + NOINLINE_DECL void NamespaceIndex::_init() { + assert( !ht ); + + d.dbMutex.assertWriteLocked(); + + /* if someone manually deleted the datafiles for a database, + we need to be sure to clear any cached info for the database in + local.*. 
+ */ + /* + if ( "local" != database_ ) { + DBInfo i(database_.c_str()); + i.dbDropped(); + } + */ + + unsigned long long len = 0; + boost::filesystem::path nsPath = path(); + string pathString = nsPath.string(); + void *p = 0; + if( MMF::exists(nsPath) ) { + if( f.open(pathString, true) ) { + len = f.length(); + if ( len % (1024*1024) != 0 ) { + log() << "bad .ns file: " << pathString << endl; + uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 ); + } + p = f.getView(); + } + } + else { + // use lenForNewNsFiles, we are making a new database + massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 ); + maybeMkdir(); + unsigned long long l = lenForNewNsFiles; + if( f.create(pathString, l, true) ) { + getDur().createdFile(pathString, l); // always a new file + len = l; + assert( len == lenForNewNsFiles ); + p = f.getView(); + } + } + + if ( p == 0 ) { + /** TODO: this shouldn't terminate? */ + log() << "error couldn't open file " << pathString << " terminating" << endl; + dbexit( EXIT_FS ); + } + + + assert( len <= 0x7fffffff ); + ht = new HashTable<Namespace,NamespaceDetails>(p, (int) len, "namespace index"); + if( checkNsFilesOnLoad ) + ht->iterAll(namespaceOnLoadCallback); + } + + static void namespaceGetNamespacesCallback( const Namespace& k , NamespaceDetails& v , void * extra ) { + list<string> * l = (list<string>*)extra; + if ( ! k.hasDollarSign() ) + l->push_back( (string)k ); + } + void NamespaceIndex::getNamespaces( list<string>& tofill , bool onlyCollections ) const { + assert( onlyCollections ); // TODO: need to implement this + // need boost::bind or something to make this less ugly + + if ( ht ) + ht->iterAll( namespaceGetNamespacesCallback , (void*)&tofill ); + } + + void NamespaceDetails::addDeletedRec(DeletedRecord *d, DiskLoc dloc) { + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails) ); + + { + Record *r = (Record *) getDur().writingPtr(d, sizeof(Record)); + d = &r->asDeleted(); + // defensive code: try to make us notice if we reference a deleted record + (unsigned&) (r->data) = 0xeeeeeeee; + } + DEBUGGING log() << "TEMP: add deleted rec " << dloc.toString() << ' ' << hex << d->extentOfs << endl; + if ( capped ) { + if ( !cappedLastDelRecLastExtent().isValid() ) { + // Initial extent allocation. Insert at end. + d->nextDeleted = DiskLoc(); + if ( cappedListOfAllDeletedRecords().isNull() ) + getDur().writingDiskLoc( cappedListOfAllDeletedRecords() ) = dloc; + else { + DiskLoc i = cappedListOfAllDeletedRecords(); + for (; !i.drec()->nextDeleted.isNull(); i = i.drec()->nextDeleted ) + ; + i.drec()->nextDeleted.writing() = dloc; + } + } + else { + d->nextDeleted = cappedFirstDeletedInCurExtent(); + getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = dloc; + // always compact() after this so order doesn't matter + } + } + else { + int b = bucket(d->lengthWithHeaders); + DiskLoc& list = deletedList[b]; + DiskLoc oldHead = list; + getDur().writingDiskLoc(list) = dloc; + d->nextDeleted = oldHead; + } + } + + /* predetermine location of the next alloc without actually doing it. + if cannot predetermine returns null (so still call alloc() then) + */ + DiskLoc NamespaceDetails::allocWillBeAt(const char *ns, int lenToAlloc) { + if ( !capped ) { + lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; + return __stdAlloc(lenToAlloc, true); + } + return DiskLoc(); + } + + /** allocate space for a new record from deleted lists. 
+ @param lenToAlloc is WITH header + @param extentLoc OUT returns the extent location + @return null diskloc if no room - allocate a new extent then + */ + DiskLoc NamespaceDetails::alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc) { + { + // align very slightly. + // note that if doing more coarse-grained quantization (really just if it isn't always + // a constant amount but if it varied by record size) then that quantization should + // NOT be done here but rather in __stdAlloc so that we can grab a deletedrecord that + // is just big enough if we happen to run into one. + lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; + } + + DiskLoc loc = _alloc(ns, lenToAlloc); + if ( loc.isNull() ) + return loc; + + const DeletedRecord *r = loc.drec(); + //r = getDur().writing(r); + + /* note we want to grab from the front so our next pointers on disk tend + to go in a forward direction which is important for performance. */ + int regionlen = r->lengthWithHeaders; + extentLoc.set(loc.a(), r->extentOfs); + assert( r->extentOfs < loc.getOfs() ); + + DEBUGGING out() << "TEMP: alloc() returns " << loc.toString() << ' ' << ns << " lentoalloc:" << lenToAlloc << " ext:" << extentLoc.toString() << endl; + + int left = regionlen - lenToAlloc; + if ( capped == 0 ) { + if ( left < 24 || left < (lenToAlloc >> 3) ) { + // you get the whole thing. + return loc; + } + } + + /* split off some for further use. */ + getDur().writingInt(r->lengthWithHeaders) = lenToAlloc; + DiskLoc newDelLoc = loc; + newDelLoc.inc(lenToAlloc); + DeletedRecord *newDel = DataFileMgr::makeDeletedRecord(newDelLoc, left); + DeletedRecord *newDelW = getDur().writing(newDel); + newDelW->extentOfs = r->extentOfs; + newDelW->lengthWithHeaders = left; + newDelW->nextDeleted.Null(); + + addDeletedRec(newDel, newDelLoc); + + return loc; + } + + /* for non-capped collections. + @param peekOnly just look up where and don't reserve + returned item is out of the deleted list upon return + */ + DiskLoc NamespaceDetails::__stdAlloc(int len, bool peekOnly) { + DiskLoc *prev; + DiskLoc *bestprev = 0; + DiskLoc bestmatch; + int bestmatchlen = 0x7fffffff; + int b = bucket(len); + DiskLoc cur = deletedList[b]; + prev = &deletedList[b]; + int extra = 5; // look for a better fit, a little. + int chain = 0; + while ( 1 ) { + { + int a = cur.a(); + if ( a < -1 || a >= 100000 ) { + problem() << "~~ Assertion - cur out of range in _alloc() " << cur.toString() << + " a:" << a << " b:" << b << " chain:" << chain << '\n'; + sayDbContext(); + if ( cur == *prev ) + prev->Null(); + cur.Null(); + } + } + if ( cur.isNull() ) { + // move to next bucket. if we were doing "extra", just break + if ( bestmatchlen < 0x7fffffff ) + break; + b++; + if ( b > MaxBucket ) { + // out of space. alloc a new extent. 
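+ /* aside: bucket(len) maps a record length to the smallest deleted-record bucket
+ that can hold it, over the bucketSizes[] table defined earlier in this file. a
+ sketch of the lookup (the real implementation lives in namespace.h; this version
+ is only assumed to match its behavior):
+
+ int bucket(int len) {
+ for ( int i = 0; i < Buckets; i++ )
+ if ( bucketSizes[i] > len )
+ return i; // first bucket whose cap exceeds len
+ return MaxBucket; // oversized records all share the last bucket
+ }
+
+ hence the scan above retries bucket b+1 when b is exhausted, and past MaxBucket
+ there is nowhere left to look:
+ */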
+ return DiskLoc(); + } + cur = deletedList[b]; + prev = &deletedList[b]; + continue; + } + DeletedRecord *r = cur.drec(); + if ( r->lengthWithHeaders >= len && + r->lengthWithHeaders < bestmatchlen ) { + bestmatchlen = r->lengthWithHeaders; + bestmatch = cur; + bestprev = prev; + } + if ( bestmatchlen < 0x7fffffff && --extra <= 0 ) + break; + if ( ++chain > 30 && b < MaxBucket ) { + // too slow, force move to next bucket to grab a big chunk + //b++; + chain = 0; + cur.Null(); + } + else { + /*this defensive check only made sense for the mmap storage engine: + if ( r->nextDeleted.getOfs() == 0 ) { + problem() << "~~ Assertion - bad nextDeleted " << r->nextDeleted.toString() << + " b:" << b << " chain:" << chain << ", fixing.\n"; + r->nextDeleted.Null(); + }*/ + cur = r->nextDeleted; + prev = &r->nextDeleted; + } + } + + /* unlink ourself from the deleted list */ + if( !peekOnly ) { + const DeletedRecord *bmr = bestmatch.drec(); + *getDur().writing(bestprev) = bmr->nextDeleted; + bmr->nextDeleted.writing().setInvalid(); // defensive. + assert(bmr->extentOfs < bestmatch.getOfs()); + } + + return bestmatch; + } + + void NamespaceDetails::dumpDeleted(set<DiskLoc> *extents) { + for ( int i = 0; i < Buckets; i++ ) { + DiskLoc dl = deletedList[i]; + while ( !dl.isNull() ) { + DeletedRecord *r = dl.drec(); + DiskLoc extLoc(dl.a(), r->extentOfs); + if ( extents == 0 || extents->count(extLoc) <= 0 ) { + out() << " bucket " << i << endl; + out() << " " << dl.toString() << " ext:" << extLoc.toString(); + if ( extents && extents->count(extLoc) <= 0 ) + out() << '?'; + out() << " len:" << r->lengthWithHeaders << endl; + } + dl = r->nextDeleted; + } + } + } + + DiskLoc NamespaceDetails::firstRecord( const DiskLoc &startExtent ) const { + for (DiskLoc i = startExtent.isNull() ? firstExtent : startExtent; + !i.isNull(); i = i.ext()->xnext ) { + if ( !i.ext()->firstRecord.isNull() ) + return i.ext()->firstRecord; + } + return DiskLoc(); + } + + DiskLoc NamespaceDetails::lastRecord( const DiskLoc &startExtent ) const { + for (DiskLoc i = startExtent.isNull() ? lastExtent : startExtent; + !i.isNull(); i = i.ext()->xprev ) { + if ( !i.ext()->lastRecord.isNull() ) + return i.ext()->lastRecord; + } + return DiskLoc(); + } + + int n_complaints_cap = 0; + void NamespaceDetails::maybeComplain( const char *ns, int len ) const { + if ( ++n_complaints_cap < 8 ) { + out() << "couldn't make room for new record (len: " << len << ") in capped ns " << ns << '\n'; + int i = 0; + for ( DiskLoc e = firstExtent; !e.isNull(); e = e.ext()->xnext, ++i ) { + out() << " Extent " << i; + if ( e == capExtent ) + out() << " (capExtent)"; + out() << '\n'; + out() << " magic: " << hex << e.ext()->magic << dec << " extent->ns: " << e.ext()->nsDiagnostic.toString() << '\n'; + out() << " fr: " << e.ext()->firstRecord.toString() << + " lr: " << e.ext()->lastRecord.toString() << " extent->len: " << e.ext()->length << '\n'; + } + assert( len * 5 > lastExtentSize ); // assume it is unusually large record; if not, something is broken + } + } + + /* alloc with capped table handling. 
*/ + DiskLoc NamespaceDetails::_alloc(const char *ns, int len) { + if ( !capped ) + return __stdAlloc(len, false); + + return cappedAlloc(ns,len); + } + + void NamespaceIndex::kill_ns(const char *ns) { + d.dbMutex.assertWriteLocked(); + if ( !ht ) + return; + Namespace n(ns); + ht->kill(n); + + for( int i = 0; i<=1; i++ ) { + try { + Namespace extra(n.extraName(i).c_str()); + ht->kill(extra); + } + catch(DBException&) { + dlog(3) << "caught exception in kill_ns" << endl; + } + } + } + + void NamespaceIndex::add_ns(const char *ns, DiskLoc& loc, bool capped) { + NamespaceDetails details( loc, capped ); + add_ns( ns, details ); + } + void NamespaceIndex::add_ns( const char *ns, const NamespaceDetails &details ) { + d.dbMutex.assertWriteLocked(); + init(); + Namespace n(ns); + uassert( 10081 , "too many namespaces/collections", ht->put(n, details)); + } + + /* extra space for indexes when more than 10 */ + NamespaceDetails::Extra* NamespaceIndex::newExtra(const char *ns, int i, NamespaceDetails *d) { + mongo::d.dbMutex.assertWriteLocked(); + assert( i >= 0 && i <= 1 ); + Namespace n(ns); + Namespace extra(n.extraName(i).c_str()); // throws userexception if ns name too long + + massert( 10350 , "allocExtra: base ns missing?", d ); + massert( 10351 , "allocExtra: extra already exists", ht->get(extra) == 0 ); + + NamespaceDetails::Extra temp; + temp.init(); + uassert( 10082 , "allocExtra: too many namespaces/collections", ht->put(extra, (NamespaceDetails&) temp)); + NamespaceDetails::Extra *e = (NamespaceDetails::Extra *) ht->get(extra); + return e; + } + NamespaceDetails::Extra* NamespaceDetails::allocExtra(const char *ns, int nindexessofar) { + NamespaceIndex *ni = nsindex(ns); + int i = (nindexessofar - NIndexesBase) / NIndexesExtra; + Extra *e = ni->newExtra(ns, i, this); + long ofs = e->ofsFrom(this); + if( i == 0 ) { + assert( extraOffset == 0 ); + *getDur().writing(&extraOffset) = ofs; + assert( extra() == e ); + } + else { + Extra *hd = extra(); + assert( hd->next(this) == 0 ); + hd->setNext(ofs); + } + return e; + } + + /* you MUST call when adding an index. see pdfile.cpp */ + IndexDetails& NamespaceDetails::addIndex(const char *thisns, bool resetTransient) { + IndexDetails *id; + try { + id = &idx(nIndexes,true); + } + catch(DBException&) { + allocExtra(thisns, nIndexes); + id = &idx(nIndexes,false); + } + + (*getDur().writing(&nIndexes))++; + if ( resetTransient ) + NamespaceDetailsTransient::get(thisns).addedIndex(); + return *id; + } + + // must be called when renaming a NS to fix up extra + void NamespaceDetails::copyingFrom(const char *thisns, NamespaceDetails *src) { + extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below. + Extra *se = src->extra(); + int n = NIndexesBase; + if( se ) { + Extra *e = allocExtra(thisns, n); + while( 1 ) { + n += NIndexesExtra; + e->copy(this, *se); + se = se->next(src); + if( se == 0 ) break; + Extra *nxt = allocExtra(thisns, n); + e->setNext( nxt->ofsFrom(this) ); + e = nxt; + } + assert( extraOffset ); + } + } + + /* returns index of the first index in which the field is present. -1 if not present. 
+ (aug08 - this method not currently used) + */ + int NamespaceDetails::fieldIsIndexed(const char *fieldName) { + massert( 10346 , "not implemented", false); + /* + for ( int i = 0; i < nIndexes; i++ ) { + IndexDetails& idx = indexes[i]; + BSONObj idxKey = idx.info.obj().getObjectField("key"); // e.g., { ts : -1 } + if ( !idxKey.getField(fieldName).eoo() ) + return i; + }*/ + return -1; + } + + long long NamespaceDetails::storageSize( int * numExtents , BSONArrayBuilder * extentInfo ) const { + Extent * e = firstExtent.ext(); + assert( e ); + + long long total = 0; + int n = 0; + while ( e ) { + total += e->length; + n++; + + if ( extentInfo ) { + extentInfo->append( BSON( "len" << e->length << "loc: " << e->myLoc.toBSONObj() ) ); + } + + e = e->getNextExtent(); + } + + if ( numExtents ) + *numExtents = n; + + return total; + } + + NamespaceDetails *NamespaceDetails::writingWithExtra() { + vector< pair< long long, unsigned > > writeRanges; + writeRanges.push_back( make_pair( 0, sizeof( NamespaceDetails ) ) ); + for( Extra *e = extra(); e; e = e->next( this ) ) { + writeRanges.push_back( make_pair( (char*)e - (char*)this, sizeof( Extra ) ) ); + } + return reinterpret_cast< NamespaceDetails* >( getDur().writingRangesAtOffsets( this, writeRanges ) ); + } + + /* ------------------------------------------------------------------------- */ + + SimpleMutex NamespaceDetailsTransient::_qcMutex("qc"); + SimpleMutex NamespaceDetailsTransient::_isMutex("is"); + map< string, shared_ptr< NamespaceDetailsTransient > > NamespaceDetailsTransient::_nsdMap; + typedef map< string, shared_ptr< NamespaceDetailsTransient > >::iterator ouriter; + + void NamespaceDetailsTransient::reset() { + DEV assertInWriteLock(); + clearQueryCache(); + _keysComputed = false; + _indexSpecs.clear(); + } + + /*static*/ NOINLINE_DECL NamespaceDetailsTransient& NamespaceDetailsTransient::make_inlock(const char *ns) { + shared_ptr< NamespaceDetailsTransient > &t = _nsdMap[ ns ]; + assert( t.get() == 0 ); + Database *database = cc().database(); + assert( database ); + if( _nsdMap.size() % 20000 == 10000 ) { + // so we notice if insanely large #s + log() << "opening namespace " << ns << endl; + log() << _nsdMap.size() << " namespaces in nsdMap" << endl; + } + t.reset( new NamespaceDetailsTransient(database, ns) ); + return *t; + } + + // note with repair there could be two databases with the same ns name. + // that is NOT handled here yet! TODO + // repair may not use nsdt though not sure. anyway, requires work. 
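+ /* aside: make_inlock above leans on map::operator[] default-constructing the
+ shared_ptr slot, so together with the caller's lookup, get-or-create costs a
+ single map access per path. the idiom in miniature (illustrative types; the
+ caller must already hold the guarding mutex):
+
+ map< string, shared_ptr<Widget> > cache;
+ Widget& getOrCreate( const string& key ) {
+ shared_ptr<Widget>& slot = cache[key]; // inserts an empty slot on a miss
+ if ( !slot )
+ slot.reset( new Widget(key) );
+ return *slot;
+ }
+ */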
+ NamespaceDetailsTransient::NamespaceDetailsTransient(Database *db, const char *ns) : + _ns(ns), _keysComputed(false), _qcWriteCount() + { + dassert(db); + } + + NamespaceDetailsTransient::~NamespaceDetailsTransient() { + } + + void NamespaceDetailsTransient::clearForPrefix(const char *prefix) { + assertInWriteLock(); + vector< string > found; + for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i ) + if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 ) + found.push_back( i->first ); + for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) { + _nsdMap[ *i ].reset(); + } + } + + void NamespaceDetailsTransient::eraseForPrefix(const char *prefix) { + assertInWriteLock(); + vector< string > found; + for( ouriter i = _nsdMap.begin(); i != _nsdMap.end(); ++i ) + if ( strncmp( i->first.c_str(), prefix, strlen( prefix ) ) == 0 ) + found.push_back( i->first ); + for( vector< string >::iterator i = found.begin(); i != found.end(); ++i ) { + _nsdMap.erase(*i); + } + } + + void NamespaceDetailsTransient::computeIndexKeys() { + _keysComputed = true; + _indexKeys.clear(); + NamespaceDetails *d = nsdetails(_ns.c_str()); + if ( ! d ) + return; + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) + i.next().keyPattern().getFieldNames(_indexKeys); + } + + + /* ------------------------------------------------------------------------- */ + + /* add a new namespace to the system catalog (<dbname>.system.namespaces). + options: { capped : ..., size : ... } + */ + void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0) { + LOG(1) << "New namespace: " << ns << endl; + if ( strstr(ns, "system.namespaces") ) { + // system.namespaces holds all the others, so it is not explicitly listed in the catalog. + // TODO: fix above should not be strstr! + return; + } + + { + BSONObjBuilder b; + b.append("name", ns); + if ( options ) + b.append("options", *options); + BSONObj j = b.done(); + char database[256]; + nsToDatabase(ns, database); + string s = database; + if( cmdLine.configsvr && (s != "config" && s != "admin") ) { + uasserted(14037, "can't create user databases on a --configsvr instance"); + } + s += ".system.namespaces"; + theDataFileMgr.insert(s.c_str(), j.objdata(), j.objsize(), true); + } + } + + void renameNamespace( const char *from, const char *to ) { + NamespaceIndex *ni = nsindex( from ); + assert( ni ); + assert( ni->details( from ) ); + assert( ! ni->details( to ) ); + + // Our namespace and index details will move to a different + // memory location. The only references to namespace and + // index details across commands are in cursors and nsd + // transient (including query cache) so clear these. 
+ ClientCursor::invalidate( from ); + NamespaceDetailsTransient::eraseForPrefix( from ); + + NamespaceDetails *details = ni->details( from ); + ni->add_ns( to, *details ); + NamespaceDetails *todetails = ni->details( to ); + try { + todetails->copyingFrom(to, details); // fixes extraOffset + } + catch( DBException& ) { + // could end up here if .ns is full - if so try to clean up / roll back a little + ni->kill_ns(to); + throw; + } + ni->kill_ns( from ); + details = todetails; + + BSONObj oldSpec; + char database[MaxDatabaseNameLen]; + nsToDatabase(from, database); + string s = database; + s += ".system.namespaces"; + assert( Helpers::findOne( s.c_str(), BSON( "name" << from ), oldSpec ) ); + + BSONObjBuilder newSpecB; + BSONObjIterator i( oldSpec.getObjectField( "options" ) ); + while( i.more() ) { + BSONElement e = i.next(); + if ( strcmp( e.fieldName(), "create" ) != 0 ) + newSpecB.append( e ); + else + newSpecB << "create" << to; + } + BSONObj newSpec = newSpecB.done(); + addNewNamespaceToCatalog( to, newSpec.isEmpty() ? 0 : &newSpec ); + + deleteObjects( s.c_str(), BSON( "name" << from ), false, false, true ); + // oldSpec variable no longer valid memory + + BSONObj oldIndexSpec; + s = database; + s += ".system.indexes"; + while( Helpers::findOne( s.c_str(), BSON( "ns" << from ), oldIndexSpec ) ) { + BSONObjBuilder newIndexSpecB; + BSONObjIterator i( oldIndexSpec ); + while( i.more() ) { + BSONElement e = i.next(); + if ( strcmp( e.fieldName(), "ns" ) != 0 ) + newIndexSpecB.append( e ); + else + newIndexSpecB << "ns" << to; + } + BSONObj newIndexSpec = newIndexSpecB.done(); + DiskLoc newIndexSpecLoc = theDataFileMgr.insert( s.c_str(), newIndexSpec.objdata(), newIndexSpec.objsize(), true, false ); + int indexI = details->findIndexByName( oldIndexSpec.getStringField( "name" ) ); + IndexDetails &indexDetails = details->idx(indexI); + string oldIndexNs = indexDetails.indexNamespace(); + indexDetails.info = newIndexSpecLoc; + string newIndexNs = indexDetails.indexNamespace(); + + renameIndexNamespace( oldIndexNs.c_str(), newIndexNs.c_str() ); + deleteObjects( s.c_str(), oldIndexSpec.getOwned(), true, false, true ); + } + } + + bool legalClientSystemNS( const string& ns , bool write ) { + if( ns == "local.system.replset" ) return true; + + if ( ns.find( ".system.users" ) != string::npos ) + return true; + + if ( ns.find( ".system.js" ) != string::npos ) { + if ( write ) + Scope::storedFuncMod(); + return true; + } + + return false; + } + +} // namespace mongo diff --git a/src/mongo/db/namespace.h b/src/mongo/db/namespace.h new file mode 100644 index 00000000000..9ceb6a6f4e9 --- /dev/null +++ b/src/mongo/db/namespace.h @@ -0,0 +1,629 @@ +// namespace.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include "../pch.h" +#include "namespacestring.h" +#include "jsobj.h" +#include "querypattern.h" +#include "diskloc.h" +#include "../util/hashtab.h" +#include "mongommf.h" +#include "d_concurrency.h" + +namespace mongo { + + class Database; + +#pragma pack(1) + /* This helper class is used to make the HashMap below in NamespaceIndex e.g. see line: + HashTable<Namespace,NamespaceDetails> *ht; + */ + class Namespace { + public: + explicit Namespace(const char *ns) { *this = ns; } + Namespace& operator=(const char *ns); + + bool hasDollarSign() const { return strchr( buf , '$' ) > 0; } + void kill() { buf[0] = 0x7f; } + bool operator==(const char *r) const { return strcmp(buf, r) == 0; } + bool operator==(const Namespace& r) const { return strcmp(buf, r.buf) == 0; } + int hash() const; // value returned is always > 0 + + size_t size() const { return strlen( buf ); } + + string toString() const { return (string) buf; } + operator string() const { return (string) buf; } + + /* NamespaceDetails::Extra was added after fact to allow chaining of data blocks to support more than 10 indexes + (more than 10 IndexDetails). It's a bit hacky because of this late addition with backward + file support. */ + string extraName(int i) const; + bool isExtra() const; /* ends with $extr... -- when true an extra block not a normal NamespaceDetails block */ + + /** ( foo.bar ).getSisterNS( "blah" ) == foo.blah + perhaps this should move to the NamespaceString helper? + */ + string getSisterNS( const char * local ) const; + + enum MaxNsLenValue { MaxNsLen = 128 }; + private: + char buf[MaxNsLen]; + }; +#pragma pack() + +} // namespace mongo + +#include "index.h" + +namespace mongo { + + /** @return true if a client can modify this namespace even though it is under ".system." + For example <dbname>.system.users is ok for regular clients to update. + @param write used when .system.js + */ + bool legalClientSystemNS( const string& ns , bool write ); + + /* deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various sizes + so you can look for a deleterecord about the right size. + */ + const int Buckets = 19; + const int MaxBucket = 18; + + extern int bucketSizes[]; + +#pragma pack(1) + /* NamespaceDetails : this is the "header" for a collection that has all its details. + It's in the .ns file and this is a memory mapped region (thus the pack pragma above). + */ + class NamespaceDetails { + public: + enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 }; + + /*-------- data fields, as present on disk : */ + DiskLoc firstExtent; + DiskLoc lastExtent; + /* NOTE: capped collections v1 override the meaning of deletedList. + deletedList[0] points to a list of free records (DeletedRecord's) for all extents in + the capped namespace. + deletedList[1] points to the last record in the prev extent. When the "current extent" + changes, this value is updated. !deletedList[1].isValid() when this value is not + yet computed. + */ + DiskLoc deletedList[Buckets]; + // ofs 168 (8 byte aligned) + struct Stats { + // datasize and nrecords MUST Be adjacent code assumes! + long long datasize; // this includes padding, but not record headers + long long nrecords; + } stats; + int lastExtentSize; + int nIndexes; + private: + // ofs 192 + IndexDetails _indexes[NIndexesBase]; + public: + // ofs 352 (16 byte aligned) + int capped; + int max; // max # of objects for a capped table. TODO: should this be 64 bit? + double paddingFactor; // 1.0 = no padding. 
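+        // Illustration (an assumption drawn from the record allocation code in
+        // pdfile.cpp, not stated in this header): allocation scales the
+        // requested length by this factor, roughly
+        //   allocSize = (int)( lenWithHeaders * paddingFactor );
+        // so a factor of 1.5 leaves ~50% slack for in-place growth on update.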
+ // ofs 386 (16) + int flags; + DiskLoc capExtent; + DiskLoc capFirstNewRecord; + unsigned short dataFileVersion; // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h + unsigned short indexFileVersion; + unsigned long long multiKeyIndexBits; + private: + // ofs 400 (16) + unsigned long long reservedA; + long long extraOffset; // where the $extra info is located (bytes relative to this) + public: + int indexBuildInProgress; // 1 if in prog + unsigned reservedB; + // ofs 424 (8) + struct Capped2 { + unsigned long long cc2_ptr; // see capped.cpp + unsigned fileNumber; + } capped2; + char reserved[60]; + /*-------- end data 496 bytes */ + + explicit NamespaceDetails( const DiskLoc &loc, bool _capped ); + + class Extra { + long long _next; + public: + IndexDetails details[NIndexesExtra]; + private: + unsigned reserved2; + unsigned reserved3; + Extra(const Extra&) { assert(false); } + Extra& operator=(const Extra& r) { assert(false); return *this; } + public: + Extra() { } + long ofsFrom(NamespaceDetails *d) { + return ((char *) this) - ((char *) d); + } + void init() { memset(this, 0, sizeof(Extra)); } + Extra* next(NamespaceDetails *d) { + if( _next == 0 ) return 0; + return (Extra*) (((char *) d) + _next); + } + void setNext(long ofs) { *getDur().writing(&_next) = ofs; } + void copy(NamespaceDetails *d, const Extra& e) { + memcpy(this, &e, sizeof(Extra)); + _next = 0; + } + }; + Extra* extra() { + if( extraOffset == 0 ) return 0; + return (Extra *) (((char *) this) + extraOffset); + } + /* add extra space for indexes when more than 10 */ + Extra* allocExtra(const char *ns, int nindexessofar); + void copyingFrom(const char *thisns, NamespaceDetails *src); // must be called when renaming a NS to fix up extra + + /* called when loaded from disk */ + void onLoad(const Namespace& k); + + /* dump info on this namespace. for debugging. */ + void dump(const Namespace& k); + + /* dump info on all extents for this namespace. for debugging. */ + void dumpExtents(); + + private: + Extent *theCapExtent() const { return capExtent.ext(); } + void advanceCapExtent( const char *ns ); + DiskLoc __capAlloc(int len); + DiskLoc cappedAlloc(const char *ns, int len); + DiskLoc &cappedFirstDeletedInCurExtent(); + bool nextIsInCapExtent( const DiskLoc &dl ) const; + + public: + DiskLoc& cappedListOfAllDeletedRecords() { return deletedList[0]; } + DiskLoc& cappedLastDelRecLastExtent() { return deletedList[1]; } + void cappedDumpDelInfo(); + bool capLooped() const { return capped && capFirstNewRecord.isValid(); } + bool inCapExtent( const DiskLoc &dl ) const; + void cappedCheckMigrate(); + /** + * Truncate documents newer than the document at 'end' from the capped + * collection. The collection cannot be completely emptied using this + * function. An assertion will be thrown if that is attempted. + * @param inclusive - Truncate 'end' as well iff true + */ + void cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive); + /** Remove all documents from the capped collection */ + void emptyCappedCollection(const char *ns); + + /* when a background index build is in progress, we don't count the index in nIndexes until + complete, yet need to still use it in _indexRecord() - thus we use this function for that. + */ + int nIndexesBeingBuilt() const { return nIndexes + indexBuildInProgress; } + + /* NOTE: be careful with flags. are we manipulating them in read locks? if so, + this isn't thread safe. 
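+           (For example, aboutToDeleteAnIndex() below rewrites 'flags' through
+           getDur().writing(); two threads doing so under only read locks could
+           race -- an illustrative concern.)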
TODO + */ + enum NamespaceFlags { + Flag_HaveIdIndex = 1 << 0 // set when we have _id index (ONLY if ensureIdIndex was called -- 0 if that has never been called) + }; + + IndexDetails& idx(int idxNo, bool missingExpected = false ); + + /** get the IndexDetails for the index currently being built in the background. (there is at most one) */ + IndexDetails& inProgIdx() { + DEV assert(indexBuildInProgress); + return idx(nIndexes); + } + + class IndexIterator { + public: + int pos() { return i; } // note this is the next one to come + bool more() { return i < n; } + IndexDetails& next() { return d->idx(i++); } + private: + friend class NamespaceDetails; + int i, n; + NamespaceDetails *d; + IndexIterator(NamespaceDetails *_d); + }; + + IndexIterator ii() { return IndexIterator(this); } + + /* hackish - find our index # in the indexes array */ + int idxNo(IndexDetails& idx); + + /* multikey indexes are indexes where there are more than one key in the index + for a single document. see multikey in wiki. + for these, we have to do some dedup work on queries. + */ + bool isMultikey(int i) const { return (multiKeyIndexBits & (((unsigned long long) 1) << i)) != 0; } + void setIndexIsMultikey(int i) { + dassert( i < NIndexesMax ); + unsigned long long x = ((unsigned long long) 1) << i; + if( multiKeyIndexBits & x ) return; + *getDur().writing(&multiKeyIndexBits) |= x; + } + void clearIndexIsMultikey(int i) { + dassert( i < NIndexesMax ); + unsigned long long x = ((unsigned long long) 1) << i; + if( (multiKeyIndexBits & x) == 0 ) return; + *getDur().writing(&multiKeyIndexBits) &= ~x; + } + + /* add a new index. does not add to system.indexes etc. - just to NamespaceDetails. + caller must populate returned object. + */ + IndexDetails& addIndex(const char *thisns, bool resetTransient=true); + + void aboutToDeleteAnIndex() { + *getDur().writing(&flags) = flags & ~Flag_HaveIdIndex; + } + + /* returns index of the first index in which the field is present. -1 if not present. */ + int fieldIsIndexed(const char *fieldName); + + /* called to indicate that an update fit in place. + fits also called on an insert -- idea there is that if you had some mix and then went to + pure inserts it would adapt and PF would trend to 1.0. note update calls insert on a move + so there is a double count there that must be adjusted for below. + + todo: greater sophistication could be helpful and added later. for example the absolute + size of documents might be considered -- in some cases smaller ones are more likely + to grow than larger ones in the same collection? (not always) + */ + void paddingFits() { + MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less + double x = paddingFactor - 0.001; + if ( x >= 1.0 ) { + *getDur().writing(&paddingFactor) = x; + //getDur().setNoJournal(&paddingFactor, &x, sizeof(x)); + } + } + } + void paddingTooSmall() { + MONGO_SOMETIMES(sometimes, 4) { // do this on a sampled basis to journal less + /* the more indexes we have, the higher the cost of a move. so we take that into + account herein. note on a move that insert() calls paddingFits(), thus + here for example with no inserts and nIndexes = 1 we have + .001*4-.001 or a 3:1 ratio to non moves -> 75% nonmoves. insert heavy + can pushes this down considerably. further tweaking will be a good idea but + this should be an adequate starting point. 
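+               Worked numbers (restating the arithmetic above): with nIndexes = 1,
+               N = min(1,7)+3 = 4, so each move adds .001*4 and -- via the insert's
+               paddingFits() call -- subtracts .001, netting +.003; each in-place
+               fit subtracts .001. Equilibrium is therefore ~3 fits per move,
+               i.e. about 75% non-moves.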
+ */ + double N = min(nIndexes,7) + 3; + double x = paddingFactor + (0.001 * N); + if ( x <= 2.0 ) { + *getDur().writing(&paddingFactor) = x; + //getDur().setNoJournal(&paddingFactor, &x, sizeof(x)); + } + } + } + + // @return offset in indexes[] + int findIndexByName(const char *name); + + // @return offset in indexes[] + int findIndexByKeyPattern(const BSONObj& keyPattern); + + void findIndexByType( const string& name , vector<int>& matches ) { + IndexIterator i = ii(); + while ( i.more() ) { + if ( i.next().getSpec().getTypeName() == name ) + matches.push_back( i.pos() - 1 ); + } + } + + /* @return -1 = not found + generally id is first index, so not that expensive an operation (assuming present). + */ + int findIdIndex() { + IndexIterator i = ii(); + while( i.more() ) { + if( i.next().isIdIndex() ) + return i.pos()-1; + } + return -1; + } + + bool haveIdIndex() { + return (flags & NamespaceDetails::Flag_HaveIdIndex) || findIdIndex() >= 0; + } + + /* return which "deleted bucket" for this size object */ + static int bucket(int n) { + for ( int i = 0; i < Buckets; i++ ) + if ( bucketSizes[i] > n ) + return i; + return Buckets-1; + } + + /* predetermine location of the next alloc without actually doing it. + if cannot predetermine returns null (so still call alloc() then) + */ + DiskLoc allocWillBeAt(const char *ns, int lenToAlloc); + + /* allocate a new record. lenToAlloc includes headers. */ + DiskLoc alloc(const char *ns, int lenToAlloc, DiskLoc& extentLoc); + + /* add a given record to the deleted chains for this NS */ + void addDeletedRec(DeletedRecord *d, DiskLoc dloc); + void dumpDeleted(set<DiskLoc> *extents = 0); + // Start from firstExtent by default. + DiskLoc firstRecord( const DiskLoc &startExtent = DiskLoc() ) const; + // Start from lastExtent by default. + DiskLoc lastRecord( const DiskLoc &startExtent = DiskLoc() ) const; + long long storageSize( int * numExtents = 0 , BSONArrayBuilder * extentInfo = 0 ) const; + + int averageObjectSize() { + if ( stats.nrecords == 0 ) + return 5; + return (int) (stats.datasize / stats.nrecords); + } + + NamespaceDetails *writingWithoutExtra() { + return ( NamespaceDetails* ) getDur().writingPtr( this, sizeof( NamespaceDetails ) ); + } + /** Make all linked Extra objects writeable as well */ + NamespaceDetails *writingWithExtra(); + + private: + DiskLoc _alloc(const char *ns, int len); + void maybeComplain( const char *ns, int len ) const; + DiskLoc __stdAlloc(int len, bool willBeAt); + void compact(); // combine adjacent deleted records + friend class NamespaceIndex; + struct ExtraOld { + // note we could use this field for more chaining later, so don't waste it: + unsigned long long reserved1; + IndexDetails details[NIndexesExtra]; + unsigned reserved2; + unsigned reserved3; + }; + /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */ + void cappedTruncateLastDelUpdate(); + BOOST_STATIC_ASSERT( NIndexesMax <= NIndexesBase + NIndexesExtra*2 ); + BOOST_STATIC_ASSERT( NIndexesMax <= 64 ); // multiKey bits + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::ExtraOld) == 496 ); + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails::Extra) == 496 ); + }; // NamespaceDetails +#pragma pack() + + /* NamespaceDetailsTransient + + these are things we know / compute about a namespace that are transient -- things + we don't actually store in the .ns file. so mainly caching of frequently used + information. + + CAUTION: Are you maintaining this properly on a collection drop()? A dropdatabase()? Be careful. 
+ The current field "allIndexKeys" may have too many keys in it on such an occurrence; + as currently used that does not cause anything terrible to happen. + + todo: cleanup code, need abstractions and separation + */ + // todo: multiple db's with the same name (repairDatbase) is not handled herein. that may be + // the way to go, if not used by repair, but need some sort of enforcement / asserts. + class NamespaceDetailsTransient : boost::noncopyable { + BOOST_STATIC_ASSERT( sizeof(NamespaceDetails) == 496 ); + + //Database *database; + const string _ns; + void reset(); + static std::map< string, shared_ptr< NamespaceDetailsTransient > > _nsdMap; + + NamespaceDetailsTransient(Database*,const char *ns); + public: + ~NamespaceDetailsTransient(); + void addedIndex() { assertInWriteLock(); reset(); } + void deletedIndex() { assertInWriteLock(); reset(); } + /* Drop cached information on all namespaces beginning with the specified prefix. + Can be useful as index namespaces share the same start as the regular collection. + SLOW - sequential scan of all NamespaceDetailsTransient objects */ + static void clearForPrefix(const char *prefix); + static void eraseForPrefix(const char *prefix); + + /** + * @return a cursor interface to the query optimizer. The implementation may + * utilize a single query plan or interleave results from multiple query + * plans before settling on a single query plan. Note that the schema of + * currKey() documents, the matcher(), and the isMultiKey() nature of the + * cursor may change over the course of iteration. + * + * @param query - Query used to select indexes and populate matchers. + * + * @param order - Required ordering spec for documents produced by this cursor, + * empty object default indicates no order requirement. If no index exists that + * satisfies the required sort order, an empty shared_ptr is returned. + * + * @param requireIndex - If true, no unindexed (ie collection scan) cursors are + * used to generate the returned cursor. If an unindexed cursor is required, an + * assertion is raised by the cursor during iteration. + * + * @param simpleEqualityMatch - Set to true for certain simple queries - + * see queryoptimizer.cpp. + * + * The returned cursor may @throw inside of advance() or recoverFromYield() in + * certain error cases, for example if a capped overrun occurred during a yield. + * This indicates that the cursor was unable to perform a complete scan. + * + * This is a work in progress. Partial list of features not yet implemented: + * - covered indexes + * - in memory sorting + */ + static shared_ptr<Cursor> getCursor( const char *ns, const BSONObj &query, + const BSONObj &order = BSONObj(), bool requireIndex = false, + bool *simpleEqualityMatch = 0 ); + + /* indexKeys() cache ---------------------------------------------------- */ + /* assumed to be in write lock for this */ + private: + bool _keysComputed; + set<string> _indexKeys; + void computeIndexKeys(); + public: + /* get set of index keys for this namespace. handy to quickly check if a given + field is indexed (Note it might be a secondary component of a compound index.) + */ + set<string>& indexKeys() { + DEV assertInWriteLock(); + if ( !_keysComputed ) + computeIndexKeys(); + return _indexKeys; + } + + /* IndexSpec caching */ + private: + map<const IndexDetails*,IndexSpec> _indexSpecs; + static SimpleMutex _isMutex; + public: + const IndexSpec& getIndexSpec( const IndexDetails * details ) { + IndexSpec& spec = _indexSpecs[details]; + if ( ! 
spec._finishedInit ) { + SimpleMutex::scoped_lock lk(_isMutex); + if ( ! spec._finishedInit ) { + spec.reset( details ); + assert( spec._finishedInit ); + } + } + return spec; + } + + /* query cache (for query optimizer) ------------------------------------- */ + private: + int _qcWriteCount; + map< QueryPattern, pair< BSONObj, long long > > _qcCache; + static NamespaceDetailsTransient& make_inlock(const char *ns); + public: + static SimpleMutex _qcMutex; + + /* you must be in the qcMutex when calling this. + A NamespaceDetailsTransient object will not go out of scope on you if you are + d.dbMutex.atLeastReadLocked(), so you do't have to stay locked. + Creates a NamespaceDetailsTransient before returning if one DNE. + todo: avoid creating too many on erroneous ns queries. + */ + static NamespaceDetailsTransient& get_inlock(const char *ns); + + static NamespaceDetailsTransient& get(const char *ns) { + SimpleMutex::scoped_lock lk(_qcMutex); + return get_inlock(ns); + } + + void clearQueryCache() { // public for unit tests + _qcCache.clear(); + _qcWriteCount = 0; + } + /* you must notify the cache if you are doing writes, as query plan optimality will change */ + void notifyOfWriteOp() { + if ( _qcCache.empty() ) + return; + if ( ++_qcWriteCount >= 100 ) + clearQueryCache(); + } + BSONObj indexForPattern( const QueryPattern &pattern ) { + return _qcCache[ pattern ].first; + } + long long nScannedForPattern( const QueryPattern &pattern ) { + return _qcCache[ pattern ].second; + } + void registerIndexForPattern( const QueryPattern &pattern, const BSONObj &indexKey, long long nScanned ) { + _qcCache[ pattern ] = make_pair( indexKey, nScanned ); + } + + }; /* NamespaceDetailsTransient */ + + inline NamespaceDetailsTransient& NamespaceDetailsTransient::get_inlock(const char *ns) { + std::map< string, shared_ptr< NamespaceDetailsTransient > >::iterator i = _nsdMap.find(ns); + if( i != _nsdMap.end() && + i->second.get() ) { // could be null ptr from clearForPrefix + return *i->second; + } + return make_inlock(ns); + } + + /* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog" + if you will: at least the core parts. (Additional info in system.* collections.) 
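+       For example, with --dbpath /data/db (a hypothetical path) the catalog for
+       database "test" is the memory mapped file /data/db/test.ns, holding the
+       HashTable<Namespace,NamespaceDetails> declared below.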
+ */ + class NamespaceIndex { + public: + NamespaceIndex(const string &dir, const string &database) : + ht( 0 ), dir_( dir ), database_( database ) {} + + /* returns true if new db will be created if we init lazily */ + bool exists() const; + + void init() { + if( !ht ) + _init(); + } + + void add_ns(const char *ns, DiskLoc& loc, bool capped); + void add_ns( const char *ns, const NamespaceDetails &details ); + + NamespaceDetails* details(const char *ns) { + if ( !ht ) + return 0; + Namespace n(ns); + NamespaceDetails *d = ht->get(n); + if ( d && d->capped ) + d->cappedCheckMigrate(); + return d; + } + + void kill_ns(const char *ns); + + bool find(const char *ns, DiskLoc& loc) { + NamespaceDetails *l = details(ns); + if ( l ) { + loc = l->firstExtent; + return true; + } + return false; + } + + bool allocated() const { return ht != 0; } + + void getNamespaces( list<string>& tofill , bool onlyCollections = true ) const; + + NamespaceDetails::Extra* newExtra(const char *ns, int n, NamespaceDetails *d); + + boost::filesystem::path path() const; + + unsigned long long fileLength() const { return f.length(); } + + private: + void _init(); + void maybeMkdir() const; + + MongoMMF f; + HashTable<Namespace,NamespaceDetails> *ht; + string dir_; + string database_; + }; + + extern string dbpath; // --dbpath parm + extern bool directoryperdb; + + // Rename a namespace within current 'client' db. + // (Arguments should include db name) + void renameNamespace( const char *from, const char *to ); + + +} // namespace mongo diff --git a/src/mongo/db/namespacestring.h b/src/mongo/db/namespacestring.h new file mode 100644 index 00000000000..d982c5fff75 --- /dev/null +++ b/src/mongo/db/namespacestring.h @@ -0,0 +1,147 @@ +// @file namespacestring.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <string> + +namespace mongo { + + using std::string; + + /* in the mongo source code, "client" means "database". */ + + const int MaxDatabaseNameLen = 256; // max str len for the db name, including null char + + /* e.g. + NamespaceString ns("acme.orders"); + cout << ns.coll; // "orders" + */ + class NamespaceString { + public: + string db; + string coll; // note collection names can have periods in them for organizing purposes (e.g. "system.indexes") + + NamespaceString( const char * ns ) { init(ns); } + NamespaceString( const string& ns ) { init(ns.c_str()); } + + string ns() const { return db + '.' 
+ coll; } + + bool isSystem() const { return strncmp(coll.c_str(), "system.", 7) == 0; } + bool isCommand() const { return coll == "$cmd"; } + + operator string() const { return ns(); } + + bool operator==( const string& nsIn ) const { return nsIn == ns(); } + bool operator==( const char* nsIn ) const { return (string)nsIn == ns(); } + bool operator==( const NamespaceString& nsIn ) const { return nsIn.db == db && nsIn.coll == coll; } + + bool operator!=( const string& nsIn ) const { return nsIn != ns(); } + bool operator!=( const char* nsIn ) const { return (string)nsIn != ns(); } + bool operator!=( const NamespaceString& nsIn ) const { return nsIn.db != db || nsIn.coll != coll; } + + string toString() const { return ns(); } + + /** + * @return true if ns is 'normal'. $ used for collections holding index data, which do not contain BSON objects in their records. + * special case for the local.oplog.$main ns -- naming it as such was a mistake. + */ + static bool normal(const char* ns) { + const char *p = strchr(ns, '$'); + if( p == 0 ) + return true; + return strcmp( ns, "local.oplog.$main" ) == 0; + } + + static bool special(const char *ns) { + return !normal(ns) || strstr(ns, ".system."); + } + + /** + * samples: + * good: + * foo + * bar + * foo-bar + * bad: + * foo bar + * foo.bar + * foo"bar + * + * @param db - a possible database name + * @return if db is an allowed database name + */ + static bool validDBName( const string& db ) { + if ( db.size() == 0 || db.size() > 64 ) + return false; + size_t good = strcspn( db.c_str() , "/\\. \"" ); + return good == db.size(); + } + + /** + * samples: + * good: + * foo.bar + * bad: + * foo. + * + * @param dbcoll - a possible collection name of the form db.coll + * @return if db.coll is an allowed collection name + */ + static bool validCollectionName(const char* dbcoll){ + const char *c = strchr( dbcoll, '.' ) + 1; + return normal(dbcoll) && c && *c; + } + + private: + void init(const char *ns) { + const char *p = strchr(ns, '.'); + if( p == 0 ) return; + db = string(ns, p - ns); + coll = p + 1; + } + }; + + // "database.a.b.c" -> "database" + inline void nsToDatabase(const char *ns, char *database) { + const char *p = ns; + char *q = database; + while ( *p != '.' ) { + if ( *p == 0 ) + break; + *q++ = *p++; + } + *q = 0; + if (q-database>=MaxDatabaseNameLen) { + log() << "nsToDatabase: ns too long. terminating, buf overrun condition" << endl; + dbexit( EXIT_POSSIBLE_CORRUPTION ); + } + } + inline string nsToDatabase(const char *ns) { + char buf[MaxDatabaseNameLen]; + nsToDatabase(ns, buf); + return buf; + } + inline string nsToDatabase(const string& ns) { + size_t i = ns.find( '.' ); + if ( i == string::npos ) + return ns; + return ns.substr( 0 , i ); + } + +} diff --git a/src/mongo/db/nonce.cpp b/src/mongo/db/nonce.cpp new file mode 100644 index 00000000000..379e88f116d --- /dev/null +++ b/src/mongo/db/nonce.cpp @@ -0,0 +1,95 @@ +// nonce.cpp + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "pch.h" +#include "nonce.h" +#include "../util/time_support.h" + +extern int do_md5_test(void); + +namespace mongo { + + BOOST_STATIC_ASSERT( sizeof(nonce64) == 8 ); + + static Security security; // needs to be static so _initialized is preset to false (see initsafe below) + + Security::Security() { + static int n; + massert( 10352 , "Security is a singleton class", ++n == 1); + init(); + } + + NOINLINE_DECL void Security::init() { + if( _initialized ) return; + _initialized = true; + +#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__) + _devrandom = new ifstream("/dev/urandom", ios::binary|ios::in); + massert( 10353 , "can't open dev/urandom", _devrandom->is_open() ); +#elif defined(_WIN32) + srand(curTimeMicros()); // perhaps not relevant for rand_s but we might want elsewhere anyway +#else + srandomdev(); +#endif + +#ifndef NDEBUG + if ( do_md5_test() ) + massert( 10354 , "md5 unit test fails", false); +#endif + } + + nonce64 Security::__getNonce() { + dassert( _initialized ); + nonce64 n; +#if defined(__linux__) || defined(__sunos__) || defined(__APPLE__) + _devrandom->read((char*)&n, sizeof(n)); + massert(10355 , "devrandom failed", !_devrandom->fail()); +#elif defined(_WIN32) + unsigned a=0, b=0; + assert( rand_s(&a) == 0 ); + assert( rand_s(&b) == 0 ); + n = (((unsigned long long)a)<<32) | b; +#else + n = (((unsigned long long)random())<<32) | random(); +#endif + return n; + } + + SimpleMutex nonceMutex("nonce"); + nonce64 Security::_getNonce() { + // not good this is a static as gcc will mutex protect it which costs time + SimpleMutex::scoped_lock lk(nonceMutex); + if( !_initialized ) + init(); + return __getNonce(); + } + + nonce64 Security::getNonceDuringInit() { + // the mutex might not be inited yet. init phase should be one thread anyway (hopefully we don't spawn threads therein) + if( !security._initialized ) + security.init(); + return security.__getNonce(); + } + + nonce64 Security::getNonce() { + return security._getNonce(); + } + + // name warns us this might be a little slow (see code above) + unsigned goodRandomNumberSlow() { return (unsigned) Security::getNonce(); } + +} // namespace mongo diff --git a/src/mongo/db/nonce.h b/src/mongo/db/nonce.h new file mode 100644 index 00000000000..d6a147ab1c0 --- /dev/null +++ b/src/mongo/db/nonce.h @@ -0,0 +1,36 @@ +// @file nonce.h + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace mongo { + + typedef unsigned long long nonce64; + + struct Security { + Security(); + static nonce64 getNonce(); + static nonce64 getNonceDuringInit(); // use this version during global var constructors + private: + nonce64 _getNonce(); + nonce64 __getNonce(); + ifstream *_devrandom; + bool _initialized; + void init(); // can call more than once + }; + +} // namespace mongo diff --git a/src/mongo/db/oplog.cpp b/src/mongo/db/oplog.cpp new file mode 100644 index 00000000000..342f362a28f --- /dev/null +++ b/src/mongo/db/oplog.cpp @@ -0,0 +1,872 @@ +// @file oplog.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "oplog.h" +#include "repl_block.h" +#include "repl.h" +#include "commands.h" +#include "repl/rs.h" +#include "stats/counters.h" +#include "../util/file.h" +#include "../util/unittest.h" +#include "queryoptimizer.h" +#include "ops/update.h" +#include "ops/delete.h" +#include "ops/query.h" + +namespace mongo { + + void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ); + + int __findingStartInitialTimeout = 5; // configurable for testing + + // cached copies of these...so don't rename them, drop them, etc.!!! + static NamespaceDetails *localOplogMainDetails = 0; + static Database *localDB = 0; + static NamespaceDetails *rsOplogDetails = 0; + void oplogCheckCloseDatabase( Database * db ) { + localDB = 0; + localOplogMainDetails = 0; + rsOplogDetails = 0; + resetSlaveCache(); + } + + static void _logOpUninitialized(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) { + uassert(13288, "replSet error write op to db before replSet initialized", str::startsWith(ns, "local.") || *opstr == 'n'); + } + + /** write an op to the oplog that is already built. + todo : make _logOpRS() call this so we don't repeat ourself? + */ + void _logOpObjRS(const BSONObj& op) { + DEV assertInWriteLock(); + + const OpTime ts = op["ts"]._opTime(); + long long h = op["h"].numberLong(); + + { + const char *logns = rsoplog; + if ( rsOplogDetails == 0 ) { + Client::Context ctx( logns , dbpath, false); + localDB = ctx.db(); + assert( localDB ); + rsOplogDetails = nsdetails(logns); + massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails); + } + Client::Context ctx( logns , localDB, false ); + { + int len = op.objsize(); + Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len); + memcpy(getDur().writingPtr(r->data, len), op.objdata(), len); + } + /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy. + this code (or code in now() maybe) should be improved. + */ + if( theReplSet ) { + if( !(theReplSet->lastOpTimeWritten<ts) ) { + log() << "replSet error possible failover clock skew issue? 
" << theReplSet->lastOpTimeWritten.toString() << ' ' << endl; + } + theReplSet->lastOpTimeWritten = ts; + theReplSet->lastH = h; + ctx.getClient()->setLastOp( ts ); + } + } + } + + /** given a BSON object, create a new one at dst which is the existing (partial) object + with a new object element appended at the end with fieldname "o". + + @param partial already build object with everything except the o member. e.g. something like: + { ts:..., ns:..., os2:... } + @param o a bson object to be added with fieldname "o" + @dst where to put the newly built combined object. e.g. ends up as something like: + { ts:..., ns:..., os2:..., o:... } + */ + void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) { + const int size1 = partial.objsize() - 1; // less the EOO char + const int oOfs = size1+3; // 3 = byte BSONOBJTYPE + byte 'o' + byte \0 + + void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1); + + memcpy(p, partial.objdata(), size1); + + // adjust overall bson object size for the o: field + *(static_cast<unsigned*>(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/; + + char *b = static_cast<char *>(p); + b += size1; + *b++ = (char) Object; + *b++ = 'o'; // { o : ... } + *b++ = 0; // null terminate "o" fieldname + memcpy(b, o.objdata(), o.objsize()); + b += o.objsize(); + *b = EOO; + } + + // global is safe as we are in write lock. we put the static outside the function to avoid the implicit mutex + // the compiler would use if inside the function. the reason this is static is to avoid a malloc/free for this + // on every logop call. + static BufBuilder logopbufbuilder(8*1024); + static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) { + DEV assertInWriteLock(); + + if ( strncmp(ns, "local.", 6) == 0 ) { + if ( strncmp(ns, "local.slaves", 12) == 0 ) + resetSlaveCache(); + return; + } + + const OpTime ts = OpTime::now(); + long long hashNew; + if( theReplSet ) { + massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary()); + hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId(); + } + else { + // must be initiation + assert( *ns == 0 ); + hashNew = 0; + } + + /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- + instead we do a single copy to the destination position in the memory mapped file. + */ + + logopbufbuilder.reset(); + BSONObjBuilder b(logopbufbuilder); + b.appendTimestamp("ts", ts.asDate()); + b.append("h", hashNew); + b.append("op", opstr); + b.append("ns", ns); + if ( bb ) + b.appendBool("b", *bb); + if ( o2 ) + b.append("o2", *o2); + BSONObj partial = b.done(); + int posz = partial.objsize(); + int len = posz + obj.objsize() + 1 + 2 /*o:*/; + + Record *r; + DEV assert( logNS == 0 ); + { + const char *logns = rsoplog; + if ( rsOplogDetails == 0 ) { + Client::Context ctx( logns , dbpath, false); + localDB = ctx.db(); + assert( localDB ); + rsOplogDetails = nsdetails(logns); + massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails); + } + Client::Context ctx( logns , localDB, false ); + r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len); + /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy. + this code (or code in now() maybe) should be improved. + */ + if( theReplSet ) { + if( !(theReplSet->lastOpTimeWritten<ts) ) { + log() << "replSet ERROR possible failover clock skew issue? 
" << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog; + log() << "replSet " << theReplSet->isPrimary() << rsLog; + } + theReplSet->lastOpTimeWritten = ts; + theReplSet->lastH = hashNew; + ctx.getClient()->setLastOp( ts ); + } + } + + append_O_Obj(r->data, partial, obj); + + if ( logLevel >= 6 ) { + BSONObj temp(r); + log( 6 ) << "logOp:" << temp << endl; + } + } + + /* we write to local.opload.$main: + { ts : ..., op: ..., ns: ..., o: ... } + ts: an OpTime timestamp + op: + "i" insert + "u" update + "d" delete + "c" db cmd + "db" declares presence of a database (ns is set to the db name + '.') + "n" no op + logNS - where to log it. 0/null means "local.oplog.$main". + bb: + if not null, specifies a boolean to pass along to the other side as b: param. + used for "justOne" or "upsert" flags on 'd', 'u' + first: true + when set, indicates this is the first thing we have logged for this database. + thus, the slave does not need to copy down all the data when it sees this. + + note this is used for single collection logging even when --replSet is enabled. + */ + static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) { + DEV assertInWriteLock(); + static BufBuilder bufbuilder(8*1024); + + if ( strncmp(ns, "local.", 6) == 0 ) { + if ( strncmp(ns, "local.slaves", 12) == 0 ) { + resetSlaveCache(); + } + return; + } + + const OpTime ts = OpTime::now(); + Client::Context context("",0,false); + + /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- + instead we do a single copy to the destination position in the memory mapped file. + */ + + bufbuilder.reset(); + BSONObjBuilder b(bufbuilder); + b.appendTimestamp("ts", ts.asDate()); + b.append("op", opstr); + b.append("ns", ns); + if ( bb ) + b.appendBool("b", *bb); + if ( o2 ) + b.append("o2", *o2); + BSONObj partial = b.done(); // partial is everything except the o:... part. + + int po_sz = partial.objsize(); + int len = po_sz + obj.objsize() + 1 + 2 /*o:*/; + + Record *r; + if( logNS == 0 ) { + logNS = "local.oplog.$main"; + if ( localOplogMainDetails == 0 ) { + Client::Context ctx( logNS , dbpath, false); + localDB = ctx.db(); + assert( localDB ); + localOplogMainDetails = nsdetails(logNS); + assert( localOplogMainDetails ); + } + Client::Context ctx( logNS , localDB, false ); + r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len); + } + else { + Client::Context ctx( logNS, dbpath, false ); + assert( nsdetails( logNS ) ); + // first we allocate the space, then we fill it below. 
+ r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len); + } + + append_O_Obj(r->data, partial, obj); + + context.getClient()->setLastOp( ts ); + + if ( logLevel >= 6 ) { + BSONObj temp(r); + log( 6 ) << "logging op:" << temp << endl; + } + + } + + static void (*_logOp)(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb ) = _logOpOld; + void newReplUp() { + replSettings.master = true; + _logOp = _logOpRS; + } + void newRepl() { + replSettings.master = true; + _logOp = _logOpUninitialized; + } + void oldRepl() { _logOp = _logOpOld; } + + void logKeepalive() { + _logOp("n", "", 0, BSONObj(), 0, 0); + } + void logOpComment(const BSONObj& obj) { + _logOp("n", "", 0, obj, 0, 0); + } + void logOpInitiate(const BSONObj& obj) { + _logOpRS("n", "", 0, obj, 0, 0); + } + + /*@ @param opstr: + c userCreateNS + i insert + n no-op / keepalive + d delete / remove + u update + */ + void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b) { + if ( replSettings.master ) { + _logOp(opstr, ns, 0, obj, patt, b); + } + + logOpForSharding( opstr , ns , obj , patt ); + } + + void createOplog() { + dblock lk; + + const char * ns = "local.oplog.$main"; + + bool rs = !cmdLine._replSet.empty(); + if( rs ) + ns = rsoplog; + + Client::Context ctx(ns); + + NamespaceDetails * nsd = nsdetails( ns ); + + if ( nsd ) { + + if ( cmdLine.oplogSize != 0 ) { + int o = (int)(nsd->storageSize() / ( 1024 * 1024 ) ); + int n = (int)(cmdLine.oplogSize / ( 1024 * 1024 ) ); + if ( n != o ) { + stringstream ss; + ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog"; + log() << ss.str() << endl; + throw UserException( 13257 , ss.str() ); + } + } + + if( rs ) return; + + DBDirectClient c; + BSONObj lastOp = c.findOne( ns, Query().sort(reverseNaturalObj) ); + if ( !lastOp.isEmpty() ) { + OpTime::setLast( lastOp[ "ts" ].date() ); + } + return; + } + + /* create an oplog collection, if it doesn't yet exist. */ + BSONObjBuilder b; + double sz; + if ( cmdLine.oplogSize != 0 ) + sz = (double)cmdLine.oplogSize; + else { + /* not specified. pick a default size */ + sz = 50.0 * 1000 * 1000; + if ( sizeof(int *) >= 8 ) { +#if defined(__APPLE__) + // typically these are desktops (dev machines), so keep it smallish + sz = (256-64) * 1000 * 1000; +#else + sz = 990.0 * 1000 * 1000; + boost::intmax_t free = File::freeSpace(dbpath); //-1 if call not supported. + double fivePct = free * 0.05; + if ( fivePct > sz ) + sz = fivePct; +#endif + } + } + + log() << "******" << endl; + log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." 
<< endl; + + b.append("size", sz); + b.appendBool("capped", 1); + b.appendBool("autoIndexId", false); + + string err; + BSONObj o = b.done(); + userCreateNS(ns, o, err, false); + if( !rs ) + logOp( "n", "", BSONObj() ); + + /* sync here so we don't get any surprising lag later when we try to sync */ + MemoryMappedFile::flushAll(true); + log() << "******" << endl; + } + + // ------------------------------------- + + FindingStartCursor::FindingStartCursor( const QueryPlan & qp ) : + _qp( qp ), + _findingStart( true ), + _findingStartMode() + { init(); } + + void FindingStartCursor::next() { + if ( !_findingStartCursor || !_findingStartCursor->ok() ) { + _findingStart = false; + _c = _qp.newCursor(); // on error, start from beginning + destroyClientCursor(); + return; + } + switch( _findingStartMode ) { + // Initial mode: scan backwards from end of collection + case Initial: { + if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) { + _findingStart = false; // found first record out of query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->currLoc() ); + destroyClientCursor(); + return; + } + _findingStartCursor->advance(); + RARELY { + if ( _findingStartTimer.seconds() >= __findingStartInitialTimeout ) { + // If we've scanned enough, switch to find extent mode. + createClientCursor( extentFirstLoc( _findingStartCursor->currLoc() ) ); + _findingStartMode = FindExtent; + return; + } + } + return; + } + // FindExtent mode: moving backwards through extents, check first + // document of each extent. + case FindExtent: { + if ( !_matcher->matchesCurrent( _findingStartCursor->c() ) ) { + _findingStartMode = InExtent; + return; + } + DiskLoc prev = prevExtentFirstLoc( _findingStartCursor->currLoc() ); + if ( prev.isNull() ) { // hit beginning, so start scanning from here + createClientCursor(); + _findingStartMode = InExtent; + return; + } + // There might be a more efficient implementation than creating new cursor & client cursor each time, + // not worrying about that for now + createClientCursor( prev ); + return; + } + // InExtent mode: once an extent is chosen, find starting doc in the extent. + case InExtent: { + if ( _matcher->matchesCurrent( _findingStartCursor->c() ) ) { + _findingStart = false; // found first record in query range, so scan normally + _c = _qp.newCursor( _findingStartCursor->currLoc() ); + destroyClientCursor(); + return; + } + _findingStartCursor->advance(); + return; + } + default: { + massert( 14038, "invalid _findingStartMode", false ); + } + } + } + + DiskLoc FindingStartCursor::extentFirstLoc( const DiskLoc &rec ) { + Extent *e = rec.rec()->myExtent( rec ); + if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) ) + return e->firstRecord; + // Likely we are on the fresh side of capExtent, so return first fresh record. + // If we are on the stale side of capExtent, then the collection is small and it + // doesn't matter if we start the extent scan with capFirstNewRecord. + return _qp.nsd()->capFirstNewRecord; + } + + void wassertExtentNonempty( const Extent *e ) { + // TODO ensure this requirement is clearly enforced, or fix. 
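+        // Descriptive note: if an extent held no records, firstRecord would be
+        // null and the extent-hopping scan above would hand back an invalid
+        // start; the wassert below only logs a warning in that case.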
+ wassert( !e->firstRecord.isNull() ); + } + + DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc &rec ) { + Extent *e = rec.rec()->myExtent( rec ); + if ( _qp.nsd()->capLooped() ) { + if ( e->xprev.isNull() ) { + e = _qp.nsd()->lastExtent.ext(); + } + else { + e = e->xprev.ext(); + } + if ( e->myLoc != _qp.nsd()->capExtent ) { + wassertExtentNonempty( e ); + return e->firstRecord; + } + } + else { + if ( !e->xprev.isNull() ) { + e = e->xprev.ext(); + wassertExtentNonempty( e ); + return e->firstRecord; + } + } + return DiskLoc(); // reached beginning of collection + } + + void FindingStartCursor::createClientCursor( const DiskLoc &startLoc ) { + shared_ptr<Cursor> c = _qp.newCursor( startLoc ); + _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns()) ); + } + + bool FindingStartCursor::firstDocMatchesOrEmpty() const { + shared_ptr<Cursor> c = _qp.newCursor(); + return !c->ok() || _matcher->matchesCurrent( c.get() ); + } + + void FindingStartCursor::init() { + BSONElement tsElt = _qp.originalQuery()[ "ts" ]; + massert( 13044, "no ts field in query", !tsElt.eoo() ); + BSONObjBuilder b; + b.append( tsElt ); + BSONObj tsQuery = b.obj(); + _matcher.reset(new CoveredIndexMatcher(tsQuery, _qp.indexKey())); + if ( firstDocMatchesOrEmpty() ) { + _c = _qp.newCursor(); + _findingStart = false; + return; + } + // Use a ClientCursor here so we can release db mutex while scanning + // oplog (can take quite a while with large oplogs). + shared_ptr<Cursor> c = _qp.newReverseCursor(); + _findingStartCursor.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, _qp.ns(), BSONObj()) ); + _findingStartTimer.reset(); + _findingStartMode = Initial; + } + + // ------------------------------------- + + struct TestOpTime : public UnitTest { + void run() { + OpTime t; + for ( int i = 0; i < 10; i++ ) { + OpTime s = OpTime::now_inlock(); + assert( s != t ); + t = s; + } + OpTime q = t; + assert( q == t ); + assert( !(q != t) ); + } + } testoptime; + + int _dummy_z; + + void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) { + DEV assert( !d.dbMutex.isWriteLocked() ); + + Client *c = currentClient.get(); + if( c == 0 ) { + Client::initThread("pretouchN"); + c = &cc(); + } + + readlock lk(""); + for( unsigned i = a; i <= b; i++ ) { + const BSONObj& op = v[i]; + const char *which = "o"; + const char *opType = op.getStringField("op"); + if ( *opType == 'i' ) + ; + else if( *opType == 'u' ) + which = "o2"; + else + continue; + /* todo : other operations */ + + try { + BSONObj o = op.getObjectField(which); + BSONElement _id; + if( o.getObjectID(_id) ) { + const char *ns = op.getStringField("ns"); + BSONObjBuilder b; + b.append(_id); + BSONObj result; + Client::Context ctx( ns ); + if( Helpers::findById(cc(), ns, b.done(), result) ) + _dummy_z += result.objsize(); // touch + } + } + catch( DBException& e ) { + log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl; + } + } + } + + void pretouchOperation(const BSONObj& op) { + + if( d.dbMutex.isWriteLocked() ) + return; // no point pretouching if write locked. not sure if this will ever fire, but just in case. 
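+        // Sketch of intended use (assumed; the calling code is not in this
+        // file): a sync thread can pre-fault a batch of ops on helper threads
+        // before taking the write lock, e.g.
+        //   boost::thread t( boost::bind( pretouchN, boost::ref(v), 0u, mid ) );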
+ + const char *which = "o"; + const char *opType = op.getStringField("op"); + if ( *opType == 'i' ) + ; + else if( *opType == 'u' ) + which = "o2"; + else + return; + /* todo : other operations */ + + try { + BSONObj o = op.getObjectField(which); + BSONElement _id; + if( o.getObjectID(_id) ) { + const char *ns = op.getStringField("ns"); + BSONObjBuilder b; + b.append(_id); + BSONObj result; + readlock lk(ns); + Client::Context ctx( ns ); + if( Helpers::findById(cc(), ns, b.done(), result) ) + _dummy_z += result.objsize(); // touch + } + } + catch( DBException& ) { + log() << "ignoring assertion in pretouchOperation()" << endl; + } + } + + BSONObj Sync::getMissingDoc(const BSONObj& o) { + OplogReader missingObjReader; + + uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn)); + + const char *ns = o.getStringField("ns"); + // might be more than just _id in the update criteria + BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); + BSONObj missingObj; + try { + missingObj = missingObjReader.findOne(ns, query); + } catch(DBException& e) { + log() << "replication assertion fetching missing object: " << e.what() << endl; + throw; + } + + return missingObj; + } + + bool Sync::shouldRetry(const BSONObj& o) { + // we don't have the object yet, which is possible on initial sync. get it. + log() << "replication info adding missing object" << endl; // rare enough we can log + + BSONObj missingObj = getMissingDoc(o); + + if( missingObj.isEmpty() ) { + log() << "replication missing object not found on source. presumably deleted later in oplog" << endl; + log() << "replication o2: " << o.getObjectField("o2").toString() << endl; + log() << "replication o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl; + + return false; + } + else { + const char *ns = o.getStringField("ns"); + Client::Context ctx(ns); + DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize()); + uassert(15917, "Got bad disk location when attempting to insert", !d.isNull()); + + return true; + } + } + + /** @param fromRepl false if from ApplyOpsCmd + @return true if was and update should have happened and the document DNE. see replset initial sync code. + */ + bool applyOperation_inlock(const BSONObj& op , bool fromRepl ) { + assertInWriteLock(); + LOG(6) << "applying op: " << op << endl; + bool failedUpdate = false; + + OpCounters * opCounters = fromRepl ? &replOpCounters : &globalOpCounters; + + const char *names[] = { "o", "ns", "op", "b" }; + BSONElement fields[4]; + op.getFields(4, names, fields); + + BSONObj o; + if( fields[0].isABSONObj() ) + o = fields[0].embeddedObject(); + + const char *ns = fields[1].valuestrsafe(); + + // operation type -- see logOp() comments for types + const char *opType = fields[2].valuestrsafe(); + + if ( *opType == 'i' ) { + opCounters->gotInsert(); + + const char *p = strchr(ns, '.'); + if ( p && strcmp(p, ".system.indexes") == 0 ) { + // updates aren't allowed for indexes -- so we will do a regular insert. if index already + // exists, that is ok. + theDataFileMgr.insert(ns, (void*) o.objdata(), o.objsize()); + } + else { + // do upserts for inserts as we might get replayed more than once + OpDebug debug; + BSONElement _id; + if( !o.getObjectID(_id) ) { + /* No _id. This will be very slow. 
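+                       (The upsert below uses the whole document as its match
+                       criteria, so without an _id it typically table scans --
+                       an illustrative note.)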
*/ + Timer t; + updateObjects(ns, o, o, true, false, false, debug ); + if( t.millis() >= 2 ) { + RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl; + } + } + else { + /* erh 10/16/2009 - this is probably not relevant any more since its auto-created, but not worth removing */ + RARELY ensureHaveIdIndex(ns); // otherwise updates will be slow + + /* todo : it may be better to do an insert here, and then catch the dup key exception and do update + then. very few upserts will not be inserts... + */ + BSONObjBuilder b; + b.append(_id); + updateObjects(ns, o, b.done(), true, false, false , debug ); + } + } + } + else if ( *opType == 'u' ) { + opCounters->gotUpdate(); + // dm do we create this for a capped collection? + // - if not, updates would be slow + // - but if were by id would be slow on primary too so maybe ok + // - if on primary was by another key and there are other indexes, this could be very bad w/out an index + // - if do create, odd to have on secondary but not primary. also can cause secondary to block for + // quite a while on creation. + RARELY ensureHaveIdIndex(ns); // otherwise updates will be super slow + OpDebug debug; + BSONObj updateCriteria = op.getObjectField("o2"); + bool upsert = fields[3].booleanSafe(); + UpdateResult ur = updateObjects(ns, o, updateCriteria, upsert, /*multi*/ false, /*logop*/ false , debug ); + if( ur.num == 0 ) { + if( ur.mod ) { + if( updateCriteria.nFields() == 1 ) { + // was a simple { _id : ... } update criteria + failedUpdate = true; + // todo: probably should assert in these failedUpdate cases if not in initialSync + } + // need to check to see if it isn't present so we can set failedUpdate correctly. + // note that adds some overhead for this extra check in some cases, such as an updateCriteria + // of the form + // { _id:..., { x : {$size:...} } + // thus this is not ideal. + else { + NamespaceDetails *nsd = nsdetails(ns); + + if (nsd == NULL || + (nsd->findIdIndex() >= 0 && Helpers::findById(nsd, updateCriteria).isNull()) || + // capped collections won't have an _id index + (nsd->findIdIndex() < 0 && Helpers::findOne(ns, updateCriteria, false).isNull())) { + failedUpdate = true; + } + + // Otherwise, it's present; zero objects were updated because of additional specifiers + // in the query for idempotence + } + } + else { + // this could happen benignly on an oplog duplicate replay of an upsert + // (because we are idempotent), + // if an regular non-mod update fails the item is (presumably) missing. + if( !upsert ) { + failedUpdate = true; + } + } + } + } + else if ( *opType == 'd' ) { + opCounters->gotDelete(); + if ( opType[1] == 0 ) + deleteObjects(ns, o, /*justOne*/ fields[3].booleanSafe()); + else + assert( opType[1] == 'b' ); // "db" advertisement + } + else if ( *opType == 'c' ) { + opCounters->gotCommand(); + BufBuilder bb; + BSONObjBuilder ob; + _runCommands(ns, o, bb, ob, true, 0); + } + else if ( *opType == 'n' ) { + // no op + } + else { + throw MsgAssertionException( 14825 , ErrorMsg("error in applyOperation : unknown opType ", *opType) ); + } + return failedUpdate; + } + + class ApplyOpsCmd : public Command { + public: + virtual bool slaveOk() const { return false; } + virtual LockType locktype() const { return WRITE; } + ApplyOpsCmd() : Command( "applyOps" ) {} + virtual void help( stringstream &help ) const { + help << "internal (sharding)\n{ applyOps : [ ] , preCondition : [ { ns : ... , q : ... , res : ... 
} ] }"; + } + virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + + if ( cmdObj.firstElement().type() != Array ) { + errmsg = "ops has to be an array"; + return false; + } + + BSONObj ops = cmdObj.firstElement().Obj(); + + { + // check input + BSONObjIterator i( ops ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( e.type() == Object ) + continue; + errmsg = "op not an object: "; + errmsg += e.fieldName(); + return false; + } + } + + if ( cmdObj["preCondition"].type() == Array ) { + BSONObjIterator i( cmdObj["preCondition"].Obj() ); + while ( i.more() ) { + BSONObj f = i.next().Obj(); + + BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() ); + + Matcher m( f["res"].Obj() ); + if ( ! m.matches( realres ) ) { + result.append( "got" , realres ); + result.append( "whatFailed" , f ); + errmsg = "pre-condition failed"; + return false; + } + } + } + + // apply + int num = 0; + BSONObjIterator i( ops ); + while ( i.more() ) { + BSONElement e = i.next(); + // todo SERVER-4259 ? + applyOperation_inlock( e.Obj() , false ); + num++; + } + + result.append( "applied" , num ); + + if ( ! fromRepl ) { + // We want this applied atomically on slaves + // so we re-wrap without the pre-condition for speed + + string tempNS = str::stream() << dbname << ".$cmd"; + + logOp( "c" , tempNS.c_str() , cmdObj.firstElement().wrap() ); + } + + return true; + } + + DBDirectClient db; + + } applyOpsCmd; + +} diff --git a/src/mongo/db/oplog.h b/src/mongo/db/oplog.h new file mode 100644 index 00000000000..6c1644fe3ab --- /dev/null +++ b/src/mongo/db/oplog.h @@ -0,0 +1,149 @@ +// oplog.h - writing to and reading from oplog + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* + + local.oplog.$main is the default +*/ + +#pragma once + +#include "pdfile.h" +#include "db.h" +#include "dbhelpers.h" +#include "clientcursor.h" +#include "../client/dbclient.h" +#include "../util/optime.h" +#include "../util/timer.h" + +namespace mongo { + + void createOplog(); + + void _logOpObjRS(const BSONObj& op); + + /** Write operation to the log (local.oplog.$main) + + @param opstr + "i" insert + "u" update + "d" delete + "c" db cmd + "n" no-op + "db" declares presence of a database (ns is set to the db name + '.') + + See _logOp() in oplog.cpp for more details. + */ + void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt = 0, bool *b = 0); + + void logKeepalive(); + + /** puts obj in the oplog as a comment (a no-op). Just for diags. + convention is + { msg : "text", ... } + */ + void logOpComment(const BSONObj& obj); + + void oplogCheckCloseDatabase( Database * db ); + + extern int __findingStartInitialTimeout; // configurable for testing + + class QueryPlan; + + /** Implements an optimized procedure for finding the first op in the oplog. 
*/ + class FindingStartCursor { + public: + + /** + * The cursor will attempt to find the first op in the oplog matching the + * 'ts' field of the qp's query. + */ + FindingStartCursor( const QueryPlan & qp ); + + /** @return true if the first matching op in the oplog has been found. */ + bool done() const { return !_findingStart; } + + /** @return cursor pointing to the first matching op, if done(). */ + shared_ptr<Cursor> cursor() { verify( 14835, done() ); return _c; } + + /** Iterate the cursor, to continue trying to find matching op. */ + void next(); + + /** Yield cursor, if not done(). */ + bool prepareToYield() { + if ( _findingStartCursor ) { + return _findingStartCursor->prepareToYield( _yieldData ); + } + return false; + } + + /** Recover from cursor yield. */ + void recoverFromYield() { + if ( _findingStartCursor ) { + if ( !ClientCursor::recoverFromYield( _yieldData ) ) { + _findingStartCursor.reset( 0 ); + msgassertedNoTrace( 15889, "FindingStartCursor::recoverFromYield() failed to recover" ); + } + } + } + private: + enum FindingStartMode { Initial, FindExtent, InExtent }; + const QueryPlan &_qp; + bool _findingStart; + FindingStartMode _findingStartMode; + auto_ptr< CoveredIndexMatcher > _matcher; + Timer _findingStartTimer; + ClientCursor::CleanupPointer _findingStartCursor; + shared_ptr<Cursor> _c; + ClientCursor::YieldData _yieldData; + DiskLoc extentFirstLoc( const DiskLoc &rec ); + + DiskLoc prevExtentFirstLoc( const DiskLoc &rec ); + void createClientCursor( const DiskLoc &startLoc = DiskLoc() ); + void destroyClientCursor() { + _findingStartCursor.reset( 0 ); + } + void init(); + bool firstDocMatchesOrEmpty() const; + }; + + class Sync { + protected: + string hn; + public: + Sync(const string& hostname) : hn(hostname) {} + virtual ~Sync() {} + virtual BSONObj getMissingDoc(const BSONObj& o); + + /** + * If applyOperation_inlock should be called again after an update fails. + */ + virtual bool shouldRetry(const BSONObj& o); + }; + + void pretouchOperation(const BSONObj& op); + void pretouchN(vector<BSONObj>&, unsigned a, unsigned b); + + /** + * take an op and apply locally + * used for applying from an oplog + * @param fromRepl really from replication or for testing/internal/command/etc... + * Returns if the op was an update that could not be applied (true on failure) + */ + bool applyOperation_inlock(const BSONObj& op , bool fromRepl = true ); +} diff --git a/src/mongo/db/oplogreader.h b/src/mongo/db/oplogreader.h new file mode 100644 index 00000000000..6efd1469c01 --- /dev/null +++ b/src/mongo/db/oplogreader.h @@ -0,0 +1,121 @@ +/** @file oplogreader.h */ + +#pragma once + +#include "../client/dbclient.h" +#include "../client/constants.h" +#include "dbhelpers.h" + +namespace mongo { + + /* started abstracting out the querying of the primary/master's oplog + still fairly awkward but a start. 
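+
+       Tailing sketch (illustrative; 'host', 'lastOpTimeFetched' and processOp() are
+       placeholders, and error/reconnect handling is omitted):
+
+           OplogReader r;
+           if( r.connect(host) ) {
+               r.tailingQueryGTE( "local.oplog.$main", lastOpTimeFetched );
+               while( r.more() )
+                   processOp( r.nextSafe() ); // apply each op locally
+           }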
+ */ + class OplogReader { + shared_ptr<DBClientConnection> _conn; + shared_ptr<DBClientCursor> cursor; + public: + OplogReader() { } + ~OplogReader() { } + void resetCursor() { cursor.reset(); } + void resetConnection() { + cursor.reset(); + _conn.reset(); + } + DBClientConnection* conn() { return _conn.get(); } + BSONObj findOne(const char *ns, const Query& q) { + return conn()->findOne(ns, q, 0, QueryOption_SlaveOk); + } + BSONObj getLastOp(const char *ns) { + return findOne(ns, Query().sort(reverseNaturalObj)); + } + + /* ok to call if already connected */ + bool connect(string hostname); + + bool connect(const BSONObj& rid, const int from, const string& to); + + void tailCheck() { + if( cursor.get() && cursor->isDead() ) { + log() << "repl: old cursor isDead, will initiate a new one" << endl; + resetCursor(); + } + } + + bool haveCursor() { return cursor.get() != 0; } + + /** this is ok but commented out as when used one should consider if QueryOption_OplogReplay + is needed; if not fine, but if so, need to change. + *//* + void query(const char *ns, const BSONObj& query) { + assert( !haveCursor() ); + cursor.reset( _conn->query(ns, query, 0, 0, 0, QueryOption_SlaveOk).release() ); + }*/ + + /** this can be used; it is commented out as it does not indicate + QueryOption_OplogReplay and that is likely important. could be uncommented + just need to add that. + */ + /* + void queryGTE(const char *ns, OpTime t) { + BSONObjBuilder q; + q.appendDate("$gte", t.asDate()); + BSONObjBuilder q2; + q2.append("ts", q.done()); + query(ns, q2.done()); + } + */ + + void tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields=0) { + assert( !haveCursor() ); + log(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl; + cursor.reset( _conn->query( ns, query, 0, 0, fields, + QueryOption_CursorTailable | QueryOption_SlaveOk | QueryOption_OplogReplay | + /* TODO: slaveOk maybe shouldn't use? */ + QueryOption_AwaitData + ).release() ); + } + + void tailingQueryGTE(const char *ns, OpTime t, const BSONObj* fields=0) { + BSONObjBuilder q; + q.appendDate("$gte", t.asDate()); + BSONObjBuilder query; + query.append("ts", q.done()); + tailingQuery(ns, query.done(), fields); + } + + /* Do a tailing query, but only send the ts field back. */ + void ghostQueryGTE(const char *ns, OpTime t) { + const BSONObj fields = BSON("ts" << 1 << "_id" << 0); + return tailingQueryGTE(ns, t, &fields); + } + + bool more() { + uassert( 15910, "Doesn't have cursor for reading oplog", cursor.get() ); + return cursor->more(); + } + + bool moreInCurrentBatch() { + uassert( 15911, "Doesn't have cursor for reading oplog", cursor.get() ); + return cursor->moreInCurrentBatch(); + } + + /* old mongod's can't do the await flag... */ + bool awaitCapable() { + return cursor->hasResultFlag(ResultFlag_AwaitCapable); + } + + void peek(vector<BSONObj>& v, int n) { + if( cursor.get() ) + cursor->peek(v,n); + } + BSONObj nextSafe() { return cursor->nextSafe(); } + BSONObj next() { return cursor->next(); } + void putBack(BSONObj op) { cursor->putBack(op); } + + private: + bool commonConnect(const string& hostName); + bool passthroughHandshake(const BSONObj& rid, const int f); + }; + +} diff --git a/src/mongo/db/ops/count.cpp b/src/mongo/db/ops/count.cpp new file mode 100644 index 00000000000..3c183596b9d --- /dev/null +++ b/src/mongo/db/ops/count.cpp @@ -0,0 +1,103 @@ +// count.cpp + +/** + * Copyright (C) 2008 10gen Inc. 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "count.h" + +#include "../client.h" +#include "../clientcursor.h" +#include "../namespace.h" +#include "../queryutil.h" + +namespace mongo { + + long long runCount( const char *ns, const BSONObj &cmd, string &err ) { + Client::Context cx(ns); + NamespaceDetails *d = nsdetails( ns ); + if ( !d ) { + err = "ns missing"; + return -1; + } + BSONObj query = cmd.getObjectField("query"); + + // count of all objects + if ( query.isEmpty() ) { + return applySkipLimit( d->stats.nrecords , cmd ); + } + + string exceptionInfo; + long long count = 0; + long long skip = cmd["skip"].numberLong(); + long long limit = cmd["limit"].numberLong(); + bool simpleEqualityMatch; + shared_ptr<Cursor> cursor = NamespaceDetailsTransient::getCursor( ns, query, BSONObj(), false, &simpleEqualityMatch ); + ClientCursor::CleanupPointer ccPointer; + ElapsedTracker timeToStartYielding( 256, 20 ); + try { + while( cursor->ok() ) { + if ( !ccPointer ) { + if ( timeToStartYielding.intervalHasElapsed() ) { + // Lazily construct a ClientCursor, avoiding a performance regression when scanning a very + // small number of documents. + ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) ); + } + } + else if ( !ccPointer->yieldSometimes( simpleEqualityMatch ? ClientCursor::DontNeed : ClientCursor::MaybeCovered ) || + !cursor->ok() ) { + break; + } + + // With simple equality matching there is no need to use the matcher because the bounds + // are enforced by the FieldRangeVectorIterator and only key fields have constraints. There + // is no need to do key deduping because an exact value is specified in the query for all key + // fields and duplicate keys are not allowed per document. + // NOTE In the distant past we used a min/max bounded BtreeCursor with a shallow + // equality comparison to check for matches in the simple match case. That may be + // more performant, but I don't think we've measured the performance. + if ( simpleEqualityMatch || + ( cursor->currentMatches() && !cursor->getsetdup( cursor->currLoc() ) ) ) { + + if ( skip > 0 ) { + --skip; + } + else { + ++count; + if ( limit > 0 && count >= limit ) { + break; + } + } + } + cursor->advance(); + } + ccPointer.reset(); + return count; + + } catch ( const DBException &e ) { + exceptionInfo = e.toString(); + } catch ( const std::exception &e ) { + exceptionInfo = e.what(); + } catch ( ... ) { + exceptionInfo = "unknown exception"; + } + // Historically we have returned zero in many count assertion cases - see SERVER-2291. + log() << "Count with ns: " << ns << " and query: " << query + << " failed with exception: " << exceptionInfo + << endl; + return 0; + } + +} // namespace mongo diff --git a/src/mongo/db/ops/count.h b/src/mongo/db/ops/count.h new file mode 100644 index 00000000000..807741e1253 --- /dev/null +++ b/src/mongo/db/ops/count.h @@ -0,0 +1,30 @@ +// count.h + +/** + * Copyright (C) 2008 10gen Inc. 
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "../jsobj.h"
+#include "../diskloc.h"
+
+namespace mongo {
+
+    /**
+     * { count: "collectionname"[, query: <query>] }
+     * @return -1 if the ns does not exist; 0 if the count failed for another reason
+     *         (historically -- see SERVER-2291); otherwise the match count.
+     */
+    long long runCount(const char *ns, const BSONObj& cmd, string& err);
+
+} // namespace mongo
diff --git a/src/mongo/db/ops/delete.cpp b/src/mongo/db/ops/delete.cpp
new file mode 100644
index 00000000000..e33611c151e
--- /dev/null
+++ b/src/mongo/db/ops/delete.cpp
@@ -0,0 +1,158 @@
+// delete.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "delete.h"
+#include "../queryoptimizer.h"
+#include "../oplog.h"
+
+namespace mongo {
+
+    /* ns:      namespace, e.g. <database>.<collection>
+       pattern: the "where" clause / criteria
+       justOne: stop after 1 match
+       god:     allow access to system namespaces, and don't yield
+    */
+    long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
+        if( !god ) {
+            if ( strstr(ns, ".system.") ) {
+                /* note a delete from system.indexes would corrupt the db
+                   if done here, as there are pointers into those objects in
+                   NamespaceDetails.
+                */
+                uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
+            }
+            if ( strchr( ns , '$' ) ) {
+                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
+                uassert( 10100 , "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
+            }
+        }
+
+        {
+            NamespaceDetails *d = nsdetails( ns );
+            if ( ! d )
+                return 0;
+            uassert( 10101 , "can't remove from a capped collection" , !
d->capped ); + } + + long long nDeleted = 0; + + shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 ); + + if( !creal->ok() ) + return nDeleted; + + shared_ptr< Cursor > cPtr = creal; + auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) ); + cc->setDoingDeletes( true ); + + CursorId id = cc->cursorid(); + + bool justOne = justOneOrig; + bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic()); + + do { + // TODO: we can generalize this I believe + // + bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern ); + if ( ! willNeedRecord ) { + // TODO: this is a total hack right now + // check if the index full encompasses query + + if ( pattern.nFields() == 1 && + str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) ) + willNeedRecord = true; + } + + if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) { + cc.release(); // has already been deleted elsewhere + // TODO should we assert or something? + break; + } + if ( !cc->ok() ) { + break; // if we yielded, could have hit the end + } + + // this way we can avoid calling updateLocation() every time (expensive) + // as well as some other nuances handled + cc->setDoingDeletes( true ); + + DiskLoc rloc = cc->currLoc(); + BSONObj key = cc->currKey(); + + bool match = creal->currentMatches(); + bool dup = cc->c()->getsetdup(rloc); + + if ( ! cc->advance() ) + justOne = true; + + if ( ! match ) + continue; + + assert( !dup ); // can't be a dup, we deleted it! + + if ( !justOne ) { + /* NOTE: this is SLOW. this is not good, noteLocation() was designed to be called across getMore + blocks. here we might call millions of times which would be bad. + */ + cc->c()->prepareToTouchEarlierIterate(); + } + + if ( logop ) { + BSONElement e; + if( BSONObj( rloc.rec() ).getObjectID( e ) ) { + BSONObjBuilder b; + b.append( e ); + bool replJustOne = true; + logOp( "d", ns, b.done(), 0, &replJustOne ); + } + else { + problem() << "deleted object without id, not logging" << endl; + } + } + + if ( rs ) + rs->goingToDelete( rloc.obj() /*cc->c->current()*/ ); + + theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc); + nDeleted++; + if ( justOne ) { + break; + } + cc->c()->recoverFromTouchingEarlierIterate(); + + if( !god ) + getDur().commitIfNeeded(); + + if( debug && god && nDeleted == 100 ) + log() << "warning high number of deletes with god=true which could use significant memory" << endl; + } + while ( cc->ok() ); + + if ( cc.get() && ClientCursor::find( id , false ) == 0 ) { + // TODO: remove this and the id declaration above if this doesn't trigger + // if it does, then i'm very confused (ERH 06/2011) + error() << "this should be impossible" << endl; + printStackTrace(); + cc.release(); + } + + return nDeleted; + } + +} diff --git a/src/mongo/db/ops/delete.h b/src/mongo/db/ops/delete.h new file mode 100644 index 00000000000..a74b7a664bc --- /dev/null +++ b/src/mongo/db/ops/delete.h @@ -0,0 +1,33 @@ +// delete.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../../pch.h" +#include "../jsobj.h" +#include "../clientcursor.h" + +namespace mongo { + + class RemoveSaver; + + // If justOne is true, deletedId is set to the id of the deleted object. + long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop = false, bool god=false, RemoveSaver * rs=0); + + +} diff --git a/src/mongo/db/ops/query.cpp b/src/mongo/db/ops/query.cpp new file mode 100644 index 00000000000..15e3ed9053f --- /dev/null +++ b/src/mongo/db/ops/query.cpp @@ -0,0 +1,870 @@ +// query.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "pch.h" +#include "query.h" +#include "../pdfile.h" +#include "../jsobjmanipulator.h" +#include "../../bson/util/builder.h" +#include <time.h> +#include "../introspect.h" +#include "../btree.h" +#include "../../util/lruishmap.h" +#include "../json.h" +#include "../repl.h" +#include "../replutil.h" +#include "../scanandorder.h" +#include "../security.h" +#include "../curop-inl.h" +#include "../commands.h" +#include "../queryoptimizer.h" +#include "../lasterror.h" +#include "../../s/d_logic.h" +#include "../repl_block.h" +#include "../../server.h" +#include "../d_concurrency.h" + +namespace mongo { + + /* We cut off further objects once we cross this threshold; thus, you might get + a little bit more than this, it is a threshold rather than a limit. 
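+
+       For example, with the 4MB threshold defined below and documents of roughly
+       1MB each, the document that first pushes the buffer past 4MB is still
+       appended, so a single batch can come to roughly the threshold plus one
+       object's size.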
+ */ + const int MaxBytesToReturnToClientAtOnce = 4 * 1024 * 1024; + + //ns->query->DiskLoc +// LRUishMap<BSONObj,DiskLoc,5> lrutest(123); + + extern bool useCursors; + extern bool useHints; + + bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { + try { + return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions); + } + catch( SendStaleConfigException& ){ + throw; + } + catch ( AssertionException& e ) { + assert( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode ); + + e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" ); + curop.debug().exceptionInfo = e.getInfo(); + } + anObjBuilder.append("errmsg", "db assertion failure"); + anObjBuilder.append("ok", 0.0); + BSONObj x = anObjBuilder.done(); + b.appendBuf((void*) x.objdata(), x.objsize()); + return true; + } + + + BSONObj id_obj = fromjson("{\"_id\":1}"); + BSONObj empty_obj = fromjson("{}"); + + + //int dump = 0; + + /* empty result for error conditions */ + QueryResult* emptyMoreResult(long long cursorid) { + BufBuilder b(32768); + b.skip(sizeof(QueryResult)); + QueryResult *qr = (QueryResult *) b.buf(); + qr->cursorId = 0; // 0 indicates no more data to retrieve. + qr->startingFrom = 0; + qr->len = b.len(); + qr->setOperation(opReply); + qr->initializeResultFlags(); + qr->nReturned = 0; + b.decouple(); + return qr; + } + + QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) { + exhaust = false; + ClientCursor::Pointer p(cursorid); + ClientCursor *cc = p.c(); + + int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce; + + BufBuilder b( bufSize ); + b.skip(sizeof(QueryResult)); + int resultFlags = ResultFlag_AwaitCapable; + int start = 0; + int n = 0; + + if ( unlikely(!cc) ) { + LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl; + cursorid = 0; + resultFlags = ResultFlag_CursorNotFound; + } + else { + // check for spoofing of the ns such that it does not match the one originally there for the cursor + uassert(14833, "auth error", str::equals(ns, cc->ns().c_str())); + + if ( pass == 0 ) + cc->updateSlaveLocation( curop ); + + int queryOptions = cc->queryOptions(); + + curop.debug().query = cc->query(); + + start = cc->pos(); + Cursor *c = cc->c(); + c->checkLocation(); + DiskLoc last; + + scoped_ptr<Projection::KeyOnly> keyFieldsOnly; + if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields ) + keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) ); + + // This manager may be stale, but it's the state of chunking when the cursor was created. + ShardChunkManagerPtr manager = cc->getChunkManager(); + + while ( 1 ) { + if ( !c->ok() ) { + if ( c->tailable() ) { + /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however + advance() can still be retries as a reactivation attempt. when there is new data, it will + return true. that's what we are doing here. + */ + if ( c->advance() ) + continue; + + if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) { + return 0; + } + + break; + } + p.release(); + bool ok = ClientCursor::erase(cursorid); + assert(ok); + cursorid = 0; + cc = 0; + break; + } + + // in some cases (clone collection) there won't be a matcher + if ( c->matcher() && !c->matcher()->matchesCurrent( c ) ) { + } + else if ( manager && ! 
manager->belongsToMe( cc ) ){ + LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl; + } + else { + if( c->getsetdup(c->currLoc()) ) { + //out() << " but it's a dup \n"; + } + else { + last = c->currLoc(); + n++; + + if ( keyFieldsOnly ) { + fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) ); + } + else { + BSONObj js = c->current(); + // show disk loc should be part of the main query, not in an $or clause, so this should be ok + fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0)); + } + + if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) { + c->advance(); + cc->incPos( n ); + break; + } + } + } + c->advance(); + + if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) { + ClientCursor::erase(cursorid); + cursorid = 0; + cc = 0; + p.deleted(); + break; + } + } + + if ( cc ) { + cc->updateLocation(); + cc->mayUpgradeStorage(); + cc->storeOpForSlave( last ); + exhaust = cc->queryOptions() & QueryOption_Exhaust; + } + } + + QueryResult *qr = (QueryResult *) b.buf(); + qr->len = b.len(); + qr->setOperation(opReply); + qr->_resultFlags() = resultFlags; + qr->cursorId = cursorid; + qr->startingFrom = start; + qr->nReturned = n; + b.decouple(); + + return qr; + } + + class ExplainBuilder { + // Note: by default we filter out allPlans and oldPlan in the shell's + // explain() function. If you add any recursive structures, make sure to + // edit the JS to make sure everything gets filtered. + public: + ExplainBuilder() : _i() {} + void ensureStartScan() { + if ( !_a.get() ) { + _a.reset( new BSONArrayBuilder() ); + } + } + void noteCursor( Cursor *c ) { + BSONObjBuilder b( _a->subobjStart() ); + b << "cursor" << c->toString() << "indexBounds" << c->prettyIndexBounds(); + b.done(); + } + void noteScan( Cursor *c, long long nscanned, long long nscannedObjects, int n, bool scanAndOrder, + int millis, bool hint, int nYields , int nChunkSkips , bool indexOnly ) { + if ( _i == 1 ) { + _c.reset( new BSONArrayBuilder() ); + *_c << _b->obj(); + } + if ( _i == 0 ) { + _b.reset( new BSONObjBuilder() ); + } + else { + _b.reset( new BSONObjBuilder( _c->subobjStart() ) ); + } + *_b << "cursor" << c->toString(); + _b->appendNumber( "nscanned", nscanned ); + _b->appendNumber( "nscannedObjects", nscannedObjects ); + *_b << "n" << n; + + if ( scanAndOrder ) + *_b << "scanAndOrder" << true; + + *_b << "millis" << millis; + + *_b << "nYields" << nYields; + *_b << "nChunkSkips" << nChunkSkips; + *_b << "isMultiKey" << c->isMultiKey(); + *_b << "indexOnly" << indexOnly; + + *_b << "indexBounds" << c->prettyIndexBounds(); + + c->explainDetails( *_b ); + + if ( !hint ) { + *_b << "allPlans" << _a->arr(); + } + if ( _i != 0 ) { + _b->done(); + } + _a.reset( 0 ); + ++_i; + } + BSONObj finishWithSuffix( long long nscanned, long long nscannedObjects, int n, int millis, const BSONObj &suffix ) { + if ( _i > 1 ) { + BSONObjBuilder b; + b << "clauses" << _c->arr(); + b.appendNumber( "nscanned", nscanned ); + b.appendNumber( "nscannedObjects", nscannedObjects ); + b << "n" << n; + b << "millis" << millis; + b.appendElements( suffix ); + return b.obj(); + } + else { + stringstream host; + host << getHostNameCached() << ":" << cmdLine.port; + *_b << "server" << host.str(); + _b->appendElements( suffix ); + return _b->obj(); + } + } + private: + auto_ptr< BSONArrayBuilder > _a; + auto_ptr< BSONObjBuilder > _b; + auto_ptr< BSONArrayBuilder > _c; + int _i; + }; + + // Implements database 'query' 
requests using the query optimizer's QueryOp interface + class UserQueryOp : public QueryOp { + public: + + UserQueryOp( const ParsedQuery& pq, Message &response, ExplainBuilder &eb, CurOp &curop ) : + _buf( 32768 ) , // TODO be smarter here + _pq( pq ) , + _ntoskip( pq.getSkip() ) , + _nscanned(0), _oldNscanned(0), _nscannedObjects(0), _oldNscannedObjects(0), + _n(0), + _oldN(0), + _nYields(), + _nChunkSkips(), + _chunkManager( shardingState.needShardChunkManager(pq.ns()) ? + shardingState.getShardChunkManager(pq.ns()) : ShardChunkManagerPtr() ), + _inMemSort(false), + _capped(false), + _saveClientCursor(false), + _wouldSaveClientCursor(false), + _oplogReplay( pq.hasOption( QueryOption_OplogReplay) ), + _response( response ), + _eb( eb ), + _curop( curop ), + _yieldRecoveryFailed() + {} + + virtual void _init() { + // only need to put the QueryResult fields there if we're building the first buffer in the message. + if ( _response.empty() ) { + _buf.skip( sizeof( QueryResult ) ); + } + + if ( _oplogReplay ) { + _findingStartCursor.reset( new FindingStartCursor( qp() ) ); + _capped = true; + } + else { + _c = qp().newCursor( DiskLoc() , _pq.getNumToReturn() + _pq.getSkip() ); + _capped = _c->capped(); + + // setup check for if we can only use index to extract + if ( _c->modifiedKeys() == false && _c->isMultiKey() == false && _pq.getFields() ) { + _keyFieldsOnly.reset( _pq.getFields()->checkKey( _c->indexKeyPattern() ) ); + } + } + + if ( qp().scanAndOrderRequired() ) { + _inMemSort = true; + _so.reset( new ScanAndOrder( _pq.getSkip() , _pq.getNumToReturn() , _pq.getOrder(), qp().multikeyFrs() ) ); + } + + if ( _pq.isExplain() ) { + _eb.noteCursor( _c.get() ); + } + + } + + virtual bool prepareToYield() { + if ( _findingStartCursor.get() ) { + return _findingStartCursor->prepareToYield(); + } + else { + if ( _c && !_cc ) { + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , _pq.ns() ) ); + } + if ( _cc ) { + return _cc->prepareToYield( _yieldData ); + } + } + // no active cursor - ok to yield + return true; + } + + virtual void recoverFromYield() { + _nYields++; + + if ( _findingStartCursor.get() ) { + _findingStartCursor->recoverFromYield(); + } + else if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) { + _yieldRecoveryFailed = true; + _c.reset(); + _cc.reset(); + _so.reset(); + + if ( _capped ) { + msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun during query: " << _pq.ns() ); + } + else if ( qp().mustAssertOnYieldFailure() ) { + msgassertedNoTrace( 15890, str::stream() << "UserQueryOp::recoverFromYield() failed to recover: " << _pq.ns() ); + } + else { + // we don't fail query since we're fine with returning partial data if collection dropped + + // todo: this is wrong. the cursor could be gone if closeAllDatabases command just ran + } + + } + } + + virtual long long nscanned() { + if ( _findingStartCursor.get() ) { + return 0; // should only be one query plan, so value doesn't really matter. + } + return _c.get() ? _c->nscanned() : _nscanned; + } + + virtual void next() { + if ( _findingStartCursor.get() ) { + if ( !_findingStartCursor->done() ) { + _findingStartCursor->next(); + } + if ( _findingStartCursor->done() ) { + _c = _findingStartCursor->cursor(); + _findingStartCursor.reset( 0 ); + } + _capped = true; + return; + } + + if ( !_c || !_c->ok() ) { + finish( false ); + return; + } + + bool mayCreateCursor1 = _pq.wantMore() && ! 
_inMemSort && _pq.getNumToReturn() != 1 && useCursors; + + if( 0 ) { + cout << "SCANNING this: " << this << " key: " << _c->currKey() << " obj: " << _c->current() << endl; + } + + if ( _pq.getMaxScan() && _nscanned >= _pq.getMaxScan() ) { + finish( true ); //? + return; + } + + _nscanned = _c->nscanned(); + if ( !matcher( _c )->matchesCurrent(_c.get() , &_details ) ) { + // not a match, continue onward + if ( _details._loadedObject ) + _nscannedObjects++; + } + else { + _nscannedObjects++; + DiskLoc cl = _c->currLoc(); + if ( _chunkManager && ! _chunkManager->belongsToMe( cl.obj() ) ) { // TODO: should make this covered at some point + _nChunkSkips++; + // log() << "TEMP skipping un-owned chunk: " << _c->current() << endl; + } + else if( _c->getsetdup(cl) ) { + // dup + } + else { + // got a match. + + if ( _inMemSort ) { + // note: no cursors for non-indexed, ordered results. results must be fairly small. + _so->add( _pq.returnKey() ? _c->currKey() : _c->current(), _pq.showDiskLoc() ? &cl : 0 ); + } + else if ( _ntoskip > 0 ) { + _ntoskip--; + } + else { + if ( _pq.isExplain() ) { + _n++; + if ( n() >= _pq.getNumToReturn() && !_pq.wantMore() ) { + // .limit() was used, show just that much. + finish( true ); //? + return; + } + } + else { + + if ( _pq.returnKey() ) { + BSONObjBuilder bb( _buf ); + bb.appendKeys( _c->indexKeyPattern() , _c->currKey() ); + bb.done(); + } + else if ( _keyFieldsOnly ) { + fillQueryResultFromObj( _buf , 0 , _keyFieldsOnly->hydrate( _c->currKey() ) ); + } + else { + BSONObj js = _c->current(); + assert( js.isValid() ); + + if ( _oplogReplay ) { + BSONElement e = js["ts"]; + if ( e.type() == Date || e.type() == Timestamp ) + _slaveReadTill = e._opTime(); + } + + fillQueryResultFromObj( _buf , _pq.getFields() , js , (_pq.showDiskLoc() ? &cl : 0)); + } + _n++; + if ( ! _c->supportGetMore() ) { + if ( _pq.enough( n() ) || _buf.len() >= MaxBytesToReturnToClientAtOnce ) { + finish( true ); + return; + } + } + else if ( _pq.enoughForFirstBatch( n() , _buf.len() ) ) { + /* if only 1 requested, no cursor saved for efficiency...we assume it is findOne() */ + if ( mayCreateCursor1 ) { + _wouldSaveClientCursor = true; + if ( _c->advance() ) { + // more...so save a cursor + _saveClientCursor = true; + } + } + finish( true ); + return; + } + } + } + } + } + _c->advance(); + } + + // this plan won, so set data for response broadly + void finish( bool stop ) { + massert( 13638, "client cursor dropped during explain query yield", !_pq.isExplain() || _c.get() ); + + if ( _pq.isExplain() ) { + _n = _inMemSort ? _so->size() : _n; + } + else if ( _inMemSort ) { + if( _so.get() ) + _so->fill( _buf, _pq.getFields() , _n ); + } + + if ( _c.get() ) { + _nscanned = _c->nscanned(); + + if ( _pq.hasOption( QueryOption_CursorTailable ) && _pq.getNumToReturn() != 1 ) + _c->setTailable(); + + // If the tailing request succeeded. 
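+                // a tailable cursor is saved even when it is currently at EOF, so a later
+                // getMore can return documents appended to the capped collection after
+                // this batch was built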
+ if ( _c->tailable() ) + _saveClientCursor = true; + } + + if ( _pq.isExplain() ) { + _eb.noteScan( _c.get(), _nscanned, _nscannedObjects, _n, scanAndOrderRequired(), + _curop.elapsedMillis(), useHints && !_pq.getHint().eoo(), _nYields , + _nChunkSkips, _keyFieldsOnly.get() > 0 ); + } + else { + if ( _buf.len() ) { + _response.appendData( _buf.buf(), _buf.len() ); + _buf.decouple(); + } + } + + if ( stop ) { + setStop(); + } + else { + setComplete(); + } + + } + + void finishExplain( const BSONObj &suffix ) { + BSONObj obj = _eb.finishWithSuffix( totalNscanned(), nscannedObjects(), n(), _curop.elapsedMillis(), suffix); + fillQueryResultFromObj(_buf, 0, obj); + _n = 1; + _oldN = 0; + _response.appendData( _buf.buf(), _buf.len() ); + _buf.decouple(); + } + + virtual bool mayRecordPlan() const { + return !_yieldRecoveryFailed && ( _pq.getNumToReturn() != 1 ) && ( ( _n > _pq.getNumToReturn() / 2 ) || ( complete() && !stopRequested() ) ); + } + + virtual QueryOp *_createChild() const { + if ( _pq.isExplain() ) { + _eb.ensureStartScan(); + } + UserQueryOp *ret = new UserQueryOp( _pq, _response, _eb, _curop ); + ret->_oldN = n(); + ret->_oldNscanned = totalNscanned(); + ret->_oldNscannedObjects = nscannedObjects(); + ret->_ntoskip = _ntoskip; + return ret; + } + + bool scanAndOrderRequired() const { return _inMemSort; } + shared_ptr<Cursor> cursor() { return _c; } + int n() const { return _oldN + _n; } + long long totalNscanned() const { return _nscanned + _oldNscanned; } + long long nscannedObjects() const { return _nscannedObjects + _oldNscannedObjects; } + bool saveClientCursor() const { return _saveClientCursor; } + bool wouldSaveClientCursor() const { return _wouldSaveClientCursor; } + + void finishForOplogReplay( ClientCursor * cc ) { + if ( _oplogReplay && ! _slaveReadTill.isNull() ) + cc->slaveReadTill( _slaveReadTill ); + + } + + ShardChunkManagerPtr getChunkManager(){ return _chunkManager; } + + private: + BufBuilder _buf; + const ParsedQuery& _pq; + scoped_ptr<Projection::KeyOnly> _keyFieldsOnly; + + long long _ntoskip; + long long _nscanned; + long long _oldNscanned; + long long _nscannedObjects; + long long _oldNscannedObjects; + int _n; // found so far + int _oldN; + + int _nYields; + int _nChunkSkips; + + MatchDetails _details; + + ShardChunkManagerPtr _chunkManager; + + bool _inMemSort; + auto_ptr< ScanAndOrder > _so; + + shared_ptr<Cursor> _c; + ClientCursor::CleanupPointer _cc; + ClientCursor::YieldData _yieldData; + + bool _capped; + bool _saveClientCursor; + bool _wouldSaveClientCursor; + bool _oplogReplay; + auto_ptr< FindingStartCursor > _findingStartCursor; + + Message &_response; + ExplainBuilder &_eb; + CurOp &_curop; + OpTime _slaveReadTill; + + bool _yieldRecoveryFailed; + }; + + /* run a query -- includes checking for and running a Command \ + @return points to ns if exhaust mode. 
0=normal mode + */ + const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { + shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) ); + ParsedQuery& pq( *pq_shared ); + int ntoskip = q.ntoskip; + BSONObj jsobj = q.query; + int queryOptions = q.queryOptions; + const char *ns = q.ns; + + if( logLevel >= 2 ) + log() << "runQuery called " << ns << " " << jsobj << endl; + + curop.debug().ns = ns; + curop.debug().ntoreturn = pq.getNumToReturn(); + curop.setQuery(jsobj); + + if ( pq.couldBeCommand() ) { + BufBuilder bb; + bb.skip(sizeof(QueryResult)); + BSONObjBuilder cmdResBuf; + if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { + curop.debug().iscommand = true; + curop.debug().query = jsobj; + curop.markCommand(); + + auto_ptr< QueryResult > qr; + qr.reset( (QueryResult *) bb.buf() ); + bb.decouple(); + qr->setResultFlagsToOk(); + qr->len = bb.len(); + curop.debug().responseLength = bb.len(); + qr->setOperation(opReply); + qr->cursorId = 0; + qr->startingFrom = 0; + qr->nReturned = 1; + result.setData( qr.release(), true ); + } + else { + uasserted(13530, "bad or malformed command request?"); + } + return 0; + } + + /* --- regular query --- */ + + int n = 0; + BSONElement hint = useHints ? pq.getHint() : BSONElement(); + bool explain = pq.isExplain(); + bool snapshot = pq.isSnapshot(); + BSONObj order = pq.getOrder(); + BSONObj query = pq.getFilter(); + + /* The ElemIter will not be happy if this isn't really an object. So throw exception + here when that is true. + (Which may indicate bad data from client.) + */ + if ( query.objsize() == 0 ) { + out() << "Bad query object?\n jsobj:"; + out() << jsobj.toString() << "\n query:"; + out() << query.toString() << endl; + uassert( 10110 , "bad query object", false); + } + + Client::ReadContext ctx( ns , dbpath ); // read locks + + replVerifyReadsOk(pq); + + if ( pq.hasOption( QueryOption_CursorTailable ) ) { + NamespaceDetails *d = nsdetails( ns ); + uassert( 13051, "tailable cursor requested on non capped collection", d && d->capped ); + const BSONObj nat1 = BSON( "$natural" << 1 ); + if ( order.isEmpty() ) { + order = nat1; + } + else { + uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 ); + } + } + + BSONObj snapshotHint; // put here to keep the data in scope + if( snapshot ) { + NamespaceDetails *d = nsdetails(ns); + if ( d ) { + int i = d->findIdIndex(); + if( i < 0 ) { + if ( strstr( ns , ".system." ) == 0 ) + log() << "warning: no _id index on $snapshot query, ns:" << ns << endl; + } + else { + /* [dm] the name of an _id index tends to vary, so we build the hint the hard way here. + probably need a better way to specify "use the _id index" as a hint. if someone is + in the query optimizer please fix this then! + */ + BSONObjBuilder b; + b.append("$hint", d->idx(i).indexName()); + snapshotHint = b.obj(); + hint = snapshotHint.firstElement(); + } + } + } + + if ( ! 
(explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { + + bool nsFound = false; + bool indexFound = false; + + BSONObj resObject; + Client& c = cc(); + bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); + if ( nsFound == false || indexFound == true ) { + BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); + bb.skip(sizeof(QueryResult)); + + curop.debug().idhack = true; + if ( found ) { + n = 1; + fillQueryResultFromObj( bb , pq.getFields() , resObject ); + } + auto_ptr< QueryResult > qr; + qr.reset( (QueryResult *) bb.buf() ); + bb.decouple(); + qr->setResultFlagsToOk(); + qr->len = bb.len(); + + curop.debug().responseLength = bb.len(); + qr->setOperation(opReply); + qr->cursorId = 0; + qr->startingFrom = 0; + qr->nReturned = n; + result.setData( qr.release(), true ); + return NULL; + } + } + + // regular, not QO bypass query + + BSONObj oldPlan; + if ( explain && ! pq.hasIndexSpecifier() ) { + MultiPlanScanner mps( ns, query, order ); + if ( mps.usingCachedPlan() ) + oldPlan = mps.oldExplain(); + } + auto_ptr< MultiPlanScanner > mps( new MultiPlanScanner( ns, query, order, &hint, !explain, pq.getMin(), pq.getMax(), false, true ) ); + BSONObj explainSuffix; + if ( explain ) { + BSONObjBuilder bb; + if ( !oldPlan.isEmpty() ) + bb.append( "oldPlan", oldPlan.firstElement().embeddedObject().firstElement().embeddedObject() ); + explainSuffix = bb.obj(); + } + ExplainBuilder eb; + UserQueryOp original( pq, result, eb, curop ); + shared_ptr< UserQueryOp > o = mps->runOp( original ); + UserQueryOp &dqo = *o; + if ( ! dqo.complete() ) + throw MsgAssertionException( dqo.exception() ); + if ( explain ) { + dqo.finishExplain( explainSuffix ); + } + n = dqo.n(); + long long nscanned = dqo.totalNscanned(); + curop.debug().scanAndOrder = dqo.scanAndOrderRequired(); + + shared_ptr<Cursor> cursor = dqo.cursor(); + if( logLevel >= 5 ) + log() << " used cursor: " << cursor.get() << endl; + long long cursorid = 0; + const char * exhaust = 0; + if ( dqo.saveClientCursor() || ( dqo.wouldSaveClientCursor() && mps->mayRunMore() ) ) { + ClientCursor *cc; + bool moreClauses = mps->mayRunMore(); + if ( moreClauses ) { + // this MultiCursor will use a dumb NoOp to advance(), so no need to specify mayYield + shared_ptr< Cursor > multi( new MultiCursor( mps, cursor, dqo.matcher( cursor ), dqo ) ); + cc = new ClientCursor(queryOptions, multi, ns, jsobj.getOwned()); + } + else { + if( ! 
cursor->matcher() ) cursor->setMatcher( dqo.matcher( cursor ) ); + cc = new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() ); + } + + cc->setChunkManager( dqo.getChunkManager() ); + + cursorid = cc->cursorid(); + DEV tlog(2) << "query has more, cursorid: " << cursorid << endl; + cc->setPos( n ); + cc->pq = pq_shared; + cc->fields = pq.getFieldPtr(); + cc->originalMessage = m; + cc->updateLocation(); + if ( !cc->ok() && cc->c()->tailable() ) + DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl; + if( queryOptions & QueryOption_Exhaust ) { + exhaust = ns; + curop.debug().exhaust = true; + } + dqo.finishForOplogReplay(cc); + } + + QueryResult *qr = (QueryResult *) result.header(); + qr->cursorId = cursorid; + qr->setResultFlagsToOk(); + // qr->len is updated automatically by appendData() + curop.debug().responseLength = qr->len; + qr->setOperation(opReply); + qr->startingFrom = 0; + qr->nReturned = n; + + int duration = curop.elapsedMillis(); + bool dbprofile = curop.shouldDBProfile( duration ); + if ( dbprofile || duration >= cmdLine.slowMS ) { + curop.debug().nscanned = (int) nscanned; + curop.debug().ntoskip = ntoskip; + } + curop.debug().nreturned = n; + return exhaust; + } + +} // namespace mongo diff --git a/src/mongo/db/ops/query.h b/src/mongo/db/ops/query.h new file mode 100644 index 00000000000..3324b75fe16 --- /dev/null +++ b/src/mongo/db/ops/query.h @@ -0,0 +1,248 @@ +// query.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../../pch.h" +#include "../../util/net/message.h" +#include "../dbmessage.h" +#include "../jsobj.h" +#include "../diskloc.h" +#include "../projection.h" + +// struct QueryOptions, QueryResult, QueryResultFlags in: +#include "../../client/dbclient.h" + +namespace mongo { + + extern const int MaxBytesToReturnToClientAtOnce; + + QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& op, int pass, bool& exhaust); + + const char * runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result); + + /* This is for languages whose "objects" are not well ordered (JSON is well ordered). + [ { a : ... } , { b : ... } ] -> { a : ..., b : ... 
} + */ + inline BSONObj transformOrderFromArrayFormat(BSONObj order) { + /* note: this is slow, but that is ok as order will have very few pieces */ + BSONObjBuilder b; + char p[2] = "0"; + + while ( 1 ) { + BSONObj j = order.getObjectField(p); + if ( j.isEmpty() ) + break; + BSONElement e = j.firstElement(); + uassert( 10102 , "bad order array", !e.eoo()); + uassert( 10103 , "bad order array [2]", e.isNumber()); + b.append(e); + (*p)++; + uassert( 10104 , "too many ordering elements", *p <= '9'); + } + + return b.obj(); + } + + /** + * this represents a total user query + * includes fields from the query message, both possible query levels + * parses everything up front + */ + class ParsedQuery : boost::noncopyable { + public: + ParsedQuery( QueryMessage& qm ) + : _ns( qm.ns ) , _ntoskip( qm.ntoskip ) , _ntoreturn( qm.ntoreturn ) , _options( qm.queryOptions ) { + init( qm.query ); + initFields( qm.fields ); + } + ParsedQuery( const char* ns , int ntoskip , int ntoreturn , int queryoptions , const BSONObj& query , const BSONObj& fields ) + : _ns( ns ) , _ntoskip( ntoskip ) , _ntoreturn( ntoreturn ) , _options( queryoptions ) { + init( query ); + initFields( fields ); + } + + const char * ns() const { return _ns; } + bool isLocalDB() const { return strncmp(_ns, "local.", 6) == 0; } + + const BSONObj& getFilter() const { return _filter; } + Projection* getFields() const { return _fields.get(); } + shared_ptr<Projection> getFieldPtr() const { return _fields; } + + int getSkip() const { return _ntoskip; } + int getNumToReturn() const { return _ntoreturn; } + bool wantMore() const { return _wantMore; } + int getOptions() const { return _options; } + bool hasOption( int x ) const { return x & _options; } + + bool isExplain() const { return _explain; } + bool isSnapshot() const { return _snapshot; } + bool returnKey() const { return _returnKey; } + bool showDiskLoc() const { return _showDiskLoc; } + + const BSONObj& getMin() const { return _min; } + const BSONObj& getMax() const { return _max; } + const BSONObj& getOrder() const { return _order; } + const BSONElement& getHint() const { return _hint; } + int getMaxScan() const { return _maxScan; } + + bool couldBeCommand() const { + /* we assume you are using findOne() for running a cmd... */ + return _ntoreturn == 1 && strstr( _ns , ".$cmd" ); + } + + bool hasIndexSpecifier() const { + return ! _hint.eoo() || ! _min.isEmpty() || ! _max.isEmpty(); + } + + /* if ntoreturn is zero, we return up to 101 objects. on the subsequent getmore, there + is only a size limit. The idea is that on a find() where one doesn't use much results, + we don't return much, but once getmore kicks in, we start pushing significant quantities. + + The n limit (vs. size) is important when someone fetches only one small field from big + objects, which causes massive scanning server-side. + */ + bool enoughForFirstBatch( int n , int len ) const { + if ( _ntoreturn == 0 ) + return ( len > 1024 * 1024 ) || n >= 101; + return n >= _ntoreturn || len > MaxBytesToReturnToClientAtOnce; + } + + bool enough( int n ) const { + if ( _ntoreturn == 0 ) + return false; + return n >= _ntoreturn; + } + + private: + void init( const BSONObj& q ) { + _reset(); + uassert( 10105 , "bad skip value in query", _ntoskip >= 0); + + if ( _ntoreturn < 0 ) { + /* _ntoreturn greater than zero is simply a hint on how many objects to send back per + "cursor batch". + A negative number indicates a hard limit. 
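+
+                   e.g. (driver-side, illustrative): find().batchSize(5) arrives here as
+                   _ntoreturn == 5, a per-batch hint, while find().limit(-5) arrives as
+                   _ntoreturn == -5: return at most 5 objects and don't create a cursor.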
+ */ + _wantMore = false; + _ntoreturn = -_ntoreturn; + } + + + BSONElement e = q["query"]; + if ( ! e.isABSONObj() ) + e = q["$query"]; + + if ( e.isABSONObj() ) { + _filter = e.embeddedObject(); + _initTop( q ); + } + else { + _filter = q; + } + } + + void _reset() { + _wantMore = true; + _explain = false; + _snapshot = false; + _returnKey = false; + _showDiskLoc = false; + _maxScan = 0; + } + + void _initTop( const BSONObj& top ) { + BSONObjIterator i( top ); + while ( i.more() ) { + BSONElement e = i.next(); + const char * name = e.fieldName(); + + if ( strcmp( "$orderby" , name ) == 0 || + strcmp( "orderby" , name ) == 0 ) { + if ( e.type() == Object ) { + _order = e.embeddedObject(); + } + else if ( e.type() == Array ) { + _order = transformOrderFromArrayFormat( _order ); + } + else { + uasserted(13513, "sort must be an object or array"); + } + continue; + } + + if( *name == '$' ) { + name++; + if ( strcmp( "explain" , name ) == 0 ) + _explain = e.trueValue(); + else if ( strcmp( "snapshot" , name ) == 0 ) + _snapshot = e.trueValue(); + else if ( strcmp( "min" , name ) == 0 ) + _min = e.embeddedObject(); + else if ( strcmp( "max" , name ) == 0 ) + _max = e.embeddedObject(); + else if ( strcmp( "hint" , name ) == 0 ) + _hint = e; + else if ( strcmp( "returnKey" , name ) == 0 ) + _returnKey = e.trueValue(); + else if ( strcmp( "maxScan" , name ) == 0 ) + _maxScan = e.numberInt(); + else if ( strcmp( "showDiskLoc" , name ) == 0 ) + _showDiskLoc = e.trueValue(); + else if ( strcmp( "comment" , name ) == 0 ) { + ; // no-op + } + } + } + + if ( _snapshot ) { + uassert( 12001 , "E12001 can't sort with $snapshot", _order.isEmpty() ); + uassert( 12002 , "E12002 can't use hint with $snapshot", _hint.eoo() ); + } + + } + + void initFields( const BSONObj& fields ) { + if ( fields.isEmpty() ) + return; + _fields.reset( new Projection() ); + _fields->init( fields ); + } + + const char * const _ns; + const int _ntoskip; + int _ntoreturn; + BSONObj _filter; + BSONObj _order; + const int _options; + shared_ptr< Projection > _fields; + bool _wantMore; + bool _explain; + bool _snapshot; + bool _returnKey; + bool _showDiskLoc; + BSONObj _min; + BSONObj _max; + BSONElement _hint; + int _maxScan; + }; + + +} // namespace mongo + + diff --git a/src/mongo/db/ops/update.cpp b/src/mongo/db/ops/update.cpp new file mode 100644 index 00000000000..2abc6987218 --- /dev/null +++ b/src/mongo/db/ops/update.cpp @@ -0,0 +1,1308 @@ +// update.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "pch.h" +#include "query.h" +#include "../pdfile.h" +#include "../jsobjmanipulator.h" +#include "../queryoptimizer.h" +#include "../repl.h" +#include "../btree.h" +#include "../../util/stringutils.h" +#include "update.h" + +//#define DEBUGUPDATE(x) cout << x << endl; +#define DEBUGUPDATE(x) + +namespace mongo { + + const char* Mod::modNames[] = { "$inc", "$set", "$push", "$pushAll", "$pull", "$pullAll" , "$pop", "$unset" , + "$bitand" , "$bitor" , "$bit" , "$addToSet", "$rename", "$rename" + }; + unsigned Mod::modNamesNum = sizeof(Mod::modNames)/sizeof(char*); + + bool Mod::_pullElementMatch( BSONElement& toMatch ) const { + + if ( elt.type() != Object ) { + // if elt isn't an object, then comparison will work + return toMatch.woCompare( elt , false ) == 0; + } + + if ( matcherOnPrimitive ) + return matcher->matches( toMatch.wrap( "" ) ); + + if ( toMatch.type() != Object ) { + // looking for an object, so this can't match + return false; + } + + // now we have an object on both sides + return matcher->matches( toMatch.embeddedObject() ); + } + + template< class Builder > + void Mod::appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const { + BSONType a = in.type(); + BSONType b = elt.type(); + + if ( a == NumberDouble || b == NumberDouble ) { + ms.incType = NumberDouble; + ms.incdouble = elt.numberDouble() + in.numberDouble(); + } + else if ( a == NumberLong || b == NumberLong ) { + ms.incType = NumberLong; + ms.inclong = elt.numberLong() + in.numberLong(); + } + else { + int x = elt.numberInt() + in.numberInt(); + if ( x < 0 && elt.numberInt() > 0 && in.numberInt() > 0 ) { + // overflow + ms.incType = NumberLong; + ms.inclong = elt.numberLong() + in.numberLong(); + } + else { + ms.incType = NumberInt; + ms.incint = elt.numberInt() + in.numberInt(); + } + } + + ms.appendIncValue( bb , false ); + } + + template< class Builder > + void appendUnset( Builder &b ) { + } + + template<> + void appendUnset( BSONArrayBuilder &b ) { + b.appendNull(); + } + + template< class Builder > + void Mod::apply( Builder& b , BSONElement in , ModState& ms ) const { + if ( ms.dontApply ) { + return; + } + + switch ( op ) { + + case INC: { + appendIncremented( b , in , ms ); + break; + } + + case SET: { + _checkForAppending( elt ); + b.appendAs( elt , shortFieldName ); + break; + } + + case UNSET: { + appendUnset( b ); + break; + } + + case PUSH: { + uassert( 10131 , "$push can only be applied to an array" , in.type() == Array ); + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + BSONObjIterator i( in.embeddedObject() ); + int n=0; + while ( i.more() ) { + bb.append( i.next() ); + n++; + } + + ms.pushStartSize = n; + + bb.appendAs( elt , bb.numStr( n ) ); + bb.done(); + break; + } + + case ADDTOSET: { + uassert( 12592 , "$addToSet can only be applied to an array" , in.type() == Array ); + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + + BSONObjIterator i( in.embeddedObject() ); + int n=0; + + if ( isEach() ) { + + BSONElementSet toadd; + parseEach( toadd ); + + while ( i.more() ) { + BSONElement cur = i.next(); + bb.append( cur ); + n++; + toadd.erase( cur ); + } + + { + BSONObjIterator i( getEach() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( toadd.count(e) ) { + bb.appendAs( e , BSONObjBuilder::numStr( n++ ) ); + toadd.erase( e ); + } + } + } + + } + else { + + bool found = false; + + while ( i.more() ) { + BSONElement cur = i.next(); + bb.append( cur ); + n++; + if ( elt.woCompare( cur , false ) == 0 ) + found = true; + } + + if ( ! 
found ) + bb.appendAs( elt , bb.numStr( n ) ); + + } + + bb.done(); + break; + } + + + + case PUSH_ALL: { + uassert( 10132 , "$pushAll can only be applied to an array" , in.type() == Array ); + uassert( 10133 , "$pushAll has to be passed an array" , elt.type() ); + + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + + BSONObjIterator i( in.embeddedObject() ); + int n=0; + while ( i.more() ) { + bb.append( i.next() ); + n++; + } + + ms.pushStartSize = n; + + i = BSONObjIterator( elt.embeddedObject() ); + while ( i.more() ) { + bb.appendAs( i.next() , bb.numStr( n++ ) ); + } + + bb.done(); + break; + } + + case PULL: + case PULL_ALL: { + uassert( 10134 , "$pull/$pullAll can only be applied to an array" , in.type() == Array ); + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + + int n = 0; + + BSONObjIterator i( in.embeddedObject() ); + while ( i.more() ) { + BSONElement e = i.next(); + bool allowed = true; + + if ( op == PULL ) { + allowed = ! _pullElementMatch( e ); + } + else { + BSONObjIterator j( elt.embeddedObject() ); + while( j.more() ) { + BSONElement arrJ = j.next(); + if ( e.woCompare( arrJ, false ) == 0 ) { + allowed = false; + break; + } + } + } + + if ( allowed ) + bb.appendAs( e , bb.numStr( n++ ) ); + } + + bb.done(); + break; + } + + case POP: { + uassert( 10135 , "$pop can only be applied to an array" , in.type() == Array ); + BSONObjBuilder bb( b.subarrayStart( shortFieldName ) ); + + int n = 0; + + BSONObjIterator i( in.embeddedObject() ); + if ( elt.isNumber() && elt.number() < 0 ) { + // pop from front + if ( i.more() ) { + i.next(); + n++; + } + + while( i.more() ) { + bb.appendAs( i.next() , bb.numStr( n - 1 ) ); + n++; + } + } + else { + // pop from back + while( i.more() ) { + n++; + BSONElement arrI = i.next(); + if ( i.more() ) { + bb.append( arrI ); + } + } + } + + ms.pushStartSize = n; + assert( ms.pushStartSize == in.embeddedObject().nFields() ); + bb.done(); + break; + } + + case BIT: { + uassert( 10136 , "$bit needs an array" , elt.type() == Object ); + uassert( 10137 , "$bit can only be applied to numbers" , in.isNumber() ); + uassert( 10138 , "$bit cannot update a value of type double" , in.type() != NumberDouble ); + + int x = in.numberInt(); + long long y = in.numberLong(); + + BSONObjIterator it( elt.embeddedObject() ); + while ( it.more() ) { + BSONElement e = it.next(); + uassert( 10139 , "$bit field must be number" , e.isNumber() ); + if ( str::equals(e.fieldName(), "and") ) { + switch( in.type() ) { + case NumberInt: x = x&e.numberInt(); break; + case NumberLong: y = y&e.numberLong(); break; + default: assert( 0 ); + } + } + else if ( str::equals(e.fieldName(), "or") ) { + switch( in.type() ) { + case NumberInt: x = x|e.numberInt(); break; + case NumberLong: y = y|e.numberLong(); break; + default: assert( 0 ); + } + } + else { + uasserted(9016, str::stream() << "unknown $bit operation: " << e.fieldName()); + } + } + + switch( in.type() ) { + case NumberInt: b.append( shortFieldName , x ); break; + case NumberLong: b.append( shortFieldName , y ); break; + default: assert( 0 ); + } + + break; + } + + case RENAME_FROM: { + break; + } + + case RENAME_TO: { + ms.handleRename( b, shortFieldName ); + break; + } + + default: + stringstream ss; + ss << "Mod::apply can't handle type: " << op; + throw UserException( 9017, ss.str() ); + } + } + + // -1 inside a non-object (non-object could be array) + // 0 missing + // 1 found + int validRenamePath( BSONObj obj, const char *path ) { + while( const char *p = strchr( path, '.' 
) ) { + string left( path, p - path ); + BSONElement e = obj.getField( left ); + if ( e.eoo() ) { + return 0; + } + if ( e.type() != Object ) { + return -1; + } + obj = e.embeddedObject(); + path = p + 1; + } + return !obj.getField( path ).eoo(); + } + + auto_ptr<ModSetState> ModSet::prepare(const BSONObj &obj) const { + DEBUGUPDATE( "\t start prepare" ); + auto_ptr<ModSetState> mss( new ModSetState( obj ) ); + + + // Perform this check first, so that we don't leave a partially modified object on uassert. + for ( ModHolder::const_iterator i = _mods.begin(); i != _mods.end(); ++i ) { + DEBUGUPDATE( "\t\t prepare : " << i->first ); + ModState& ms = mss->_mods[i->first]; + + const Mod& m = i->second; + BSONElement e = obj.getFieldDotted(m.fieldName); + + ms.m = &m; + ms.old = e; + + if ( m.op == Mod::RENAME_FROM ) { + int source = validRenamePath( obj, m.fieldName ); + uassert( 13489, "$rename source field invalid", source != -1 ); + if ( source != 1 ) { + ms.dontApply = true; + } + continue; + } + + if ( m.op == Mod::RENAME_TO ) { + int source = validRenamePath( obj, m.renameFrom() ); + if ( source == 1 ) { + int target = validRenamePath( obj, m.fieldName ); + uassert( 13490, "$rename target field invalid", target != -1 ); + ms.newVal = obj.getFieldDotted( m.renameFrom() ); + mss->amIInPlacePossible( false ); + } + else { + ms.dontApply = true; + } + continue; + } + + if ( e.eoo() ) { + mss->amIInPlacePossible( m.op == Mod::UNSET ); + continue; + } + + switch( m.op ) { + case Mod::INC: + uassert( 10140 , "Cannot apply $inc modifier to non-number", e.isNumber() || e.eoo() ); + if ( mss->amIInPlacePossible( e.isNumber() ) ) { + // check more typing info here + if ( m.elt.type() != e.type() ) { + // if i'm incrementing with a double, then the storage has to be a double + mss->amIInPlacePossible( m.elt.type() != NumberDouble ); + } + + // check for overflow + if ( e.type() == NumberInt && e.numberLong() + m.elt.numberLong() > numeric_limits<int>::max() ) { + mss->amIInPlacePossible( false ); + } + } + break; + + case Mod::SET: + mss->amIInPlacePossible( m.elt.type() == e.type() && + m.elt.valuesize() == e.valuesize() ); + break; + + case Mod::PUSH: + case Mod::PUSH_ALL: + uassert( 10141 , "Cannot apply $push/$pushAll modifier to non-array", e.type() == Array || e.eoo() ); + mss->amIInPlacePossible( false ); + break; + + case Mod::PULL: + case Mod::PULL_ALL: { + uassert( 10142 , "Cannot apply $pull/$pullAll modifier to non-array", e.type() == Array || e.eoo() ); + BSONObjIterator i( e.embeddedObject() ); + while( mss->_inPlacePossible && i.more() ) { + BSONElement arrI = i.next(); + if ( m.op == Mod::PULL ) { + mss->amIInPlacePossible( ! 
m._pullElementMatch( arrI ) ); + } + else if ( m.op == Mod::PULL_ALL ) { + BSONObjIterator j( m.elt.embeddedObject() ); + while( mss->_inPlacePossible && j.moreWithEOO() ) { + BSONElement arrJ = j.next(); + if ( arrJ.eoo() ) + break; + mss->amIInPlacePossible( arrI.woCompare( arrJ, false ) ); + } + } + } + break; + } + + case Mod::POP: { + uassert( 10143 , "Cannot apply $pop modifier to non-array", e.type() == Array || e.eoo() ); + mss->amIInPlacePossible( e.embeddedObject().isEmpty() ); + break; + } + + case Mod::ADDTOSET: { + uassert( 12591 , "Cannot apply $addToSet modifier to non-array", e.type() == Array || e.eoo() ); + + BSONObjIterator i( e.embeddedObject() ); + if ( m.isEach() ) { + BSONElementSet toadd; + m.parseEach( toadd ); + while( i.more() ) { + BSONElement arrI = i.next(); + toadd.erase( arrI ); + } + mss->amIInPlacePossible( toadd.size() == 0 ); + } + else { + bool found = false; + while( i.more() ) { + BSONElement arrI = i.next(); + if ( arrI.woCompare( m.elt , false ) == 0 ) { + found = true; + break; + } + } + mss->amIInPlacePossible( found ); + } + break; + } + + default: + // mods we don't know about shouldn't be done in place + mss->amIInPlacePossible( false ); + } + } + + DEBUGUPDATE( "\t mss\n" << mss->toString() << "\t--" ); + + return mss; + } + + void ModState::appendForOpLog( BSONObjBuilder& b ) const { + if ( dontApply ) { + return; + } + + if ( incType ) { + DEBUGUPDATE( "\t\t\t\t\t appendForOpLog inc fieldname: " << m->fieldName << " short:" << m->shortFieldName ); + BSONObjBuilder bb( b.subobjStart( "$set" ) ); + appendIncValue( bb , true ); + bb.done(); + return; + } + + if ( m->op == Mod::RENAME_FROM ) { + DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_FROM fieldName:" << m->fieldName ); + BSONObjBuilder bb( b.subobjStart( "$unset" ) ); + bb.append( m->fieldName, 1 ); + bb.done(); + return; + } + + if ( m->op == Mod::RENAME_TO ) { + DEBUGUPDATE( "\t\t\t\t\t appendForOpLog RENAME_TO fieldName:" << m->fieldName ); + BSONObjBuilder bb( b.subobjStart( "$set" ) ); + bb.appendAs( newVal, m->fieldName ); + return; + } + + const char * name = fixedOpName ? fixedOpName : Mod::modNames[op()]; + + DEBUGUPDATE( "\t\t\t\t\t appendForOpLog name:" << name << " fixed: " << fixed << " fn: " << m->fieldName ); + + BSONObjBuilder bb( b.subobjStart( name ) ); + if ( fixed ) { + bb.appendAs( *fixed , m->fieldName ); + } + else { + bb.appendAs( m->elt , m->fieldName ); + } + bb.done(); + } + + string ModState::toString() const { + stringstream ss; + if ( fixedOpName ) + ss << " fixedOpName: " << fixedOpName; + if ( fixed ) + ss << " fixed: " << fixed; + return ss.str(); + } + + template< class Builder > + void ModState::handleRename( Builder &newObjBuilder, const char *shortFieldName ) { + newObjBuilder.appendAs( newVal , shortFieldName ); + BSONObjBuilder b; + b.appendAs( newVal, shortFieldName ); + assert( _objData.isEmpty() ); + _objData = b.obj(); + newVal = _objData.firstElement(); + } + + void ModSetState::applyModsInPlace( bool isOnDisk ) { + // TODO i think this assert means that we can get rid of the isOnDisk param + // and just use isOwned as the determination + DEV assert( isOnDisk == ! 
_obj.isOwned() ); + + for ( ModStateHolder::iterator i = _mods.begin(); i != _mods.end(); ++i ) { + ModState& m = i->second; + + if ( m.dontApply ) { + continue; + } + + switch ( m.m->op ) { + case Mod::UNSET: + case Mod::ADDTOSET: + case Mod::RENAME_FROM: + case Mod::RENAME_TO: + // this should have been handled by prepare + break; + case Mod::PULL: + case Mod::PULL_ALL: + // this should have been handled by prepare + break; + case Mod::POP: + assert( m.old.eoo() || ( m.old.isABSONObj() && m.old.Obj().isEmpty() ) ); + break; + // [dm] the BSONElementManipulator statements below are for replication (correct?) + case Mod::INC: + if ( isOnDisk ) + m.m->IncrementMe( m.old ); + else + m.m->incrementMe( m.old ); + m.fixedOpName = "$set"; + m.fixed = &(m.old); + break; + case Mod::SET: + if ( isOnDisk ) + BSONElementManipulator( m.old ).ReplaceTypeAndValue( m.m->elt ); + else + BSONElementManipulator( m.old ).replaceTypeAndValue( m.m->elt ); + break; + default: + uassert( 13478 , "can't apply mod in place - shouldn't have gotten here" , 0 ); + } + } + } + + void ModSet::extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ) { + if ( top.type() != Object ) { + fields[ base + top.fieldName() ] = top; + return; + } + BSONObjIterator i( top.embeddedObject() ); + bool empty = true; + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + extractFields( fields, e, base + top.fieldName() + "." ); + empty = false; + } + if ( empty ) + fields[ base + top.fieldName() ] = top; + } + + template< class Builder > + void ModSetState::_appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ) { + const char * temp = m.fieldName(); + temp += root.size(); + const char * dot = strchr( temp , '.' 
); + if ( dot ) { + string nr( m.fieldName() , 0 , 1 + ( dot - m.fieldName() ) ); + string nf( temp , 0 , dot - temp ); + if ( onedownseen.count( nf ) ) + return; + onedownseen.insert( nf ); + BSONObjBuilder bb ( b.subobjStart( nf ) ); + createNewFromMods( nr , bb , BSONObj() ); // don't infer an array from name + bb.done(); + } + else { + appendNewFromMod( m , b ); + } + + } + + template< class Builder > + void ModSetState::createNewFromMods( const string& root , Builder& b , const BSONObj &obj ) { + DEBUGUPDATE( "\t\t createNewFromMods root: " << root ); + BSONObjIteratorSorted es( obj ); + BSONElement e = es.next(); + + ModStateHolder::iterator m = _mods.lower_bound( root ); + StringBuilder buf(root.size() + 2 ); + buf << root << (char)255; + ModStateHolder::iterator mend = _mods.lower_bound( buf.str() ); + + set<string> onedownseen; + + while ( e.type() && m != mend ) { + string field = root + e.fieldName(); + FieldCompareResult cmp = compareDottedFieldNames( m->second.m->fieldName , field ); + + DEBUGUPDATE( "\t\t\t field:" << field << "\t mod:" << m->second.m->fieldName << "\t cmp:" << cmp << "\t short: " << e.fieldName() ); + + switch ( cmp ) { + + case LEFT_SUBFIELD: { // Mod is embedded under this element + uassert( 10145 , str::stream() << "LEFT_SUBFIELD only supports Object: " << field << " not: " << e.type() , e.type() == Object || e.type() == Array ); + if ( onedownseen.count( e.fieldName() ) == 0 ) { + onedownseen.insert( e.fieldName() ); + if ( e.type() == Object ) { + BSONObjBuilder bb( b.subobjStart( e.fieldName() ) ); + stringstream nr; nr << root << e.fieldName() << "."; + createNewFromMods( nr.str() , bb , e.embeddedObject() ); + bb.done(); + } + else { + BSONArrayBuilder ba( b.subarrayStart( e.fieldName() ) ); + stringstream nr; nr << root << e.fieldName() << "."; + createNewFromMods( nr.str() , ba , e.embeddedObject() ); + ba.done(); + } + // inc both as we handled both + e = es.next(); + m++; + } + else { + // this is a very weird case + // have seen it in production, but can't reproduce + // this assert prevents an inf. 
loop + // but likely isn't the correct solution + assert(0); + } + continue; + } + case LEFT_BEFORE: // Mod on a field that doesn't exist + DEBUGUPDATE( "\t\t\t\t creating new field for: " << m->second.m->fieldName ); + _appendNewFromMods( root , m->second , b , onedownseen ); + m++; + continue; + case SAME: + DEBUGUPDATE( "\t\t\t\t applying mod on: " << m->second.m->fieldName ); + m->second.apply( b , e ); + e = es.next(); + m++; + continue; + case RIGHT_BEFORE: // field that doesn't have a MOD + DEBUGUPDATE( "\t\t\t\t just copying" ); + b.append( e ); // if array, ignore field name + e = es.next(); + continue; + case RIGHT_SUBFIELD: + massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 ); + break; + default: + massert( 10400 , "unhandled case" , 0 ); + } + } + + // finished looping the mods, just adding the rest of the elements + while ( e.type() ) { + DEBUGUPDATE( "\t\t\t copying: " << e.fieldName() ); + b.append( e ); // if array, ignore field name + e = es.next(); + } + + // do mods that don't have fields already + for ( ; m != mend; m++ ) { + DEBUGUPDATE( "\t\t\t\t appending from mod at end: " << m->second.m->fieldName ); + _appendNewFromMods( root , m->second , b , onedownseen ); + } + } + + BSONObj ModSetState::createNewFromMods() { + BSONObjBuilder b( (int)(_obj.objsize() * 1.1) ); + createNewFromMods( "" , b , _obj ); + return _newFromMods = b.obj(); + } + + string ModSetState::toString() const { + stringstream ss; + for ( ModStateHolder::const_iterator i=_mods.begin(); i!=_mods.end(); ++i ) { + ss << "\t\t" << i->first << "\t" << i->second.toString() << "\n"; + } + return ss.str(); + } + + bool ModSetState::FieldCmp::operator()( const string &l, const string &r ) const { + return lexNumCmp( l.c_str(), r.c_str() ) < 0; + } + + BSONObj ModSet::createNewFromQuery( const BSONObj& query ) { + BSONObj newObj; + + { + BSONObjBuilder bb; + EmbeddedBuilder eb( &bb ); + BSONObjIteratorSorted i( query ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( e.fieldName()[0] == '$' ) // for $atomic and anything else we add + continue; + + if ( e.type() == Object && e.embeddedObject().firstElementFieldName()[0] == '$' ) { + // this means this is a $gt type filter, so don't make part of the new object + continue; + } + + eb.appendAs( e , e.fieldName() ); + } + eb.done(); + newObj = bb.obj(); + } + + auto_ptr<ModSetState> mss = prepare( newObj ); + + if ( mss->canApplyInPlace() ) + mss->applyModsInPlace( false ); + else + newObj = mss->createNewFromMods(); + + return newObj; + } + + /* get special operations like $inc + { $inc: { a:1, b:1 } } + { $set: { a:77 } } + { $push: { a:55 } } + { $pushAll: { a:[77,88] } } + { $pull: { a:66 } } + { $pullAll : { a:[99,1010] } } + NOTE: MODIFIES source from object! 
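+       other modifiers handled in this file (the full list is Mod::modNames above): + { $pop: { a:1 } } + { $unset: { a:1 } } + { $addToSet: { a:55 } } (or { a : { $each : [ 55, 66 ] } }) + { $rename: { a: "b" } } + { $bit: { a: { and: 4 } } }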
+ */ + ModSet::ModSet( + const BSONObj &from , + const set<string>& idxKeys, + const set<string> *backgroundKeys) + : _isIndexed(0) , _hasDynamicArray( false ) { + + BSONObjIterator it(from); + + while ( it.more() ) { + BSONElement e = it.next(); + const char *fn = e.fieldName(); + + uassert( 10147 , "Invalid modifier specified: " + string( fn ), e.type() == Object ); + BSONObj j = e.embeddedObject(); + DEBUGUPDATE( "\t" << j ); + + BSONObjIterator jt(j); + Mod::Op op = opFromStr( fn ); + + while ( jt.more() ) { + BSONElement f = jt.next(); // x:44 + + const char * fieldName = f.fieldName(); + + uassert( 15896 , "Modified field name may not start with $", fieldName[0] != '$' || op == Mod::UNSET ); // allow remove of invalid field name in case it was inserted before this check was added (~ version 2.1) + uassert( 10148 , "Mod on _id not allowed", strcmp( fieldName, "_id" ) != 0 ); + uassert( 10149 , "Invalid mod field name, may not end in a period", fieldName[ strlen( fieldName ) - 1 ] != '.' ); + uassert( 10150 , "Field name duplication not allowed with modifiers", ! haveModForField( fieldName ) ); + uassert( 10151 , "have conflicting mods in update" , ! haveConflictingMod( fieldName ) ); + uassert( 10152 , "Modifier $inc allowed for numbers only", f.isNumber() || op != Mod::INC ); + uassert( 10153 , "Modifier $pushAll/pullAll allowed for arrays only", f.type() == Array || ( op != Mod::PUSH_ALL && op != Mod::PULL_ALL ) ); + + if ( op == Mod::RENAME_TO ) { + uassert( 13494, "$rename target must be a string", f.type() == String ); + const char *target = f.valuestr(); + uassert( 13495, "$rename source must differ from target", strcmp( fieldName, target ) != 0 ); + uassert( 13496, "invalid mod field name, source may not be empty", fieldName[0] ); + uassert( 13479, "invalid mod field name, target may not be empty", target[0] ); + uassert( 13480, "invalid mod field name, source may not begin or end in period", fieldName[0] != '.' && fieldName[ strlen( fieldName ) - 1 ] != '.' ); + uassert( 13481, "invalid mod field name, target may not begin or end in period", target[0] != '.' && target[ strlen( target ) - 1 ] != '.' ); + uassert( 13482, "$rename affecting _id not allowed", !( fieldName[0] == '_' && fieldName[1] == 'i' && fieldName[2] == 'd' && ( !fieldName[3] || fieldName[3] == '.' ) ) ); + uassert( 13483, "$rename affecting _id not allowed", !( target[0] == '_' && target[1] == 'i' && target[2] == 'd' && ( !target[3] || target[3] == '.' ) ) ); + uassert( 13484, "field name duplication not allowed with $rename target", !haveModForField( target ) ); + uassert( 13485, "conflicting mods not allowed with $rename target", !haveConflictingMod( target ) ); + uassert( 13486, "$rename target may not be a parent of source", !( strncmp( fieldName, target, strlen( target ) ) == 0 && fieldName[ strlen( target ) ] == '.' 
) ); + uassert( 13487, "$rename source may not be dynamic array", strstr( fieldName , ".$" ) == 0 ); + uassert( 13488, "$rename target may not be dynamic array", strstr( target , ".$" ) == 0 ); + + Mod from; + from.init( Mod::RENAME_FROM, f ); + from.setFieldName( fieldName ); + updateIsIndexed( from, idxKeys, backgroundKeys ); + _mods[ from.fieldName ] = from; + + Mod to; + to.init( Mod::RENAME_TO, f ); + to.setFieldName( target ); + updateIsIndexed( to, idxKeys, backgroundKeys ); + _mods[ to.fieldName ] = to; + + DEBUGUPDATE( "\t\t " << fieldName << "\t" << from.fieldName << "\t" << to.fieldName ); + continue; + } + + _hasDynamicArray = _hasDynamicArray || strstr( fieldName , ".$" ) > 0; + + Mod m; + m.init( op , f ); + m.setFieldName( f.fieldName() ); + updateIsIndexed( m, idxKeys, backgroundKeys ); + _mods[m.fieldName] = m; + + DEBUGUPDATE( "\t\t " << fieldName << "\t" << m.fieldName << "\t" << _hasDynamicArray ); + } + } + + } + + ModSet * ModSet::fixDynamicArray( const char * elemMatchKey ) const { + ModSet * n = new ModSet(); + n->_isIndexed = _isIndexed; + n->_hasDynamicArray = _hasDynamicArray; + for ( ModHolder::const_iterator i=_mods.begin(); i!=_mods.end(); i++ ) { + string s = i->first; + size_t idx = s.find( ".$" ); + if ( idx == string::npos ) { + n->_mods[s] = i->second; + continue; + } + StringBuilder buf(s.size()+strlen(elemMatchKey)); + buf << s.substr(0,idx+1) << elemMatchKey << s.substr(idx+2); + string fixed = buf.str(); + DEBUGUPDATE( "fixed dynamic: " << s << " -->> " << fixed ); + n->_mods[fixed] = i->second; + ModHolder::iterator temp = n->_mods.find( fixed ); + temp->second.setFieldName( temp->first.c_str() ); + } + return n; + } + + void checkNoMods( BSONObj o ) { + BSONObjIterator i( o ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + uassert( 10154 , "Modifiers and non-modifiers cannot be mixed", e.fieldName()[ 0 ] != '$' ); + } + } + + static void checkTooLarge(const BSONObj& newObj) { + uassert( 12522 , "$ operator made object too large" , newObj.objsize() <= BSONObjMaxUserSize ); + } + + /* note: this is only (as-is) called for + + - not multi + - not mods is indexed + - not upsert + */ + static UpdateResult _updateById(bool isOperatorUpdate, int idIdxNo, ModSet *mods, int profile, NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + bool god, const char *ns, + const BSONObj& updateobj, BSONObj patternOrig, bool logop, OpDebug& debug) { + + DiskLoc loc; + { + IndexDetails& i = d->idx(idIdxNo); + BSONObj key = i.getKeyFromQuery( patternOrig ); + loc = i.idxInterface().findSingle(i, i.head, key); + if( loc.isNull() ) { + // no upsert support in _updateById yet, so we are done. + return UpdateResult(0, 0, 0); + } + } + Record *r = loc.rec(); + + if ( ! r->likelyInPhysicalMemory() ) { + { + scoped_ptr<LockMongoFilesShared> lk( new LockMongoFilesShared() ); + dbtempreleasewritelock t; + r->touch(); + lk.reset(0); // we have to release mmmutex before we can re-acquire dbmutex + } + + { + // we need to re-find in case something changed + d = nsdetails( ns ); + if ( ! d ) { + // dropped + return UpdateResult(0, 0, 0); + } + nsdt = &NamespaceDetailsTransient::get(ns); + IndexDetails& i = d->idx(idIdxNo); + BSONObj key = i.getKeyFromQuery( patternOrig ); + loc = i.idxInterface().findSingle(i, i.head, key); + if( loc.isNull() ) { + // no upsert support in _updateById yet, so we are done. + return UpdateResult(0, 0, 0); + } + + r = loc.rec(); + } + } + + /* look for $inc etc. 
note as listed here, all fields to inc must be this type, you can't set some + regular ones at the moment. */ + if ( isOperatorUpdate ) { + const BSONObj& onDisk = loc.obj(); + auto_ptr<ModSetState> mss = mods->prepare( onDisk ); + + if( mss->canApplyInPlace() ) { + mss->applyModsInPlace(true); + DEBUGUPDATE( "\t\t\t updateById doing in place update" ); + } + else { + BSONObj newObj = mss->createNewFromMods(); + checkTooLarge(newObj); + assert(nsdt); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug); + } + + if ( logop ) { + DEV assert( mods->size() ); + + BSONObj pattern = patternOrig; + if ( mss->haveArrayDepMod() ) { + BSONObjBuilder patternBuilder; + patternBuilder.appendElements( pattern ); + mss->appendSizeSpecForArrayDepMods( patternBuilder ); + pattern = patternBuilder.obj(); + } + + if( mss->needOpLogRewrite() ) { + DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() ); + logOp("u", ns, mss->getOpLogRewrite() , &pattern ); + } + else { + logOp("u", ns, updateobj, &pattern ); + } + } + return UpdateResult( 1 , 1 , 1); + } // end $operator update + + // regular update + BSONElementManipulator::lookForTimestamps( updateobj ); + checkNoMods( updateobj ); + assert(nsdt); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug ); + if ( logop ) { + logOp("u", ns, updateobj, &patternOrig ); + } + return UpdateResult( 1 , 0 , 1 ); + } + + UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug, RemoveSaver* rs ) { + DEBUGUPDATE( "update: " << ns << " update: " << updateobj << " query: " << patternOrig << " upsert: " << upsert << " multi: " << multi ); + Client& client = cc(); + int profile = client.database()->profile; + + debug.updateobj = updateobj; + + // idea with these here it to make them loop invariant for multi updates, and thus be a bit faster for that case + // The pointers may be left invalid on a failed or terminal yield recovery. + NamespaceDetails *d = nsdetails(ns); // can be null if an upsert... + NamespaceDetailsTransient *nsdt = &NamespaceDetailsTransient::get(ns); + + auto_ptr<ModSet> mods; + bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$'; + int modsIsIndexed = false; // really the # of indexes + if ( isOperatorUpdate ) { + if( d && d->indexBuildInProgress ) { + set<string> bgKeys; + d->inProgIdx().keyPattern().getFieldNames(bgKeys); + mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) ); + } + else { + mods.reset( new ModSet(updateobj, nsdt->indexKeys()) ); + } + modsIsIndexed = mods->isIndexed(); + } + + if( !multi && isSimpleIdQuery(patternOrig) && d && !modsIsIndexed ) { + int idxNo = d->findIdIndex(); + if( idxNo >= 0 ) { + debug.idhack = true; + UpdateResult result = _updateById(isOperatorUpdate, idxNo, mods.get(), profile, d, nsdt, god, ns, updateobj, patternOrig, logop, debug); + if ( result.existing || ! upsert ) { + return result; + } + else if ( upsert && ! isOperatorUpdate && ! 
logop) { + // this handles repl inserts + checkNoMods( updateobj ); + debug.upsert = true; + BSONObj no = updateobj; + theDataFileMgr.insertWithObjMod(ns, no, god); + return UpdateResult( 0 , 0 , 1 , no ); + } + } + } + + int numModded = 0; + long long nscanned = 0; + shared_ptr< Cursor > c = NamespaceDetailsTransient::getCursor( ns, patternOrig ); + + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get(ns); + bool autoDedup = c->autoDedup(); + + if( c->ok() ) { + set<DiskLoc> seenObjects; + MatchDetails details; + auto_ptr<ClientCursor> cc; + do { + nscanned++; + + bool atomic = c->matcher() && c->matcher()->docMatcher().atomic(); + + if ( !atomic ) { + // ***************** + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + + bool didYield; + if ( ! cc->yieldSometimes( ClientCursor::WillNeed, &didYield ) ) { + cc.release(); + break; + } + if ( !c->ok() ) { + break; + } + + if ( didYield ) { + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get(ns); + } + // ***************** + } + + if ( !c->currentMatches( &details ) ) { + c->advance(); + + if ( nscanned % 256 == 0 && ! atomic ) { + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + if ( ! cc->yield() ) { + cc.release(); + // TODO should we assert or something? + break; + } + if ( !c->ok() ) { + break; + } + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get(ns); + } + continue; + } + + Record *r = c->_current(); + DiskLoc loc = c->currLoc(); + + // TODO Maybe this is unnecessary since we have seenObjects + if ( c->getsetdup( loc ) && autoDedup ) { + c->advance(); + continue; + } + + BSONObj js(r); + + BSONObj pattern = patternOrig; + + if ( logop ) { + BSONObjBuilder idPattern; + BSONElement id; + // NOTE: If the matching object lacks an id, we'll log + // with the original pattern. This isn't replay-safe. + // It might make sense to suppress the log instead + // if there's no id. + if ( js.getObjectID( id ) ) { + idPattern.append( id ); + pattern = idPattern.obj(); + } + else { + uassert( 10157 , "multi-update requires all modified objects to have an _id" , ! multi ); + } + } + + if ( profile && !multi ) + debug.nscanned = (int) nscanned; + + /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some + regular ones at the moment. */ + if ( isOperatorUpdate ) { + + if ( multi ) { + c->advance(); // go to next record in case this one moves + if ( autoDedup && seenObjects.count( loc ) ) + continue; + } + + const BSONObj& onDisk = loc.obj(); + + ModSet * useMods = mods.get(); + bool forceRewrite = false; + + auto_ptr<ModSet> mymodset; + if ( details._elemMatchKey && mods->hasDynamicArray() ) { + useMods = mods->fixDynamicArray( details._elemMatchKey ); + mymodset.reset( useMods ); + forceRewrite = true; + } + + auto_ptr<ModSetState> mss = useMods->prepare( onDisk ); + + bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! 
mss->canApplyInPlace() ); + + if ( willAdvanceCursor ) { + if ( cc.get() ) { + cc->setDoingDeletes( true ); + } + c->prepareToTouchEarlierIterate(); + } + + if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) { + mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) ); + + DEBUGUPDATE( "\t\t\t doing in place update" ); + if ( profile && !multi ) + debug.fastmod = true; + + if ( modsIsIndexed ) { + seenObjects.insert( loc ); + } + + d->paddingFits(); + } + else { + if ( rs ) + rs->goingToDelete( onDisk ); + + BSONObj newObj = mss->createNewFromMods(); + checkTooLarge(newObj); + DiskLoc newLoc = theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug); + if ( newLoc != loc || modsIsIndexed ){ + // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl; + // object moved, need to make sure we don't process it again + seenObjects.insert( newLoc ); + } + + } + + if ( logop ) { + DEV assert( mods->size() ); + + if ( mss->haveArrayDepMod() ) { + BSONObjBuilder patternBuilder; + patternBuilder.appendElements( pattern ); + mss->appendSizeSpecForArrayDepMods( patternBuilder ); + pattern = patternBuilder.obj(); + } + + if ( forceRewrite || mss->needOpLogRewrite() ) { + DEBUGUPDATE( "\t rewrite update: " << mss->getOpLogRewrite() ); + logOp("u", ns, mss->getOpLogRewrite() , &pattern ); + } + else { + logOp("u", ns, updateobj, &pattern ); + } + } + numModded++; + if ( ! multi ) + return UpdateResult( 1 , 1 , numModded ); + if ( willAdvanceCursor ) + c->recoverFromTouchingEarlierIterate(); + + if ( nscanned % 64 == 0 && ! atomic ) { + if ( cc.get() == 0 ) { + shared_ptr< Cursor > cPtr = c; + cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , cPtr , ns ) ); + } + if ( ! cc->yield() ) { + cc.release(); + break; + } + if ( !c->ok() ) { + break; + } + d = nsdetails(ns); + nsdt = &NamespaceDetailsTransient::get(ns); + } + + getDur().commitIfNeeded(); + + continue; + } + + uassert( 10158 , "multi update only works with $ operators" , ! multi ); + + BSONElementManipulator::lookForTimestamps( updateobj ); + checkNoMods( updateobj ); + theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, god); + if ( logop ) { + DEV wassert( !god ); // god doesn't get logged, this would be bad. + logOp("u", ns, updateobj, &pattern ); + } + return UpdateResult( 1 , 0 , 1 ); + } while ( c->ok() ); + } // endif + + if ( numModded ) + return UpdateResult( 1 , 1 , numModded ); + + // todo: no need for "if( profile )" here as that probably just makes things slower? + if ( profile ) + debug.nscanned = (int) nscanned; + + if ( upsert ) { + if ( updateobj.firstElementFieldName()[0] == '$' ) { + // upsert of an $operation. build a default object + BSONObj newObj = mods->createNewFromQuery( patternOrig ); + checkNoMods( newObj ); + debug.fastmodinsert = true; + theDataFileMgr.insertWithObjMod(ns, newObj, god); + if ( logop ) + logOp( "i", ns, newObj ); + + return UpdateResult( 0 , 1 , 1 , newObj ); + } + uassert( 10159 , "multi update only works with $ operators" , ! 
multi ); + checkNoMods( updateobj ); + debug.upsert = true; + BSONObj no = updateobj; + theDataFileMgr.insertWithObjMod(ns, no, god); + if ( logop ) + logOp( "i", ns, no ); + return UpdateResult( 0 , 0 , 1 , no ); + } + + return UpdateResult( 0 , isOperatorUpdate , 0 ); + } + + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj patternOrig, bool upsert, bool multi, bool logop , OpDebug& debug ) { + uassert( 10155 , "cannot update reserved $ collection", strchr(ns, '$') == 0 ); + if ( strstr(ns, ".system.") ) { + /* dm: it's very important that system.indexes is never updated as IndexDetails has pointers into it */ + uassert( 10156 , str::stream() << "cannot update system collection: " << ns << " q: " << patternOrig << " u: " << updateobj , legalClientSystemNS( ns , true ) ); + } + return _updateObjects(false, ns, updateobj, patternOrig, upsert, multi, logop, debug); + } + +} diff --git a/src/mongo/db/ops/update.h b/src/mongo/db/ops/update.h new file mode 100644 index 00000000000..9446db06d36 --- /dev/null +++ b/src/mongo/db/ops/update.h @@ -0,0 +1,700 @@ +// update.h + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "../../pch.h" +#include "../jsobj.h" +#include "../../util/embedded_builder.h" +#include "../matcher.h" + +namespace mongo { + + // ---------- public ------------- + + struct UpdateResult { + const bool existing; // if existing objects were modified + const bool mod; // was this a $ mod + const long long num; // how many objects touched + OID upserted; // if something was upserted, the new _id of the object + + UpdateResult( bool e, bool m, unsigned long long n , const BSONObj& upsertedObject = BSONObj() ) + : existing(e) , mod(m), num(n) { + upserted.clear(); + BSONElement id = upsertedObject["_id"]; + if ( ! e && n == 1 && id.type() == jstOID ) { + upserted = id.OID(); + } + } + }; + + class RemoveSaver; + + /* returns true if an existing object was updated, false if no existing object was found. + multi - update multiple objects - mostly useful with things like $set + god - allow access to system namespaces + */ + UpdateResult updateObjects(const char *ns, const BSONObj& updateobj, BSONObj pattern, bool upsert, bool multi , bool logop , OpDebug& debug ); + UpdateResult _updateObjects(bool god, const char *ns, const BSONObj& updateobj, BSONObj pattern, + bool upsert, bool multi , bool logop , OpDebug& debug , RemoveSaver * rs = 0 ); + + + + // ---------- private ------------- + + class ModState; + class ModSetState; + + /* Used for modifiers such as $inc, $set, $push, ... 
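+     * e.g. for { $inc : { "a.b" : 3 } } a Mod is built with op == INC, fieldName "a.b", shortFieldName "b", and elt holding the a.b:3 element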
+ * stores the info about a single operation + * once created should never be modified + */ + struct Mod { + // See opFromStr below + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + enum Op { INC, SET, PUSH, PUSH_ALL, PULL, PULL_ALL , POP, UNSET, BITAND, BITOR , BIT , ADDTOSET, RENAME_FROM, RENAME_TO } op; + + static const char* modNames[]; + static unsigned modNamesNum; + + const char *fieldName; + const char *shortFieldName; + + BSONElement elt; // x:5 note: this is the actual element from the updateobj + boost::shared_ptr<Matcher> matcher; + bool matcherOnPrimitive; + + void init( Op o , BSONElement& e ) { + op = o; + elt = e; + if ( op == PULL && e.type() == Object ) { + BSONObj t = e.embeddedObject(); + if ( t.firstElement().getGtLtOp() == 0 ) { + matcher.reset( new Matcher( t ) ); + matcherOnPrimitive = false; + } + else { + matcher.reset( new Matcher( BSON( "" << t ) ) ); + matcherOnPrimitive = true; + } + } + } + + void setFieldName( const char * s ) { + fieldName = s; + shortFieldName = strrchr( fieldName , '.' ); + if ( shortFieldName ) + shortFieldName++; + else + shortFieldName = fieldName; + } + + /** + * @param in the element to increment in place; elt's value is added to it + */ + void incrementMe( BSONElement& in ) const { + BSONElementManipulator manip( in ); + switch ( in.type() ) { + case NumberDouble: + manip.setNumber( elt.numberDouble() + in.numberDouble() ); + break; + case NumberLong: + manip.setLong( elt.numberLong() + in.numberLong() ); + break; + case NumberInt: + manip.setInt( elt.numberInt() + in.numberInt() ); + break; + default: + assert(0); + } + } + void IncrementMe( BSONElement& in ) const { + BSONElementManipulator manip( in ); + switch ( in.type() ) { + case NumberDouble: + manip.SetNumber( elt.numberDouble() + in.numberDouble() ); + break; + case NumberLong: + manip.SetLong( elt.numberLong() + in.numberLong() ); + break; + case NumberInt: + manip.SetInt( elt.numberInt() + in.numberInt() ); + break; + default: + assert(0); + } + } + + template< class Builder > + void appendIncremented( Builder& bb , const BSONElement& in, ModState& ms ) const; + + bool operator<( const Mod &other ) const { + return strcmp( fieldName, other.fieldName ) < 0; + } + + bool arrayDep() const { + switch (op) { + case PUSH: + case PUSH_ALL: + case POP: + return true; + default: + return false; + } + } + + static bool isIndexed( const string& fullName , const set<string>& idxKeys ) { + const char * fieldName = fullName.c_str(); + // check if there is an index key that is a parent of mod + for( const char *dot = strchr( fieldName, '.' ); dot; dot = strchr( dot + 1, '.' ) ) + if ( idxKeys.count( string( fieldName, dot - fieldName ) ) ) + return true; + + // check if there is an index key equal to mod + if ( idxKeys.count(fullName) ) + return true; + // check if there is an index key that is a child of mod + set< string >::const_iterator j = idxKeys.upper_bound( fullName ); + if ( j != idxKeys.end() && j->find( fullName ) == 0 && (*j)[fullName.size()] == '.' ) + return true; + + return false; + } + + bool isIndexed( const set<string>& idxKeys ) const { + string fullName = fieldName; + + if ( isIndexed( fullName , idxKeys ) ) + return true; + + if ( strstr( fieldName , "." ) ) { + // check for a.0.1 + StringBuilder buf( fullName.size() + 1 ); + for ( size_t i=0; i<fullName.size(); i++ ) { + char c = fullName[i]; + + if ( c == '$' && + i > 0 && fullName[i-1] == '.' && + i+1<fullName.size() && + fullName[i+1] == '.' ) { + i++; + continue; + } + + buf << c; + + if ( c != '.' ) + continue; + + if ( ! isdigit( fullName[i+1] ) ) + continue; + + bool possible = true; + size_t j=i+2; + for ( ; j<fullName.size(); j++ ) { + char d = fullName[j]; + if ( d == '.' ) + break; + if ( isdigit( d ) ) + continue; + possible = false; + break; + } + + if ( possible ) + i = j; + } + string x = buf.str(); + if ( isIndexed( x , idxKeys ) ) + return true; + } + + return false; + } + + template< class Builder > + void apply( Builder& b , BSONElement in , ModState& ms ) const; + + /** + * @return true iff toMatch should be removed from the array + */ + bool _pullElementMatch( BSONElement& toMatch ) const; + + void _checkForAppending( const BSONElement& e ) const { + if ( e.type() == Object ) { + // this is a tiny bit slow, but rare and important + // only when setting something TO an object, not setting something in an object + // and it checks for { $set : { x : { 'a.b' : 1 } } } + // which we feel has been common + uassert( 12527 , "not okForStorage" , e.embeddedObject().okForStorage() ); + } + } + + bool isEach() const { + if ( elt.type() != Object ) + return false; + BSONElement e = elt.embeddedObject().firstElement(); + if ( e.type() != Array ) + return false; + return strcmp( e.fieldName() , "$each" ) == 0; + } + + BSONObj getEach() const { + return elt.embeddedObjectUserCheck().firstElement().embeddedObjectUserCheck(); + } + + void parseEach( BSONElementSet& s ) const { + BSONObjIterator i(getEach()); + while ( i.more() ) { + s.insert( i.next() ); + } + } + + const char *renameFrom() const { + massert( 13492, "mod must be RENAME_TO type", op == Mod::RENAME_TO ); + return elt.fieldName(); + } + }; + + /** + * stores a set of Mods + * once created, should never be changed + */ + class ModSet : boost::noncopyable { + typedef map<string,Mod> ModHolder; + ModHolder _mods; + int _isIndexed; + bool _hasDynamicArray; + + static void extractFields( map< string, BSONElement > &fields, const BSONElement &top, const string &base ); + + FieldCompareResult compare( const ModHolder::iterator &m, map< string, BSONElement >::iterator &p, const map< string, BSONElement >::iterator &pEnd ) const { + bool mDone = ( m == _mods.end() ); + bool pDone = ( p == pEnd ); + assert( ! mDone ); + assert( ! pDone ); + if ( mDone && pDone ) + return SAME; + // If one iterator is done we want to read from the other one, so say the other one is lower. + if ( mDone ) + return RIGHT_BEFORE; + if ( pDone ) + return LEFT_BEFORE; + + return compareDottedFieldNames( m->first, p->first.c_str() ); + } + + bool mayAddEmbedded( map< string, BSONElement > &existing, string right ) { + for( string left = EmbeddedBuilder::splitDot( right ); + left.length() > 0 && left[ left.length() - 1 ] != '.'; + left += "." 
+ EmbeddedBuilder::splitDot( right ) ) { + if ( existing.count( left ) > 0 && existing[ left ].type() != Object ) + return false; + if ( haveModForField( left.c_str() ) ) + return false; + } + return true; + } + static Mod::Op opFromStr( const char *fn ) { + assert( fn[0] == '$' ); + switch( fn[1] ) { + case 'i': { + if ( fn[2] == 'n' && fn[3] == 'c' && fn[4] == 0 ) + return Mod::INC; + break; + } + case 's': { + if ( fn[2] == 'e' && fn[3] == 't' && fn[4] == 0 ) + return Mod::SET; + break; + } + case 'p': { + if ( fn[2] == 'u' ) { + if ( fn[3] == 's' && fn[4] == 'h' ) { + if ( fn[5] == 0 ) + return Mod::PUSH; + if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 ) + return Mod::PUSH_ALL; + } + else if ( fn[3] == 'l' && fn[4] == 'l' ) { + if ( fn[5] == 0 ) + return Mod::PULL; + if ( fn[5] == 'A' && fn[6] == 'l' && fn[7] == 'l' && fn[8] == 0 ) + return Mod::PULL_ALL; + } + } + else if ( fn[2] == 'o' && fn[3] == 'p' && fn[4] == 0 ) + return Mod::POP; + break; + } + case 'u': { + if ( fn[2] == 'n' && fn[3] == 's' && fn[4] == 'e' && fn[5] == 't' && fn[6] == 0 ) + return Mod::UNSET; + break; + } + case 'b': { + if ( fn[2] == 'i' && fn[3] == 't' ) { + if ( fn[4] == 0 ) + return Mod::BIT; + if ( fn[4] == 'a' && fn[5] == 'n' && fn[6] == 'd' && fn[7] == 0 ) + return Mod::BITAND; + if ( fn[4] == 'o' && fn[5] == 'r' && fn[6] == 0 ) + return Mod::BITOR; + } + break; + } + case 'a': { + if ( fn[2] == 'd' && fn[3] == 'd' ) { + // add + if ( fn[4] == 'T' && fn[5] == 'o' && fn[6] == 'S' && fn[7] == 'e' && fn[8] == 't' && fn[9] == 0 ) + return Mod::ADDTOSET; + + } + break; + } + case 'r': { + if ( fn[2] == 'e' && fn[3] == 'n' && fn[4] == 'a' && fn[5] == 'm' && fn[6] =='e' ) { + return Mod::RENAME_TO; // with this return code we handle both RENAME_TO and RENAME_FROM + } + break; + } + default: break; + } + uassert( 10161 , "Invalid modifier specified " + string( fn ), false ); + return Mod::INC; + } + + ModSet() {} + + void updateIsIndexed( const Mod &m, const set<string> &idxKeys, const set<string> *backgroundKeys ) { + if ( m.isIndexed( idxKeys ) || + (backgroundKeys && m.isIndexed(*backgroundKeys)) ) { + _isIndexed++; + } + } + + public: + + ModSet( const BSONObj &from , + const set<string>& idxKeys = set<string>(), + const set<string>* backgroundKeys = 0 + ); + + // TODO: this is inefficient - should probably just handle when iterating + ModSet * fixDynamicArray( const char * elemMatchKey ) const; + + bool hasDynamicArray() const { return _hasDynamicArray; } + + /** + * creates a ModSetState suitable for operation on obj + * doesn't change or modify this ModSet or any underying Mod + */ + auto_ptr<ModSetState> prepare( const BSONObj& obj ) const; + + /** + * given a query pattern, builds an object suitable for an upsert + * will take the query spec and combine all $ operators + */ + BSONObj createNewFromQuery( const BSONObj& query ); + + /** + * + */ + int isIndexed() const { + return _isIndexed; + } + + unsigned size() const { return _mods.size(); } + + bool haveModForField( const char *fieldName ) const { + return _mods.find( fieldName ) != _mods.end(); + } + + bool haveConflictingMod( const string& fieldName ) { + size_t idx = fieldName.find( '.' 
); + if ( idx == string::npos ) + idx = fieldName.size(); + + ModHolder::const_iterator start = _mods.lower_bound(fieldName.substr(0,idx)); + for ( ; start != _mods.end(); start++ ) { + FieldCompareResult r = compareDottedFieldNames( fieldName , start->first ); + switch ( r ) { + case LEFT_SUBFIELD: return true; + case LEFT_BEFORE: return false; + case SAME: return true; + case RIGHT_BEFORE: return false; + case RIGHT_SUBFIELD: return true; + } + } + return false; + + + } + + }; + + /** + * stores any information about a single Mod operating on a single Object + */ + class ModState { + public: + const Mod * m; + BSONElement old; + BSONElement newVal; + BSONObj _objData; + + const char * fixedOpName; + BSONElement * fixed; + int pushStartSize; + + BSONType incType; + int incint; + double incdouble; + long long inclong; + + bool dontApply; + + ModState() { + fixedOpName = 0; + fixed = 0; + pushStartSize = -1; + incType = EOO; + dontApply = false; + } + + Mod::Op op() const { + return m->op; + } + + const char * fieldName() const { + return m->fieldName; + } + + bool needOpLogRewrite() const { + if ( dontApply ) + return false; + + if ( fixed || fixedOpName || incType ) + return true; + + switch( op() ) { + case Mod::RENAME_FROM: + case Mod::RENAME_TO: + return true; + case Mod::BIT: + case Mod::BITAND: + case Mod::BITOR: + // TODO: should we convert this to $set? + return false; + default: + return false; + } + } + + void appendForOpLog( BSONObjBuilder& b ) const; + + template< class Builder > + void apply( Builder& b , BSONElement in ) { + m->apply( b , in , *this ); + } + + template< class Builder > + void appendIncValue( Builder& b , bool useFullName ) const { + const char * n = useFullName ? m->fieldName : m->shortFieldName; + + switch ( incType ) { + case NumberDouble: + b.append( n , incdouble ); break; + case NumberLong: + b.append( n , inclong ); break; + case NumberInt: + b.append( n , incint ); break; + default: + assert(0); + } + } + + string toString() const; + + template< class Builder > + void handleRename( Builder &newObjBuilder, const char *shortFieldName ); + }; + + /** + * this is used to hold state, meta data while applying a ModSet to a BSONObj + * the goal is to make ModSet const so its re-usable + */ + class ModSetState : boost::noncopyable { + struct FieldCmp { + bool operator()( const string &l, const string &r ) const; + }; + typedef map<string,ModState,FieldCmp> ModStateHolder; + const BSONObj& _obj; + ModStateHolder _mods; + bool _inPlacePossible; + BSONObj _newFromMods; // keep this data alive, as oplog generation may depend on it + + ModSetState( const BSONObj& obj ) + : _obj( obj ) , _inPlacePossible(true) { + } + + /** + * @return if in place is still possible + */ + bool amIInPlacePossible( bool inPlacePossible ) { + if ( ! 
inPlacePossible ) + _inPlacePossible = false; + return _inPlacePossible; + } + + template< class Builder > + void createNewFromMods( const string& root , Builder& b , const BSONObj &obj ); + + template< class Builder > + void _appendNewFromMods( const string& root , ModState& m , Builder& b , set<string>& onedownseen ); + + template< class Builder > + void appendNewFromMod( ModState& ms , Builder& b ) { + if ( ms.dontApply ) { + return; + } + + //const Mod& m = *(ms.m); // HACK + Mod& m = *((Mod*)(ms.m)); // HACK + + switch ( m.op ) { + + case Mod::PUSH: { + if ( m.isEach() ) { + b.appendArray( m.shortFieldName, m.getEach() ); + } else { + BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) ); + arr.appendAs( m.elt, "0" ); + arr.done(); + } + break; + } + case Mod::ADDTOSET: { + if ( m.isEach() ) { + // Remove any duplicates in given array + BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) ); + BSONElementSet toadd; + m.parseEach( toadd ); + BSONObjIterator i( m.getEach() ); + int n = 0; + while ( i.more() ) { + BSONElement e = i.next(); + if ( toadd.count(e) ) { + arr.appendAs( e , BSONObjBuilder::numStr( n++ ) ); + toadd.erase( e ); + } + } + arr.done(); + } + else { + BSONObjBuilder arr( b.subarrayStart( m.shortFieldName ) ); + arr.appendAs( m.elt, "0" ); + arr.done(); + } + break; + } + + case Mod::PUSH_ALL: { + b.appendAs( m.elt, m.shortFieldName ); + break; + } + + case Mod::UNSET: + case Mod::PULL: + case Mod::PULL_ALL: + // no-op b/c unset/pull of nothing does nothing + break; + + case Mod::INC: + ms.fixedOpName = "$set"; + case Mod::SET: { + m._checkForAppending( m.elt ); + b.appendAs( m.elt, m.shortFieldName ); + break; + } + // shouldn't see RENAME_FROM here + case Mod::RENAME_TO: + ms.handleRename( b, m.shortFieldName ); + break; + default: + stringstream ss; + ss << "unknown mod in appendNewFromMod: " << m.op; + throw UserException( 9015, ss.str() ); + } + + } + + public: + + bool canApplyInPlace() const { + return _inPlacePossible; + } + + /** + * modified underlying _obj + * @param isOnDisk - true means this is an on disk object, and this update needs to be made durable + */ + void applyModsInPlace( bool isOnDisk ); + + BSONObj createNewFromMods(); + + // re-writing for oplog + + bool needOpLogRewrite() const { + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) + if ( i->second.needOpLogRewrite() ) + return true; + return false; + } + + BSONObj getOpLogRewrite() const { + BSONObjBuilder b; + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) + i->second.appendForOpLog( b ); + return b.obj(); + } + + bool haveArrayDepMod() const { + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) + if ( i->second.m->arrayDep() ) + return true; + return false; + } + + void appendSizeSpecForArrayDepMods( BSONObjBuilder &b ) const { + for ( ModStateHolder::const_iterator i = _mods.begin(); i != _mods.end(); i++ ) { + const ModState& m = i->second; + if ( m.m->arrayDep() ) { + if ( m.pushStartSize == -1 ) + b.appendNull( m.fieldName() ); + else + b << m.fieldName() << BSON( "$size" << m.pushStartSize ); + } + } + } + + string toString() const; + + friend class ModSet; + }; + +} + diff --git a/src/mongo/db/pagefault.cpp b/src/mongo/db/pagefault.cpp new file mode 100644 index 00000000000..4b9b1b23e02 --- /dev/null +++ b/src/mongo/db/pagefault.cpp @@ -0,0 +1,55 @@ +// @file pagefault.cpp + +#include "pch.h" +#include "diskloc.h" +#include "pagefault.h" +#include "client.h" +#include "pdfile.h" 
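+// usage sketch (see the example block in pagefault.h): construct a PageFaultRetryableSection, +// then loop: try { take the lock and do the work; break; } catch( PageFaultException& e ) { e.touch(); } +// touch() runs without the dbmutex held, so the faulting record is paged in before retrying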
+#include "server.h" + +namespace mongo { + + PageFaultException::PageFaultException(Record *_r) + { + assert( cc()._pageFaultRetryableSection != 0 ); + cc()._pageFaultRetryableSection->_laps++; + assert( cc()._pageFaultRetryableSection->_laps < 1000 ); + r = _r; + era = LockMongoFilesShared::getEra(); + } + + void PageFaultException::touch() { + assert( !d.dbMutex.atLeastReadLocked() ); + LockMongoFilesShared lk; + if( LockMongoFilesShared::getEra() != era ) { + // files opened and closed. we don't try to handle but just bail out; this is much simpler + // and less error prone and saves us from taking a dbmutex readlock. + dlog(2) << "era changed" << endl; + return; + } + r->touch(); + } + + PageFaultRetryableSection::~PageFaultRetryableSection() { + cc()._pageFaultRetryableSection = old; + } + PageFaultRetryableSection::PageFaultRetryableSection() { + _laps = 0; + old = cc()._pageFaultRetryableSection; + if( d.dbMutex.atLeastReadLocked() ) { + cc()._pageFaultRetryableSection = 0; + if( debug || logLevel > 2 ) { + LOGSOME << "info PageFaultRetryableSection will not yield, already locked upon reaching" << endl; + } + } + else if( cc()._pageFaultRetryableSection ) { + cc()._pageFaultRetryableSection = 0; + dlog(2) << "info nested PageFaultRetryableSection will not yield on fault" << endl; + } + else { + cc()._pageFaultRetryableSection = this; + cc()._hasWrittenThisPass = false; + } + } + +} diff --git a/src/mongo/db/pagefault.h b/src/mongo/db/pagefault.h new file mode 100644 index 00000000000..8bbf4ecab52 --- /dev/null +++ b/src/mongo/db/pagefault.h @@ -0,0 +1,46 @@ +// @file pagefault.h + +// define this : _PAGEFAULTEXCEPTION + +#pragma once + +namespace mongo { + + class Record; + + class PageFaultException /*: public DBException*/ { + unsigned era; + Record *r; + public: + PageFaultException(const PageFaultException& rhs) : era(rhs.era), r(rhs.r) { } + explicit PageFaultException(Record*); + void touch(); + }; + + class PageFaultRetryableSection : boost::noncopyable { + PageFaultRetryableSection *old; + public: + unsigned _laps; + PageFaultRetryableSection(); + ~PageFaultRetryableSection(); + }; +#if 0 + inline void how_to_use_example() { + // ... + { + PageFaultRetryableSection s; + while( 1 ) { + try { + writelock lk; // or readlock + // do work + break; + } + catch( PageFaultException& e ) { + e.touch(); + } + } + } + // ... + } +#endif +} diff --git a/src/mongo/db/pcre.txt b/src/mongo/db/pcre.txt new file mode 100644 index 00000000000..3e21047eabc --- /dev/null +++ b/src/mongo/db/pcre.txt @@ -0,0 +1,15 @@ + + +You need to install pcre. + +This could be scripted: + +cd /tmp +curl -O ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-7.4.tar.gz +tar -xzf pcre-7.4.tar.gz +./configure --enable-utf8 --with-match-limit=200000 --with-match-limit-recursion=4000 +make +make install + + +At that point is will be installed in /usr/*. the version in p/pcre-7.4 is for VC++. diff --git a/src/mongo/db/pdfile.cpp b/src/mongo/db/pdfile.cpp new file mode 100644 index 00000000000..069eeadec37 --- /dev/null +++ b/src/mongo/db/pdfile.cpp @@ -0,0 +1,2425 @@ +// pdfile.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* +todo: +_ table scans must be sequential, not next/prev pointers +_ coalesce deleted +_ disallow system* manipulations from the database. +*/ + +#include "pch.h" +#include "pdfile.h" +#include "db.h" +#include "../util/mmap.h" +#include "../util/hashtab.h" +#include "../util/file_allocator.h" +#include "../util/processinfo.h" +#include "../util/file.h" +#include "btree.h" +#include "btreebuilder.h" +#include <algorithm> +#include <list> +#include "repl.h" +#include "dbhelpers.h" +#include "namespace-inl.h" +#include "queryutil.h" +#include "extsort.h" +#include "curop-inl.h" +#include "background.h" +#include "compact.h" +#include "ops/delete.h" +#include "instance.h" +#include "replutil.h" + +namespace mongo { + + BOOST_STATIC_ASSERT( sizeof(Extent)-4 == 48+128 ); + BOOST_STATIC_ASSERT( sizeof(DataFileHeader)-4 == 8192 ); + + void printMemInfo( const char * where ) { + cout << "mem info: "; + if ( where ) + cout << where << " "; + ProcessInfo pi; + if ( ! pi.supported() ) { + cout << " not supported" << endl; + return; + } + + cout << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() << " mapped: " << ( MemoryMappedFile::totalMappedLength() / ( 1024 * 1024 ) ) << endl; + } + + bool isValidNS( const StringData& ns ) { + // TODO: should check for invalid characters + + const char * x = strchr( ns.data() , '.' ); + if ( ! x ) + return false; + + x++; + return *x > 0; + } + + bool inDBRepair = false; + struct doingRepair { + doingRepair() { + assert( ! inDBRepair ); + inDBRepair = true; + } + ~doingRepair() { + inDBRepair = false; + } + }; + + map<string, unsigned> BackgroundOperation::dbsInProg; + set<string> BackgroundOperation::nsInProg; + + bool BackgroundOperation::inProgForDb(const char *db) { + assertInWriteLock(); + return dbsInProg[db] != 0; + } + + bool BackgroundOperation::inProgForNs(const char *ns) { + assertInWriteLock(); + return nsInProg.count(ns) != 0; + } + + void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) { + uassert(12586, "cannot perform operation: a background operation is currently running for this database", + !inProgForDb(db)); + } + + void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) { + uassert(12587, "cannot perform operation: a background operation is currently running for this collection", + !inProgForNs(ns)); + } + + BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) { + assertInWriteLock(); + dbsInProg[_ns.db]++; + assert( nsInProg.count(_ns.ns()) == 0 ); + nsInProg.insert(_ns.ns()); + } + + BackgroundOperation::~BackgroundOperation() { + wassert( d.dbMutex.isWriteLocked() ); + dbsInProg[_ns.db]--; + nsInProg.erase(_ns.ns()); + } + + void BackgroundOperation::dump(stringstream& ss) { + if( nsInProg.size() ) { + ss << "\n<b>Background Jobs in Progress</b>\n"; + for( set<string>::iterator i = nsInProg.begin(); i != nsInProg.end(); i++ ) + ss << " " << *i << '\n'; + } + for( map<string,unsigned>::iterator i = dbsInProg.begin(); i != dbsInProg.end(); i++ ) { + if( i->second ) + ss << "database " << i->first << ": " << i->second << '\n'; + } + } + + /* ----------------------------------------- */ + + string dbpath = "/data/db/"; + const char FREELIST_NS[] = ".$freelist"; + bool directoryperdb = false; + string repairpath; + string pidfilepath; + + DataFileMgr 
theDataFileMgr; + DatabaseHolder _dbHolder; + int MAGIC = 0x1000; + + DatabaseHolder& dbHolderUnchecked() { + return _dbHolder; + } + + void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0); + void ensureIdIndexForNewNs(const char *ns) { + if ( ( strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false ) ) && + strstr( ns, FREELIST_NS ) == 0 ) { + log( 1 ) << "adding _id index for collection " << ns << endl; + ensureHaveIdIndex( ns ); + } + } + + string getDbContext() { + stringstream ss; + Client * c = currentClient.get(); + if ( c ) { + Client::Context * cx = c->getContext(); + if ( cx ) { + Database *database = cx->db(); + if ( database ) { + ss << database->name << ' '; + ss << cx->ns() << ' '; + } + } + } + return ss.str(); + } + + /*---------------------------------------------------------------------*/ + + // inheritable class to implement an operation that may be applied to all + // files in a database using _applyOpToDataFiles() + class FileOp { + public: + virtual ~FileOp() {} + // Return true if file exists and operation successful + virtual bool apply( const boost::filesystem::path &p ) = 0; + virtual const char * op() const = 0; + }; + + void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath ); + + void _deleteDataFiles(const char *database) { + if ( directoryperdb ) { + FileAllocator::get()->waitUntilFinished(); + MONGO_BOOST_CHECK_EXCEPTION_WITH_MSG( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ), "delete data files with a directoryperdb" ); + return; + } + class : public FileOp { + virtual bool apply( const boost::filesystem::path &p ) { + return boost::filesystem::remove( p ); + } + virtual const char * op() const { + return "remove"; + } + } deleter; + _applyOpToDataFiles( database, deleter, true ); + } + + int Extent::initialSize(int len) { + long long sz = len * 16; + if ( len < 1000 ) sz = len * 64; + if ( sz > 1000000000 ) + sz = 1000000000; + int z = ((int)sz) & 0xffffff00; + assert( z > len ); + return z; + } + + bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) { + if ( nsdetails(ns) ) { + err = "collection already exists"; + return false; + } + + log(1) << "create collection " << ns << ' ' << options << endl; + + /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field + and then go back and set to ok : 1 after we are done. + */ + bool isFreeList = strstr(ns, FREELIST_NS) != 0; + if( !isFreeList ) + addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options); + + long long size = Extent::initialSize(128); + { + BSONElement e = options.getField("size"); + if ( e.isNumber() ) { + size = e.numberLong(); + size += 256; + size &= 0xffffffffffffff00LL; + } + } + + uassert( 10083 , "create collection invalid size spec", size > 0 ); + + bool newCapped = false; + int mx = 0; + if( options["capped"].trueValue() ) { + newCapped = true; + BSONElement e = options.getField("max"); + if ( e.isNumber() ) { + mx = e.numberInt(); + } + } + + // $nExtents just for debug/testing. + BSONElement e = options.getField( "$nExtents" ); + Database *database = cc().database(); + if ( e.type() == Array ) { + // We create one extent per array entry, with size specified + // by the array value. 
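+ // e.g. an options spec of { $nExtents : [ 4096, 8192 ] } requests one 4096 byte and one 8192 byte extent (sizes here are only illustrative)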
+            BSONObjIterator i( e.embeddedObject() );
+            while( i.more() ) {
+                BSONElement e = i.next();
+                int size = int( e.number() );
+                assert( size <= 0x7fffffff );
+                // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictability
+                // in the extent size used by our tests
+                database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+            }
+        }
+        else if ( int( e.number() ) > 0 ) {
+            // We create '$nExtents' extents, each of size 'size'.
+            int nExtents = int( e.number() );
+            assert( size <= 0x7fffffff );
+            for ( int i = 0; i < nExtents; ++i ) {
+                assert( size <= 0x7fffffff );
+                // $nExtents is just for testing - always allocate new extents
+                // rather than reuse existing extents so we have some predictability
+                // in the extent size used by our tests
+                database->suitableFile( ns, (int) size, false, false )->createExtent( ns, (int) size, newCapped );
+            }
+        }
+        else {
+            // This is the non-test case, where we don't have a $nExtents spec.
+            while ( size > 0 ) {
+                int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
+                int desiredExtentSize = (int) (size > max ? max : size);
+                if ( desiredExtentSize < Extent::minSize() ) {
+                    desiredExtentSize = Extent::minSize();
+                }
+                desiredExtentSize &= 0xffffff00;
+                Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped, true );
+                size -= e->length;
+            }
+        }
+
+        NamespaceDetails *d = nsdetails(ns);
+        assert(d);
+
+        bool ensure = false;
+        if ( options.getField( "autoIndexId" ).type() ) {
+            if ( options["autoIndexId"].trueValue() ) {
+                ensure = true;
+            }
+        }
+        else {
+            if ( !newCapped ) {
+                ensure = true;
+            }
+        }
+        if( ensure ) {
+            if( deferIdIndex )
+                *deferIdIndex = true;
+            else
+                ensureIdIndexForNewNs( ns );
+        }
+
+        if ( mx > 0 )
+            getDur().writingInt( d->max ) = mx;
+
+        return true;
+    }
+
+    /** { ..., capped: true, size: ..., max: ... }
+        @param deferIdIndex - if not null, defers id index creation. sets the bool value to true if we wanted to create the id index.
+        @return true if successful
+    */
+    bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
+        const char *coll = strchr( ns, '.'
) + 1; + massert( 10356 , str::stream() << "invalid ns: " << ns , NamespaceString::validCollectionName(ns)); + char cl[ 256 ]; + nsToDatabase( ns, cl ); + bool ok = _userCreateNS(ns, options, err, deferIdIndex); + if ( logForReplication && ok ) { + if ( options.getField( "create" ).eoo() ) { + BSONObjBuilder b; + b << "create" << coll; + b.appendElements( options ); + options = b.obj(); + } + string logNs = string( cl ) + ".$cmd"; + logOp("c", logNs.c_str(), options); + } + return ok; + } + + /*---------------------------------------------------------------------*/ + + int MongoDataFile::maxSize() { + if ( sizeof( int* ) == 4 ) { + return 512 * 1024 * 1024; + } + else if ( cmdLine.smallfiles ) { + return 0x7ff00000 >> 2; + } + else { + return 0x7ff00000; + } + } + + NOINLINE_DECL void MongoDataFile::badOfs2(int ofs) const { + stringstream ss; + ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; + uasserted(13441, ss.str()); + } + + NOINLINE_DECL void MongoDataFile::badOfs(int ofs) const { + stringstream ss; + ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database"; + uasserted(13440, ss.str()); + } + + int MongoDataFile::defaultSize( const char *filename ) const { + int size; + if ( fileNo <= 4 ) + size = (64*1024*1024) << fileNo; + else + size = 0x7ff00000; + if ( cmdLine.smallfiles ) { + size = size >> 2; + } + return size; + } + + static void check(void *_mb) { + if( sizeof(char *) == 4 ) + uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0); + else + uassert( 10085 , "can't map file memory", _mb != 0); + } + + /** @return true if found and opened. if uninitialized (prealloc only) does not open. */ + bool MongoDataFile::openExisting( const char *filename ) { + assert( _mb == 0 ); + if( !exists(filename) ) + return false; + if( !mmf.open(filename,false) ) { + dlog(2) << "info couldn't open " << filename << " probably end of datafile list" << endl; + return false; + } + _mb = mmf.getView(); assert(_mb); + unsigned long long sz = mmf.length(); + assert( sz <= 0x7fffffff ); + assert( sz % 4096 == 0 ); + if( sz < 64*1024*1024 && !cmdLine.smallfiles ) { + if( sz >= 16*1024*1024 && sz % (1024*1024) == 0 ) { + log() << "info openExisting file size " << sz << " but cmdLine.smallfiles=false" << endl; + } + else { + log() << "openExisting size " << sz << " less then minimum file size expectation " << filename << endl; + assert(false); + } + } + check(_mb); + if( header()->uninitialized() ) + return false; + return true; + } + + void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) { + long size = defaultSize( filename ); + while ( size < minSize ) { + if ( size < maxSize() / 2 ) + size *= 2; + else { + size = maxSize(); + break; + } + } + if ( size > maxSize() ) + size = maxSize(); + + assert( size >= 64*1024*1024 || cmdLine.smallfiles ); + assert( size % 4096 == 0 ); + + if ( preallocateOnly ) { + if ( cmdLine.prealloc ) { + FileAllocator::get()->requestAllocation( filename, size ); + } + return; + } + + { + assert( _mb == 0 ); + unsigned long long sz = size; + if( mmf.create(filename, sz, false) ) + _mb = mmf.getView(); + assert( sz <= 0x7fffffff ); + size = (int) sz; + } + check(_mb); + header()->init(fileNo, size, filename); + } + + void MongoDataFile::flush( bool sync ) { + mmf.flush( sync ); + } + + void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) { + 
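        /* [editorial note] addNewExtentToNamespace() appends extent e (at eloc) to the
           tail of the namespace's doubly-linked extent chain -- creating the namespace
           entry first if this is its initial extent -- then registers the extent's empty
           interior (emptyLoc) as a deleted record so inserts can allocate from it. */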
NamespaceIndex *ni = nsindex(ns); + NamespaceDetails *details = ni->details(ns); + if ( details ) { + assert( !details->lastExtent.isNull() ); + assert( !details->firstExtent.isNull() ); + getDur().writingDiskLoc(e->xprev) = details->lastExtent; + getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc; + assert( !eloc.isNull() ); + getDur().writingDiskLoc(details->lastExtent) = eloc; + } + else { + ni->add_ns(ns, eloc, capped); + details = ni->details(ns); + } + + { + NamespaceDetails *dw = details->writingWithoutExtra(); + dw->lastExtentSize = e->length; + } + details->addDeletedRec(emptyLoc.drec(), emptyLoc); + } + + Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) { + { + // make sizes align with VM page size + int newSize = (approxSize + 0xfff) & 0xfffff000; + assert( newSize >= 0 ); + if( newSize < Extent::maxSize() ) + approxSize = newSize; + } + massert( 10357 , "shutdown in progress", ! inShutdown() ); + massert( 10358 , "bad new extent size", approxSize >= Extent::minSize() && approxSize <= Extent::maxSize() ); + massert( 10359 , "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed + int ExtentSize = min(header()->unusedLength, approxSize); + DiskLoc loc; + if ( ExtentSize < Extent::minSize() ) { + /* note there could be a lot of looping here is db just started and + no files are open yet. we might want to do something about that. */ + if ( loops > 8 ) { + assert( loops < 10000 ); + out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n'; + } + log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n"; + return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1); + } + int offset = header()->unused.getOfs(); + + DataFileHeader *h = header(); + h->unused.writing().set( fileNo, offset + ExtentSize ); + getDur().writingInt(h->unusedLength) = h->unusedLength - ExtentSize; + loc.set(fileNo, offset); + Extent *e = _getExtent(loc); + DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset, newCapped); + + addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped); + + DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset + << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl; + return e; + } + + Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) { + string s = cc().database()->name + FREELIST_NS; + NamespaceDetails *f = nsdetails(s.c_str()); + if( f ) { + int low, high; + if( capped ) { + // be strict about the size + low = approxSize; + if( low > 2048 ) low -= 256; + high = (int) (approxSize * 1.05) + 256; + } + else { + low = (int) (approxSize * 0.8); + high = (int) (approxSize * 1.4); + } + if( high <= 0 ) { + // overflowed + high = max(approxSize, Extent::maxSize()); + } + int n = 0; + Extent *best = 0; + int bestDiff = 0x7fffffff; + { + Timer t; + DiskLoc L = f->firstExtent; + while( !L.isNull() ) { + Extent * e = L.ext(); + if( e->length >= low && e->length <= high ) { + int diff = abs(e->length - approxSize); + if( diff < bestDiff ) { + bestDiff = diff; + best = e; + if( ((double) diff) / approxSize < 0.1 ) { + // close enough + break; + } + if( t.seconds() >= 2 ) { + // have spent lots of time in write lock, and we are in [low,high], so close enough + // could come into play if extent freelist is very long + break; + } + } + else { + OCCASIONALLY { + if( high < 64 * 1024 && t.seconds() >= 2 ) { + 
// be less picky if it is taking a long time + high = 64 * 1024; + } + } + } + } + L = e->xnext; + ++n; + } + if( t.seconds() >= 10 ) { + log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl; + } + } + + if( n > 128 ) log( n < 512 ) << "warning: newExtent " << n << " scanned\n"; + + if( best ) { + Extent *e = best; + // remove from the free list + if( !e->xprev.isNull() ) + e->xprev.ext()->xnext.writing() = e->xnext; + if( !e->xnext.isNull() ) + e->xnext.ext()->xprev.writing() = e->xprev; + if( f->firstExtent == e->myLoc ) + f->firstExtent.writing() = e->xnext; + if( f->lastExtent == e->myLoc ) + f->lastExtent.writing() = e->xprev; + + // use it + OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n"; + DiskLoc emptyLoc = e->reuse(ns, capped); + addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped); + return e; + } + } + + return 0; + // return createExtent(ns, approxSize, capped); + } + + /*---------------------------------------------------------------------*/ + + void Extent::markEmpty() { + xnext.Null(); + xprev.Null(); + firstRecord.Null(); + lastRecord.Null(); + } + + DiskLoc Extent::reuse(const char *nsname, bool capped) { + return getDur().writing(this)->_reuse(nsname, capped); + } + + void getEmptyLoc(const char *ns, const DiskLoc extentLoc, int extentLength, bool capped, /*out*/DiskLoc& emptyLoc, /*out*/int& delRecLength) { + emptyLoc = extentLoc; + emptyLoc.inc( Extent::HeaderSize() ); + delRecLength = extentLength - Extent::HeaderSize(); + if( delRecLength >= 32*1024 && str::contains(ns, '$') && !capped ) { + // probably an index. so skip forward to keep its records page aligned + int& ofs = emptyLoc.GETOFS(); + int newOfs = (ofs + 0xfff) & ~0xfff; + delRecLength -= (newOfs-ofs); + dassert( delRecLength > 0 ); + ofs = newOfs; + } + } + + DiskLoc Extent::_reuse(const char *nsname, bool capped) { + LOG(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n'; + massert( 10360 , "Extent::reset bad magic value", magic == 0x41424344 ); + nsDiagnostic = nsname; + markEmpty(); + + DiskLoc emptyLoc; + int delRecLength; + getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength); + + // todo: some dup code here and below in Extent::init + DeletedRecord *empty = DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength); + empty = getDur().writing(empty); + empty->lengthWithHeaders = delRecLength; + empty->extentOfs = myLoc.getOfs(); + empty->nextDeleted.Null(); + + return emptyLoc; + } + + /* assumes already zeroed -- insufficient for block 'reuse' perhaps */ + DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) { + magic = 0x41424344; + myLoc.set(_fileNo, _offset); + xnext.Null(); + xprev.Null(); + nsDiagnostic = nsname; + length = _length; + firstRecord.Null(); + lastRecord.Null(); + + DiskLoc emptyLoc; + int delRecLength; + getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength); + + DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) ); + empty->lengthWithHeaders = delRecLength; + empty->extentOfs = myLoc.getOfs(); + + return emptyLoc; + } + + /* + Record* Extent::newRecord(int len) { + if( firstEmptyRegion.isNull() )8 + return 0; + + assert(len > 0); + int newRecSize = len + Record::HeaderSize; + DiskLoc newRecordLoc = firstEmptyRegion; + Record *r = getRecord(newRecordLoc); + int left = r->netLength() - len; + if( left < 0 ) { + // + firstEmptyRegion.Null(); + return 0; + } + + DiskLoc nextEmpty = 
r->next.getNextEmpty(firstEmptyRegion); + r->lengthWithHeaders = newRecSize; + r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent + if( !lastRecord.isNull() ) { + assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one + getRecord(lastRecord)->next.set(newRecordLoc); // until now + r->prev.set(lastRecord); + } + else { + r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent + assert( firstRecord.isNull() ); + firstRecord = newRecordLoc; + } + lastRecord = newRecordLoc; + + if( left < Record::HeaderSize + 32 ) { + firstEmptyRegion.Null(); + } + else { + firstEmptyRegion.inc(newRecSize); + Record *empty = getRecord(firstEmptyRegion); + empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null. + empty->prev.Null(); + empty->lengthWithHeaders = left; + } + + return r; + } + */ + + int Extent::maxSize() { + int maxExtentSize = 0x7ff00000; + if ( cmdLine.smallfiles ) { + maxExtentSize >>= 2; + } + return maxExtentSize; + } + + /*---------------------------------------------------------------------*/ + + shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) { + NamespaceDetails * d = nsdetails( ns ); + if ( ! d ) + return shared_ptr<Cursor>(new BasicCursor(DiskLoc())); + + DiskLoc loc = d->firstExtent; + Extent *e = getExtent(loc); + + DEBUGGING { + out() << "listing extents for " << ns << endl; + DiskLoc tmp = loc; + set<DiskLoc> extents; + + while ( 1 ) { + Extent *f = getExtent(tmp); + out() << "extent: " << tmp.toString() << endl; + extents.insert(tmp); + tmp = f->xnext; + if ( tmp.isNull() ) + break; + f = f->getNextExtent(); + } + + out() << endl; + d->dumpDeleted(&extents); + } + + if ( d->capped ) + return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) ); + + if ( !startLoc.isNull() ) + return shared_ptr<Cursor>(new BasicCursor( startLoc )); + + while ( e->firstRecord.isNull() && !e->xnext.isNull() ) { + /* todo: if extent is empty, free it for reuse elsewhere. + that is a bit complicated have to clean up the freelists. + */ + RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead. ns:" << ns << endl; + // find a nonempty extent + // it might be nice to free the whole extent here! but have to clean up free recs then. + e = e->getNextExtent(); + } + return shared_ptr<Cursor>(new BasicCursor( e->firstRecord )); + } + + /* get a table scan cursor, but can be forward or reverse direction. + order.$natural - if set, > 0 means forward (asc), < 0 backward (desc). 
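       [editorial example] e.g. findTableScan( "test.foo", BSON( "$natural" << -1 ), DiskLoc() )
       returns a cursor that walks the collection in reverse insertion order.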
+ */ + shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) { + BSONElement el = order.getField("$natural"); // e.g., { $natural : -1 } + + if ( el.number() >= 0 ) + return DataFileMgr::findAll(ns, startLoc); + + // "reverse natural order" + NamespaceDetails *d = nsdetails(ns); + + if ( !d ) + return shared_ptr<Cursor>(new BasicCursor(DiskLoc())); + + if ( !d->capped ) { + if ( !startLoc.isNull() ) + return shared_ptr<Cursor>(new ReverseCursor( startLoc )); + Extent *e = d->lastExtent.ext(); + while ( e->lastRecord.isNull() && !e->xprev.isNull() ) { + OCCASIONALLY out() << " findTableScan: extent empty, skipping ahead" << endl; + e = e->getPrevExtent(); + } + return shared_ptr<Cursor>(new ReverseCursor( e->lastRecord )); + } + else { + return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) ); + } + } + + void printFreeList() { + string s = cc().database()->name + FREELIST_NS; + log() << "dump freelist " << s << endl; + NamespaceDetails *freeExtents = nsdetails(s.c_str()); + if( freeExtents == 0 ) { + log() << " freeExtents==0" << endl; + return; + } + DiskLoc a = freeExtents->firstExtent; + while( !a.isNull() ) { + Extent *e = a.ext(); + log() << " extent " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << endl; + a = e->xnext; + } + + log() << "end freelist" << endl; + } + + /** free a list of extents that are no longer in use. this is a double linked list of extents + (could be just one in the list) + */ + void freeExtents(DiskLoc firstExt, DiskLoc lastExt) { + { + assert( !firstExt.isNull() && !lastExt.isNull() ); + Extent *f = firstExt.ext(); + Extent *l = lastExt.ext(); + assert( f->xprev.isNull() ); + assert( l->xnext.isNull() ); + assert( f==l || !f->xnext.isNull() ); + assert( f==l || !l->xprev.isNull() ); + } + + string s = cc().database()->name + FREELIST_NS; + NamespaceDetails *freeExtents = nsdetails(s.c_str()); + if( freeExtents == 0 ) { + string err; + _userCreateNS(s.c_str(), BSONObj(), err, 0); // todo: this actually allocates an extent, which is bad! 
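            /* [editorial note] once the .$freelist namespace exists, the freed chain
               [firstExt..lastExt] is spliced in ahead of its current first extent below,
               so freeing is O(1) regardless of how many extents the chain contains. */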
+ freeExtents = nsdetails(s.c_str()); + massert( 10361 , "can't create .$freelist", freeExtents); + } + if( freeExtents->firstExtent.isNull() ) { + freeExtents->firstExtent.writing() = firstExt; + freeExtents->lastExtent.writing() = lastExt; + } + else { + DiskLoc a = freeExtents->firstExtent; + assert( a.ext()->xprev.isNull() ); + getDur().writingDiskLoc( a.ext()->xprev ) = lastExt; + getDur().writingDiskLoc( lastExt.ext()->xnext ) = a; + getDur().writingDiskLoc( freeExtents->firstExtent ) = firstExt; + } + + //printFreeList(); + } + + /* drop a collection/namespace */ + void dropNS(const string& nsToDrop) { + NamespaceDetails* d = nsdetails(nsToDrop.c_str()); + uassert( 10086 , (string)"ns not found: " + nsToDrop , d ); + + BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str()); + + NamespaceString s(nsToDrop); + assert( s.db == cc().database()->name ); + if( s.isSystem() ) { + if( s.coll == "system.profile" ) + uassert( 10087 , "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 ); + else + uasserted( 12502, "can't drop system ns" ); + } + + { + // remove from the system catalog + BSONObj cond = BSON( "name" << nsToDrop ); // { name: "colltodropname" } + string system_namespaces = cc().database()->name + ".system.namespaces"; + /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true); + // no check of return code as this ns won't exist for some of the new storage engines + } + + // free extents + if( !d->firstExtent.isNull() ) { + freeExtents(d->firstExtent, d->lastExtent); + getDur().writingDiskLoc( d->firstExtent ).setInvalid(); + getDur().writingDiskLoc( d->lastExtent ).setInvalid(); + } + + // remove from the catalog hashtable + cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str()); + } + + void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) { + log(1) << "dropCollection: " << name << endl; + NamespaceDetails *d = nsdetails(name.c_str()); + if( d == 0 ) + return; + + BackgroundOperation::assertNoBgOpInProgForNs(name.c_str()); + + if ( d->nIndexes != 0 ) { + try { + assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) ); + } + catch( DBException& e ) { + stringstream ss; + ss << "drop: dropIndexes for collection failed - consider trying repair "; + ss << " cause: " << e.what(); + uasserted(12503,ss.str()); + } + assert( d->nIndexes == 0 ); + } + log(1) << "\t dropIndexes done" << endl; + result.append("ns", name.c_str()); + ClientCursor::invalidate(name.c_str()); + Top::global.collectionDropped( name ); + NamespaceDetailsTransient::eraseForPrefix( name.c_str() ); + dropNS(name); + } + + /* unindex all keys in index for this record. */ + static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) { + BSONObjSet keys; + id.getKeysFromObject(obj, keys); + IndexInterface& ii = id.idxInterface(); + for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { + BSONObj j = *i; + + bool ok = false; + try { + ok = ii.unindex(id.head, id, j, dl); + } + catch (AssertionException& e) { + problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl; + out() << "Assertion failure: _unindex failed: " << e.what() << '\n'; + out() << " obj:" << obj.toString() << '\n'; + out() << " key:" << j.toString() << '\n'; + out() << " dl:" << dl.toString() << endl; + sayDbContext(); + } + + if ( !ok && logMissing ) { + log() << "unindex failed (key too big?) 
" << id.indexNamespace() << " key: " << j << " " << obj["_id"] << endl; + } + } + } +//zzz + /* unindex all keys in all indexes for this record. */ + static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) { + BSONObj obj(todelete); + int n = d->nIndexes; + for ( int i = 0; i < n; i++ ) + _unindexRecord(d->idx(i), obj, dl, !noWarn); + if( d->indexBuildInProgress ) { // background index + // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it + _unindexRecord(d->idx(n), obj, dl, false); + } + } + + /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc. + caller must check if capped + */ + void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) { + /* remove ourself from the record next/prev chain */ + { + if ( todelete->prevOfs != DiskLoc::NullOfs ) + getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs; + if ( todelete->nextOfs != DiskLoc::NullOfs ) + getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs; + } + + /* remove ourself from extent pointers */ + { + Extent *e = getDur().writing( todelete->myExtent(dl) ); + if ( e->firstRecord == dl ) { + if ( todelete->nextOfs == DiskLoc::NullOfs ) + e->firstRecord.Null(); + else + e->firstRecord.set(dl.a(), todelete->nextOfs); + } + if ( e->lastRecord == dl ) { + if ( todelete->prevOfs == DiskLoc::NullOfs ) + e->lastRecord.Null(); + else + e->lastRecord.set(dl.a(), todelete->prevOfs); + } + } + + /* add to the free list */ + { + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize -= todelete->netLength(); + s->nrecords--; + } + + if ( strstr(ns, ".system.indexes") ) { + /* temp: if in system.indexes, don't reuse, and zero out: we want to be + careful until validated more, as IndexDetails has pointers + to this disk location. so an incorrectly done remove would cause + a lot of problems. + */ + memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders); + } + else { + DEV { + unsigned long long *p = (unsigned long long *) todelete->data; + *getDur().writing(p) = 0; + //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse. + } + d->addDeletedRec((DeletedRecord*)todelete, dl); + } + } + } + + void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) { + dassert( todelete == dl.rec() ); + + NamespaceDetails* d = nsdetails(ns); + if ( d->capped && !cappedOK ) { + out() << "failing remove on a capped ns " << ns << endl; + uassert( 10089 , "can't remove from a capped collection" , 0 ); + return; + } + + BSONObj toDelete; + if ( doLog ) { + BSONElement e = dl.obj()["_id"]; + if ( e.type() ) { + toDelete = e.wrap(); + } + } + + /* check if any cursors point to us. if so, advance them. */ + ClientCursor::aboutToDelete(dl); + + unindexRecord(d, todelete, dl, noWarn); + + _deleteRecord(d, ns, todelete, dl); + NamespaceDetailsTransient::get( ns ).notifyOfWriteOp(); + + if ( ! toDelete.isEmpty() ) { + logOp( "d" , ns , toDelete ); + } + } + + + /** Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record. 
+ */ + const DiskLoc DataFileMgr::updateRecord( + const char *ns, + NamespaceDetails *d, + NamespaceDetailsTransient *nsdt, + Record *toupdate, const DiskLoc& dl, + const char *_buf, int _len, OpDebug& debug, bool god) { + + dassert( toupdate == dl.rec() ); + + BSONObj objOld(toupdate); + BSONObj objNew(_buf); + DEV assert( objNew.objsize() == _len ); + DEV assert( objNew.objdata() == _buf ); + + if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) { + /* add back the old _id value if the update removes it. Note this implementation is slow + (copies entire object multiple times), but this shouldn't happen often, so going for simple + code, not speed. + */ + BSONObjBuilder b; + BSONElement e; + assert( objOld.getObjectID(e) ); + b.append(e); // put _id first, for best performance + b.appendElements(objNew); + objNew = b.obj(); + } + + /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further + below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks... + */ + vector<IndexChanges> changes; + bool changedId = false; + getIndexChanges(changes, *d, objNew, objOld, changedId); + uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId ); + dupCheck(changes, *d, dl); + + if ( toupdate->netLength() < objNew.objsize() ) { + // doesn't fit. reallocate ----------------------------------------------------- + uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped)); + d->paddingTooSmall(); + debug.moved = true; + deleteRecord(ns, toupdate, dl); + return insert(ns, objNew.objdata(), objNew.objsize(), god); + } + + nsdt->notifyOfWriteOp(); + d->paddingFits(); + + /* have any index keys changed? */ + { + int keyUpdates = 0; + int z = d->nIndexesBeingBuilt(); + for ( int x = 0; x < z; x++ ) { + IndexDetails& idx = d->idx(x); + IndexInterface& ii = idx.idxInterface(); + for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) { + try { + bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl); + if ( ! found ) { + RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i] + << " for doc: " << objOld["_id"] << endl; + } + } + catch (AssertionException&) { + debug.extra << " exception update unindex "; + problem() << " caught assertion update unindex " << idx.indexNamespace() << endl; + } + } + assert( !dl.isNull() ); + BSONObj idxKey = idx.info.obj().getObjectField("key"); + Ordering ordering = Ordering::make(idxKey); + keyUpdates += changes[x].added.size(); + for ( unsigned i = 0; i < changes[x].added.size(); i++ ) { + try { + /* we did the dupCheck() above. so we don't have to worry about it here. */ + ii.bt_insert( + idx.head, + dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx); + } + catch (AssertionException& e) { + debug.extra << " exception update index "; + problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << " " << objNew["_id"] << endl; + } + } + } + + debug.keyUpdates = keyUpdates; + } + + // update in place + int sz = objNew.objsize(); + memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz); + return dl; + } + + int Extent::followupSize(int len, int lastExtentLen) { + assert( len < Extent::maxSize() ); + int x = initialSize(len); + // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster + int y = (int) (lastExtentLen < 4000000 ? 
lastExtentLen * 4.0 : lastExtentLen * 1.35); + int sz = y > x ? y : x; + + if ( sz < lastExtentLen ) { + // this means there was an int overflow + // so we should turn it into maxSize + sz = Extent::maxSize(); + } + else if ( sz > Extent::maxSize() ) { + sz = Extent::maxSize(); + } + + sz = ((int)sz) & 0xffffff00; + assert( sz > len ); + + return sz; + } + + /* step one of adding keys to index idxNo for a new record + @return true means done. false means multikey involved and more work to do + */ + static void _addKeysToIndexStepOneOfTwo(BSONObjSet & /*out*/keys, NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, IndexDetails& idx) { + idx.getKeysFromObject(obj, keys); + if( keys.empty() ) + return; + bool dupsAllowed = !idx.unique(); + BSONObj order = idx.keyPattern(); + IndexInterface& ii = idx.idxInterface(); + Ordering ordering = Ordering::make(order); + + assert( !recordLoc.isNull() ); + + try { + // we can't do the two step method with multi keys as insertion of one key changes the indexes + // structure. however we can do the first key of the set so we go ahead and do that FWIW + ii.phasedQueueItemToInsert(idxNo, idx.head, recordLoc, *keys.begin(), ordering, idx, dupsAllowed); + } + catch (AssertionException& e) { + if( e.getCode() == 10287 && idxNo == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + } + else { + throw; + } + } + } + + namespace dur { + extern unsigned notesThisLock; + } + + void upgradeToWritable(bool shouldBeUnlocked) { + // todo upgrade! + DEV { + // verify we haven't written yet (usually) + + // test binary does special things so this would assert there so don't check there + if( shouldBeUnlocked && !cmdLine.binaryName.empty() && cmdLine.binaryName != "test" ) { + static unsigned long long zeroes; + static unsigned long long tot; + tot++; + if( dur::notesThisLock == 0 ) + zeroes++; + if( tot > 1000 ) { + static int n; + DEV if( n++ == 0 ) + log() << "warning upgradeToWritable: already in writable too often" << endl; + } + } + } + } + + /** add index keys for a newly inserted record + done in two steps/phases to defer write lock portion + */ + static void indexRecordUsingTwoSteps(NamespaceDetails *d, BSONObj obj, DiskLoc loc, bool shouldBeUnlocked) { + vector<int> multi; + vector<BSONObjSet> multiKeys; + + IndexInterface::phasedBegin(); + + int n = d->nIndexesBeingBuilt(); + { + BSONObjSet keys; + for ( int i = 0; i < n; i++ ) { + IndexDetails& idx = d->idx(i); + // this call throws on unique constraint violation. we haven't done any writes yet so that is fine. + _addKeysToIndexStepOneOfTwo(/*out*/keys, d, i, obj, loc, idx); + if( keys.size() > 1 ) { + multi.push_back(i); + multiKeys.push_back(BSONObjSet()); + multiKeys[multiKeys.size()-1].swap(keys); + } + keys.clear(); + } + } + + // update lock to writable here. 
TODO + + upgradeToWritable(shouldBeUnlocked); + + IndexInterface::phasedFinish(); // step 2 + + // now finish adding multikeys + for( unsigned j = 0; j < multi.size(); j++ ) { + unsigned i = multi[j]; + BSONObjSet& keys = multiKeys[j]; + IndexDetails& idx = d->idx(i); + IndexInterface& ii = idx.idxInterface(); + Ordering ordering = Ordering::make(idx.keyPattern()); + d->setIndexIsMultikey(i); + for( BSONObjSet::iterator k = ++keys.begin()/*skip 1*/; k != keys.end(); k++ ) { + try { + ii.bt_insert(idx.head, loc, *k, ordering, !idx.unique(), idx); + } catch (AssertionException& e) { + if( e.getCode() == 10287 && (int) i == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + } + else { + /* roll back previously added index entries + note must do self index as it is multikey and could require some cleanup itself + */ + for( int j = 0; j < n; j++ ) { + try { + _unindexRecord(d->idx(j), obj, loc, false); + } + catch(...) { + log(3) << "unindex fails on rollback after unique key constraint prevented insert\n"; + } + } + throw; + } + } + } + } + } + + /* add keys to index idxNo for a new record */ + static void addKeysToIndex(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) { + IndexDetails& idx = d->idx(idxNo); + BSONObjSet keys; + idx.getKeysFromObject(obj, keys); + if( keys.empty() ) + return; + BSONObj order = idx.keyPattern(); + IndexInterface& ii = idx.idxInterface(); + Ordering ordering = Ordering::make(order); + int n = 0; + for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) { + if( ++n == 2 ) { + d->setIndexIsMultikey(idxNo); + } + assert( !recordLoc.isNull() ); + try { + ii.bt_insert(idx.head, recordLoc, *i, ordering, dupsAllowed, idx); + } + catch (AssertionException& e) { + if( e.getCode() == 10287 && idxNo == d->nIndexes ) { + DEV log() << "info: caught key already in index on bg indexing (ok)" << endl; + continue; + } + if( !dupsAllowed ) { + // dup key exception, presumably. 
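                    // [editorial note] on a unique index, any other assertion here is
                    // presumed to be a duplicate-key violation and is rethrown to the caller: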
+                    throw;
+                }
+                problem() << " caught assertion addKeysToIndex " << idx.indexNamespace() << " " << obj["_id"] << endl;
+            }
+        }
+    }
+
+#if 0
+    void testSorting() {
+        BSONObjBuilder b;
+        b.appendNull("");
+        BSONObj x = b.obj();
+
+        BSONObjExternalSorter sorter(*IndexDetails::iis[1]);
+
+        sorter.add(x, DiskLoc(3,7));
+        sorter.add(x, DiskLoc(4,7));
+        sorter.add(x, DiskLoc(2,7));
+        sorter.add(x, DiskLoc(1,7));
+        sorter.add(x, DiskLoc(3,77));
+
+        sorter.sort();
+
+        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+        while( i->more() ) {
+            BSONObjExternalSorter::Data d = i->next();
+            /*cout << d.second.toString() << endl;
+            cout << d.first.objsize() << endl;
+            cout<<"SORTER next:" << d.first.toString() << endl;*/
+        }
+    }
+#endif
+
+    SortPhaseOne *precalced = 0;
+
+    template< class V >
+    void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter,
+                                  bool dropDups, list<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm,
+                                  Timer& t
+                                  )
+    {
+        BtreeBuilder<V> btBuilder(dupsAllowed, idx);
+        BSONObj keyLast;
+        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
+        assert( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
+        while( i->more() ) {
+            RARELY killCurrentOp.checkForInterrupt();
+            BSONObjExternalSorter::Data d = i->next();
+
+            try {
+                if ( !dupsAllowed && dropDups ) {
+                    LastError::Disabled led( lastError.get() );
+                    btBuilder.addKey(d.first, d.second);
+                }
+                else {
+                    btBuilder.addKey(d.first, d.second);
+                }
+            }
+            catch( AssertionException& e ) {
+                if ( dupsAllowed ) {
+                    // unknown exception?
+                    throw;
+                }
+
+                if( e.interrupted() ) {
+                    killCurrentOp.checkForInterrupt();
+                }
+
+                if ( ! dropDups )
+                    throw;
+
+                /* we could queue these on disk, but normally there are very few dups, so instead we
+                   keep in RAM and have a limit.
+                */
+                dupsToDrop.push_back(d.second);
+                uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
+            }
+            pm.hit();
+        }
+        pm.finished();
+        op->setMessage( "index: (3/3) btree-middle" );
+        log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
+        btBuilder.commit();
+        if ( btBuilder.getn() != phase1->nkeys && !
dropDups ) { + warning() << "not all entries were added to the index, probably some keys were too large" << endl; + } + } + + // throws DBException + unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + CurOp * op = cc().curop(); + + Timer t; + + tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl; + + bool dupsAllowed = !idx.unique(); + bool dropDups = idx.dropDups() || inDBRepair; + BSONObj order = idx.keyPattern(); + + getDur().writingDiskLoc(idx.head).Null(); + + if ( logLevel > 1 ) printMemInfo( "before index start" ); + + /* get and sort all the keys ----- */ + ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) ); + SortPhaseOne _ours; + SortPhaseOne *phase1 = precalced; + if( phase1 == 0 ) { + phase1 = &_ours; + SortPhaseOne& p1 = *phase1; + shared_ptr<Cursor> c = theDataFileMgr.findAll(ns); + p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) ); + p1.sorter->hintNumObjects( d->stats.nrecords ); + const IndexSpec& spec = idx.getSpec(); + while ( c->ok() ) { + BSONObj o = c->current(); + DiskLoc loc = c->currLoc(); + p1.addKeys(spec, o, loc); + c->advance(); + pm.hit(); + if ( logLevel > 1 && p1.n % 10000 == 0 ) { + printMemInfo( "\t iterating objects" ); + } + }; + } + pm.finished(); + + BSONObjExternalSorter& sorter = *(phase1->sorter); + + if( phase1->multi ) + d->setIndexIsMultikey(idxNo); + + if ( logLevel > 1 ) printMemInfo( "before final sort" ); + phase1->sorter->sort(); + if ( logLevel > 1 ) printMemInfo( "after final sort" ); + + log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl; + + list<DiskLoc> dupsToDrop; + + /* build index --- */ + if( idx.version() == 0 ) + buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t); + else if( idx.version() == 1 ) + buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t); + else + assert(false); + + log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl; + + for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){ + theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ ); + getDur().commitIfNeeded(); + } + + return phase1->n; + } + + class BackgroundIndexBuildJob : public BackgroundOperation { + + unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + bool dupsAllowed = !idx.unique(); + bool dropDups = idx.dropDups(); + + ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords ); + + unsigned long long n = 0; + auto_ptr<ClientCursor> cc; + { + shared_ptr<Cursor> c = theDataFileMgr.findAll(ns); + cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) ); + } + CursorId id = cc->cursorid(); + + while ( cc->ok() ) { + BSONObj js = cc->current(); + try { + { + if ( !dupsAllowed && dropDups ) { + LastError::Disabled led( lastError.get() ); + addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed); + } + else { + addKeysToIndex(d, idxNo, js, cc->currLoc(), dupsAllowed); + } + } + cc->advance(); + } + catch( AssertionException& e ) { + if( e.interrupted() ) { + killCurrentOp.checkForInterrupt(); + } + + if ( dropDups ) { + DiskLoc toDelete = cc->currLoc(); + bool ok = cc->advance(); + cc->updateLocation(); + 
theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true , true ); + if( ClientCursor::find(id, false) == 0 ) { + cc.release(); + if( !ok ) { + /* we were already at the end. normal. */ + } + else { + uasserted(12585, "cursor gone during bg index; dropDups"); + } + break; + } + } + else { + log() << "background addExistingToIndex exception " << e.what() << endl; + throw; + } + } + n++; + progress.hit(); + + getDur().commitIfNeeded(); + + if ( cc->yieldSometimes( ClientCursor::WillNeed ) ) { + progress.setTotalWhileRunning( d->stats.nrecords ); + } + else { + cc.release(); + uasserted(12584, "cursor gone during bg index"); + break; + } + } + progress.finished(); + return n; + } + + /* we do set a flag in the namespace for quick checking, but this is our authoritative info - + that way on a crash/restart, we don't think we are still building one. */ + set<NamespaceDetails*> bgJobsInProgress; + + void prep(const char *ns, NamespaceDetails *d) { + assertInWriteLock(); + uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , mongo::d.dbMutex.getState() == 1 ); + bgJobsInProgress.insert(d); + } + void done(const char *ns, NamespaceDetails *d) { + NamespaceDetailsTransient::get(ns).addedIndex(); // clear query optimizer cache + assertInWriteLock(); + } + + public: + BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { } + + unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) { + unsigned long long n = 0; + + prep(ns.c_str(), d); + assert( idxNo == d->nIndexes ); + try { + idx.head.writing() = idx.idxInterface().addBucket(idx); + n = addExistingToIndex(ns.c_str(), d, idx, idxNo); + } + catch(...) { + if( cc().database() && nsdetails(ns.c_str()) == d ) { + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); + } + else { + log() << "ERROR: db gone during bg index?" << endl; + } + throw; + } + assert( idxNo == d->nIndexes ); + done(ns.c_str(), d); + return n; + } + }; + + /** + * For the lifetime of this object, an index build is indicated on the specified + * namespace and the newest index is marked as absent. This simplifies + * the cleanup required on recovery. + */ + class RecoverableIndexState { + public: + RecoverableIndexState( NamespaceDetails *d ) : _d( d ) { + indexBuildInProgress() = 1; + nIndexes()--; + } + ~RecoverableIndexState() { + DESTRUCTOR_GUARD ( + nIndexes()++; + indexBuildInProgress() = 0; + ) + } + private: + int &nIndexes() { return getDur().writingInt( _d->nIndexes ); } + int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); } + NamespaceDetails *_d; + }; + + // throws DBException + static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) { + tlog() << "build index " << ns << ' ' << idx.keyPattern() << ( background ? " background" : "" ) << endl; + Timer t; + unsigned long long n; + + assert( !BackgroundOperation::inProgForNs(ns.c_str()) ); // should have been checked earlier, better not be... 
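        /* [editorial note] two build strategies follow: a foreground "fast" build
           (fastBuildIndex: scan the collection once, external-sort all keys, then
           bulk-load the btree bottom-up), or an interruptible background build
           (BackgroundIndexBuildJob: insert keys document-by-document while yielding). */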
+ assert( d->indexBuildInProgress == 0 ); + assertInWriteLock(); + RecoverableIndexState recoverable( d ); + + // Build index spec here in case the collection is empty and the index details are invalid + idx.getSpec(); + + if( inDBRepair || !background ) { + n = fastBuildIndex(ns.c_str(), d, idx, idxNo); + assert( !idx.head.isNull() ); + } + else { + BackgroundIndexBuildJob j(ns.c_str()); + n = j.go(ns, d, idx, idxNo); + } + tlog() << "build index done " << n << " records " << t.millis() / 1000.0 << " secs" << endl; + } + + /* add keys to indexes for a new record */ +#if 0 + static void oldIndexRecord__notused(NamespaceDetails *d, BSONObj obj, DiskLoc loc) { + int n = d->nIndexesBeingBuilt(); + for ( int i = 0; i < n; i++ ) { + try { + bool unique = d->idx(i).unique(); + addKeysToIndex(d, i, obj, loc, /*dupsAllowed*/!unique); + } + catch( DBException& ) { + /* try to roll back previously added index entries + note <= i (not < i) is important here as the index we were just attempted + may be multikey and require some cleanup. + */ + for( int j = 0; j <= i; j++ ) { + try { + _unindexRecord(d->idx(j), obj, loc, false); + } + catch(...) { + log(3) << "unindex fails on rollback after unique failure\n"; + } + } + throw; + } + } + } +#endif + + extern BSONObj id_obj; // { _id : 1 } + + void ensureHaveIdIndex(const char *ns) { + NamespaceDetails *d = nsdetails(ns); + if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) ) + return; + + *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex; + + { + NamespaceDetails::IndexIterator i = d->ii(); + while( i.more() ) { + if( i.next().isIdIndex() ) + return; + } + } + + string system_indexes = cc().database()->name + ".system.indexes"; + + BSONObjBuilder b; + b.append("name", "_id_"); + b.append("ns", ns); + b.append("key", id_obj); + BSONObj o = b.done(); + + /* edge case: note the insert could fail if we have hit maxindexes already */ + theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true); + } + +#pragma pack(1) + struct IDToInsert_ { + char type; + char _id[4]; + OID oid; + IDToInsert_() { + type = (char) jstOID; + strcpy(_id, "_id"); + assert( sizeof(IDToInsert_) == 17 ); + } + } idToInsert_; + struct IDToInsert : public BSONElement { + IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {} + } idToInsert; +#pragma pack() + + void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) { + BSONObj tmp = o; + insertWithObjMod( ns, tmp, god ); + logOp( "i", ns, tmp ); + } + + /** @param o the object to insert. can be modified to add _id and thus be an in/out param + */ + DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) { + bool addedID = false; + DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID ); + if( addedID && !loc.isNull() ) + o = BSONObj( loc.rec() ); + return loc; + } + + bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject ); + + // We are now doing two btree scans for all unique indexes (one here, and one when we've + // written the record to the collection. This could be made more efficient inserting + // dummy data here, keeping pointers to the btree nodes holding the dummy data and then + // updating the dummy data with the DiskLoc of the real record. 
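    /* [editorial note] the pre-check matters for capped collections because a capped
       insert may delete old records to make room and so cannot simply be rolled back
       the way an ordinary insert can -- see the AssertionException handling in
       DataFileMgr::insert() below. */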
void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
+        for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
+            if( d->idx(idxNo).unique() ) {
+                IndexDetails& idx = d->idx(idxNo);
+                BSONObjSet keys;
+                idx.getKeysFromObject(obj, keys);
+                BSONObj order = idx.keyPattern();
+                IndexInterface& ii = idx.idxInterface();
+                for ( BSONObjSet::iterator i=keys.begin(); i != keys.end(); i++ ) {
+                    // WARNING: findSingle may not be compound index safe. this may need to change. see notes in
+                    // findSingle code.
+                    uassert( 12582, "duplicate key insert for unique index of capped collection",
+                             ii.findSingle(idx, idx.head, *i ).isNull() );
+                }
+            }
+        }
+    }
+
+    /** add a record to the end of the linked list chain within this extent.
+        require: you must have already declared write intent for the record header.
+    */
+    void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
+        dassert( loc.rec() == r );
+        Extent *e = r->myExtent(loc);
+        if ( e->lastRecord.isNull() ) {
+            Extent::FL *fl = getDur().writing(e->fl());
+            fl->firstRecord = fl->lastRecord = loc;
+            r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
+        }
+        else {
+            Record *oldlast = e->lastRecord.rec();
+            r->prevOfs = e->lastRecord.getOfs();
+            r->nextOfs = DiskLoc::NullOfs;
+            getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
+            getDur().writingDiskLoc(e->lastRecord) = loc;
+        }
+    }
+
+    NOINLINE_DECL DiskLoc outOfSpace(const char *ns, NamespaceDetails *d, int lenWHdr, bool god, DiskLoc extentLoc) {
+        DiskLoc loc;
+        if ( d->capped == 0 ) { // capped collections are fixed size and don't grow, so only grow uncapped ones
+            log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
+            cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+            loc = d->alloc(ns, lenWHdr, extentLoc);
+            if ( loc.isNull() ) {
+                log() << "warning: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
+                for ( int z=0; z<10 && lenWHdr > d->lastExtentSize; z++ ) {
+                    log() << "try #" << z << endl;
+                    cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false, !god);
+                    loc = d->alloc(ns, lenWHdr, extentLoc);
+                    if ( ! loc.isNull() )
+                        break;
+                }
+            }
+        }
+        return loc;
+    }
+
+    /** used by insert and also compact
+      * @return null loc if out of space
+      */
+    DiskLoc allocateSpaceForANewRecord(const char *ns, NamespaceDetails *d, int lenWHdr, bool god) {
+        DiskLoc extentLoc;
+        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
+        if ( loc.isNull() ) {
+            loc = outOfSpace(ns, d, lenWHdr, god, extentLoc);
+        }
+        return loc;
+    }
+
+    bool NOINLINE_DECL insert_checkSys(const char *sys, const char *ns, bool& wouldAddIndex, const void *obuf, bool god) {
+        uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
+        if ( strstr(ns, ".system.") ) {
+            // later: check for dba-type permissions here, if we have those at some point, separately
+            if ( strstr(ns, ".system.indexes" ) )
+                wouldAddIndex = true;
+            else if ( legalClientSystemNS( ns , true ) ) {
+                if ( obuf && strstr( ns , ".system.users" ) ) {
+                    BSONObj t( reinterpret_cast<const char *>( obuf ) );
+                    uassert( 14051 , "system.user entry needs 'user' field to be a string" , t["user"].type() == String );
+                    uassert( 14052 , "system.user entry needs 'pwd' field to be a string" , t["pwd"].type() == String );
+                    uassert( 14053 , "system.user entry needs 'user' field to be non-empty" , t["user"].String().size() );
+                    uassert( 14054 , "system.user entry needs 'pwd' field to be non-empty" , t["pwd"].String().size() );
+                }
+            }
+            else if ( !god ) {
+                // todo: this should probably uassert rather than doing this:
+                log() << "ERROR: attempt to insert in system namespace " << ns << endl;
+                return false;
+            }
+        }
+        return true;
+    }
+
+    NOINLINE_DECL NamespaceDetails* insert_newNamespace(const char *ns, int len, bool god) {
+        addNewNamespaceToCatalog(ns);
+        /* todo: shouldn't be in the namespace catalog until after the allocations here work.
+                 also if this is an addIndex, those checks should happen before this!
+        */
+        // This may create the first file in the database.
+        int ies = Extent::initialSize(len);
+        if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
+            // probably an index. so we pick a value here for the first extent instead of using initialExtentSize() which is more
+            // for user collections. TODO: we could look at the # of records in the parent collection to be smarter here.
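            // [editorial note -- an assumption, not stated in the source] (32+4)*1024 = 36864
            // bytes appears sized to hold four 8KB btree bucket records (32KB) with ~4KB left
            // for the extent header and rounding slack.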
+ ies = (32+4) * 1024; + } + cc().database()->allocExtent(ns, ies, false, false); + NamespaceDetails *d = nsdetails(ns); + if ( !god ) + ensureIdIndexForNewNs(ns); + return d; + } + + void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) { + uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos ); + + BSONObj info = loc.obj(); + bool background = info["background"].trueValue(); + // if this is not readable, let's move things along + if (background && ((!theReplSet && cc().isSyncThread()) || (theReplSet && !theReplSet->isSecondary()))) { + log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl; + background = false; + } + + int idxNo = tableToIndex->nIndexes; + IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes + getDur().writingDiskLoc(idx.info) = loc; + try { + buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background); + } + catch( DBException& e ) { + // save our error msg string as an exception or dropIndexes will overwrite our message + LastError *le = lastError.get(); + int savecode = 0; + string saveerrmsg; + if ( le ) { + savecode = le->code; + saveerrmsg = le->msg; + } + else { + savecode = e.getCode(); + saveerrmsg = e.what(); + } + + // roll back this index + string name = idx.indexName(); + BSONObjBuilder b; + string errmsg; + bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true); + if( !ok ) { + log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl; + } + + assert( le && !saveerrmsg.empty() ); + raiseError(savecode,saveerrmsg.c_str()); + throw; + } + } + + /* if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc + after the call -- that will prevent a double buffer copy in some cases (btree.cpp). + + @param mayAddIndex almost always true, except for invocation from rename namespace command. + @param addedID if not null, set to true if adding _id element. you must assure false before calling + if using. + */ + + DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) { + bool wouldAddIndex = false; + massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) ); + uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); + { + const char *sys = strstr(ns, "system."); + if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) ) + return DiskLoc(); + } + bool addIndex = wouldAddIndex && mayAddIndex; + + NamespaceDetails *d = nsdetails(ns); + if ( d == 0 ) { + d = insert_newNamespace(ns, len, god); + } + + NamespaceDetails *tableToIndex = 0; + + string tabletoidxns; + BSONObj fixedIndexObject; + if ( addIndex ) { + assert( obuf ); + BSONObj io((const char *) obuf); + if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) { + // prepare creates _id itself, or this indicates to fail the build silently (such + // as if index already exists) + return DiskLoc(); + } + if ( ! fixedIndexObject.isEmpty() ) { + obuf = fixedIndexObject.objdata(); + len = fixedIndexObject.objsize(); + } + } + + int addID = 0; // 0 if not adding _id; if adding, the length of that new element + if( !god ) { + /* Check if we have an _id field. 
If we don't, we'll add it. + Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. + */ + BSONObj io((const char *) obuf); + BSONElement idField = io.getField( "_id" ); + uassert( 10099 , "_id cannot be an array", idField.type() != Array ); + // we don't add _id for capped collections as they don't have an _id index + if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 && d->haveIdIndex() ) { + if( addedID ) + *addedID = true; + addID = len; + idToInsert_.oid.init(); + len += idToInsert.size(); + } + + BSONElementManipulator::lookForTimestamps( io ); + } + + int lenWHdr = len + Record::HeaderSize; + lenWHdr = (int) (lenWHdr * d->paddingFactor); + if ( lenWHdr == 0 ) { + // old datafiles, backward compatible here. + assert( d->paddingFactor == 0 ); + *getDur().writing(&d->paddingFactor) = 1.0; + lenWHdr = len + Record::HeaderSize; + } + + // If the collection is capped, check if the new object will violate a unique index + // constraint before allocating space. + if ( d->nIndexes && d->capped && !god ) { + checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) ); + } + + bool earlyIndex = true; + DiskLoc loc; + if( addID || tableToIndex || d->capped ) { + // if need id, we don't do the early indexing. this is not the common case so that is sort of ok + earlyIndex = false; + loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); + } + else { + loc = d->allocWillBeAt(ns, lenWHdr); + if( loc.isNull() ) { + // need to get a new extent so we have to do the true alloc now (not common case) + earlyIndex = false; + loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); + } + } + if ( loc.isNull() ) { + log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl; + assert(d->capped); + return DiskLoc(); + } + + if( earlyIndex ) { + // add record to indexes using two step method so we can do the reading outside a write lock + if ( d->nIndexes ) { + assert( obuf ); + BSONObj obj((const char *) obuf); + try { + indexRecordUsingTwoSteps(d, obj, loc, true); + } + catch( AssertionException& ) { + // should be a dup key error on _id index + dassert( !tableToIndex && !d->capped ); + // no need to delete/rollback the record as it was not added yet + throw; + } + } + // really allocate now + DiskLoc real = allocateSpaceForANewRecord(ns, d, lenWHdr, god); + assert( real == loc ); + } + + Record *r = loc.rec(); + { + assert( r->lengthWithHeaders >= lenWHdr ); + r = (Record*) getDur().writingPtr(r, lenWHdr); + if( addID ) { + /* a little effort was made here to avoid a double copy when we add an ID */ + ((int&)*r->data) = *((int*) obuf) + idToInsert.size(); + memcpy(r->data+4, idToInsert.rawdata(), idToInsert.size()); + memcpy(r->data+4+idToInsert.size(), ((char *)obuf)+4, addID-4); + } + else { + if( obuf ) // obuf can be null from internal callers + memcpy(r->data, obuf, len); + } + } + + addRecordToRecListInExtent(r, loc); + + /* durability todo : this could be a bit annoying / slow to record constantly */ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize += r->netLength(); + s->nrecords++; + } + + // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket + if ( !god ) + NamespaceDetailsTransient::get( ns ).notifyOfWriteOp(); + + if ( tableToIndex ) { + insert_makeIndex(tableToIndex, tabletoidxns, loc); + } + + /* add this record to our indexes */ + if ( !earlyIndex && d->nIndexes ) { + try { + BSONObj obj(r->data); + // 
not sure which of these is better -- either can be used. oldIndexRecord may be faster, + // but twosteps handles dup key errors more efficiently. + //oldIndexRecord(d, obj, loc); + indexRecordUsingTwoSteps(d, obj, loc, false); + + } + catch( AssertionException& e ) { + // should be a dup key error on _id index + if( tableToIndex || d->capped ) { + massert( 12583, "unexpected index insertion failure on capped collection", !d->capped ); + string s = e.toString(); + s += " : on addIndex/capped - collection and its index will not match"; + uassert_nothrow(s.c_str()); + error() << s << endl; + } + else { + // normal case -- we can roll back + _deleteRecord(d, ns, r, loc); + throw; + } + } + } + + d->paddingFits(); + + return loc; + } + + /* special version of insert for transaction logging -- streamlined a bit. + assumes ns is capped and no indexes + */ + Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) { + assert( d ); + RARELY assert( d == nsdetails(ns) ); + DEV assert( d == nsdetails(ns) ); + + DiskLoc extentLoc; + int lenWHdr = len + Record::HeaderSize; + DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc); + assert( !loc.isNull() ); + + Record *r = loc.rec(); + assert( r->lengthWithHeaders >= lenWHdr ); + + Extent *e = r->myExtent(loc); + if ( e->lastRecord.isNull() ) { + Extent::FL *fl = getDur().writing( e->fl() ); + fl->firstRecord = fl->lastRecord = loc; + + Record::NP *np = getDur().writing(r->np()); + np->nextOfs = np->prevOfs = DiskLoc::NullOfs; + } + else { + Record *oldlast = e->lastRecord.rec(); + Record::NP *np = getDur().writing(r->np()); + np->prevOfs = e->lastRecord.getOfs(); + np->nextOfs = DiskLoc::NullOfs; + getDur().writingInt( oldlast->nextOfs ) = loc.getOfs(); + e->lastRecord.writing() = loc; + } + + /* todo: don't update for oplog? seems wasteful. */ + { + NamespaceDetails::Stats *s = getDur().writing(&d->stats); + s->datasize += r->netLength(); + s->nrecords++; + } + + return r; + } + +} // namespace mongo + +#include "clientcursor.h" + +namespace mongo { + + void dropAllDatabasesExceptLocal() { + writelock lk(""); + + vector<string> n; + getDatabaseNames(n); + if( n.size() == 0 ) return; + log() << "dropAllDatabasesExceptLocal " << n.size() << endl; + for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) { + if( *i != "local" ) { + Client::Context ctx(*i); + dropDatabase(*i); + } + } + } + + void dropDatabase(string db) { + log(1) << "dropDatabase " << db << endl; + Database *d = cc().database(); + assert( d ); + assert( d->name == db ); + + BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str()); + + mongo::d.dbMutex.assertWriteLocked(); + + // Not sure we need this here, so removed. If we do, we need to move it down + // within other calls both (1) as they could be called from elsewhere and + // (2) to keep the lock order right - groupcommitmutex must be locked before + // mmmutex (if both are locked). 
+ // + // RWLockRecursive::Exclusive lk(MongoFile::mmmutex); + + getDur().syncDataAndTruncateJournal(); + + Database::closeDatabase( d->name.c_str(), d->path ); + d = 0; // d is now deleted + + _deleteDataFiles( db.c_str() ); + } + + typedef boost::filesystem::path Path; + + void boostRenameWrapper( const Path &from, const Path &to ) { + try { + boost::filesystem::rename( from, to ); + } + catch ( const boost::filesystem::filesystem_error & ) { + // boost rename doesn't work across partitions + boost::filesystem::copy_file( from, to); + boost::filesystem::remove( from ); + } + } + + // back up original database files to 'temp' dir + void _renameForBackup( const char *database, const Path &reservedPath ) { + Path newPath( reservedPath ); + if ( directoryperdb ) + newPath /= database; + class Renamer : public FileOp { + public: + Renamer( const Path &newPath ) : newPath_( newPath ) {} + private: + const boost::filesystem::path &newPath_; + virtual bool apply( const Path &p ) { + if ( !boost::filesystem::exists( p ) ) + return false; + boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) ); + return true; + } + virtual const char * op() const { + return "renaming"; + } + } renamer( newPath ); + _applyOpToDataFiles( database, renamer, true ); + } + + // move temp files to standard data dir + void _replaceWithRecovered( const char *database, const char *reservedPathString ) { + Path newPath( dbpath ); + if ( directoryperdb ) + newPath /= database; + class Replacer : public FileOp { + public: + Replacer( const Path &newPath ) : newPath_( newPath ) {} + private: + const boost::filesystem::path &newPath_; + virtual bool apply( const Path &p ) { + if ( !boost::filesystem::exists( p ) ) + return false; + boostRenameWrapper( p, newPath_ / p.leaf() ); + return true; + } + virtual const char * op() const { + return "renaming"; + } + } replacer( newPath ); + _applyOpToDataFiles( database, replacer, true, reservedPathString ); + } + + // generate a directory name for storing temp data files + Path uniqueReservedPath( const char *prefix ) { + Path repairPath = Path( repairpath ); + Path reservedPath; + int i = 0; + bool exists = false; + do { + stringstream ss; + ss << prefix << "_repairDatabase_" << i++; + reservedPath = repairPath / ss.str(); + BOOST_CHECK_EXCEPTION( exists = boost::filesystem::exists( reservedPath ) ); + } + while ( exists ); + return reservedPath; + } + + boost::intmax_t dbSize( const char *database ) { + class SizeAccumulator : public FileOp { + public: + SizeAccumulator() : totalSize_( 0 ) {} + boost::intmax_t size() const { + return totalSize_; + } + private: + virtual bool apply( const boost::filesystem::path &p ) { + if ( !boost::filesystem::exists( p ) ) + return false; + totalSize_ += boost::filesystem::file_size( p ); + return true; + } + virtual const char *op() const { + return "checking size"; + } + boost::intmax_t totalSize_; + }; + SizeAccumulator sa; + _applyOpToDataFiles( database, sa ); + return sa.size(); + } + + bool repairDatabase( string dbNameS , string &errmsg, + bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) { + doingRepair dr; + dbNameS = nsToDatabase( dbNameS ); + const char * dbName = dbNameS.c_str(); + + stringstream ss; + ss << "localhost:" << cmdLine.port; + string localhost = ss.str(); + + problem() << "repairDatabase " << dbName << endl; + assert( cc().database()->name == dbName ); + assert( cc().database()->path == dbpath ); + + BackgroundOperation::assertNoBgOpInProgForDb(dbName); + + getDur().syncDataAndTruncateJournal(); // Must be 
done before and after repair + + boost::intmax_t totalSize = dbSize( dbName ); + boost::intmax_t freeSize = File::freeSpace(repairpath); + if ( freeSize > -1 && freeSize < totalSize ) { + stringstream ss; + ss << "Cannot repair database " << dbName << " having size: " << totalSize + << " (bytes) because free disk space is: " << freeSize << " (bytes)"; + errmsg = ss.str(); + problem() << errmsg << endl; + return false; + } + + Path reservedPath = + uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ? + "backup" : "_tmp" ); + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) ); + string reservedPathString = reservedPath.native_directory_string(); + + bool res; + { + // clone to temp location, which effectively does repair + Client::Context ctx( dbName, reservedPathString ); + assert( ctx.justCreated() ); + + res = cloneFrom(localhost.c_str(), errmsg, dbName, + /*logForReplication=*/false, /*slaveOk*/false, /*replauth*/false, + /*snapshot*/false, /*mayYield*/false, /*mayBeInterrupted*/true); + Database::closeDatabase( dbName, reservedPathString.c_str() ); + } + + if ( !res ) { + errmsg = str::stream() << "clone failed for " << dbName << " with error: " << errmsg; + problem() << errmsg << endl; + + if ( !preserveClonedFilesOnFailure ) + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); + + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + + return false; + } + + MongoFile::flushAll(true); + + Client::Context ctx( dbName ); + Database::closeDatabase( dbName, dbpath ); + + if ( backupOriginalFiles ) { + _renameForBackup( dbName, reservedPath ); + } + else { + _deleteDataFiles( dbName ); + BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) ); + } + + _replaceWithRecovered( dbName, reservedPathString.c_str() ); + + if ( !backupOriginalFiles ) + BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) ); + + getDur().syncDataAndTruncateJournal(); // Must be done before and after repair + + return true; + } + + void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) { + if ( afterAllocator ) + FileAllocator::get()->waitUntilFinished(); + string c = database; + c += '.'; + boost::filesystem::path p(path); + if ( directoryperdb ) + p /= database; + boost::filesystem::path q; + q = p / (c+"ns"); + bool ok = false; + BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) ); + if ( ok ) + log(2) << fo.op() << " file " << q.string() << endl; + int i = 0; + int extra = 10; // should not be necessary, this is defensive in case there are missing files + while ( 1 ) { + assert( i <= DiskLoc::MaxFiles ); + stringstream ss; + ss << c << i; + q = p / ss.str(); + BOOST_CHECK_EXCEPTION( ok = fo.apply(q) ); + if ( ok ) { + if ( extra != 10 ) { + log(1) << fo.op() << " file " << q.string() << endl; + log() << " _applyOpToDataFiles() warning: extra == " << extra << endl; + } + } + else if ( --extra <= 0 ) + break; + i++; + } + } + + NamespaceDetails* nsdetails_notinline(const char *ns) { return nsdetails(ns); } + + bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) { + log() << "DatabaseHolder::closeAll path:" << path << endl; + d.dbMutex.assertWriteLocked(); + + map<string,Database*>& m = _paths[path]; + _size -= m.size(); + + set< string > dbs; + for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) { + wassert( i->second->path == path ); + dbs.insert( i->first ); + } + + 
currentClient.get()->getContext()->_clear(); + + BSONObjBuilder bb( result.subarrayStart( "dbs" ) ); + int n = 0; + int nNotClosed = 0; + for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) { + string name = *i; + log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl; + Client::Context ctx( name , path ); + if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) { + log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl; + nNotClosed++; + } + else { + Database::closeDatabase( name.c_str() , path ); + bb.append( bb.numStr( n++ ) , name ); + } + } + bb.done(); + if( nNotClosed ) + result.append("nNotClosed", nNotClosed); + else { + ClientCursor::assertNoCursors(); + } + + return true; + } + +} // namespace mongo diff --git a/src/mongo/db/pdfile.h b/src/mongo/db/pdfile.h new file mode 100644 index 00000000000..cd6062b1a48 --- /dev/null +++ b/src/mongo/db/pdfile.h @@ -0,0 +1,546 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* pdfile.h + + Files: + database.ns - namespace index + database.1 - data files + database.2 + ... +*/ + +#pragma once + +#include "../pch.h" +#include "../util/mmap.h" +#include "diskloc.h" +#include "jsobjmanipulator.h" +#include "namespace-inl.h" +#include "client.h" +#include "mongommf.h" + +namespace mongo { + + class DataFileHeader; + class Extent; + class Record; + class Cursor; + class OpDebug; + + void dropDatabase(string db); + bool repairDatabase(string db, string &errmsg, bool preserveClonedFilesOnFailure = false, bool backupOriginalFiles = false); + + /* low level - only drops this ns */ + void dropNS(const string& dropNs); + + /* deletes this ns, indexes and cursors */ + void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ); + bool userCreateNS(const char *ns, BSONObj j, string& err, bool logForReplication, bool *deferIdIndex = 0); + shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc=DiskLoc()); + + bool isValidNS( const StringData& ns ); + + /*---------------------------------------------------------------------*/ + + class MongoDataFile { + friend class DataFileMgr; + friend class BasicCursor; + public: + MongoDataFile(int fn) : _mb(0), fileNo(fn) { } + + /** @return true if found and opened. if uninitialized (prealloc only) does not open. */ + bool openExisting( const char *filename ); + + /** creates if DNE */ + void open(const char *filename, int requestedDataSize = 0, bool preallocateOnly = false); + + /* allocate a new extent from this datafile. 
+ @param capped - true if capped collection
+ @param loops is our recursion check variable - you want to pass in zero
+ */
+ Extent* createExtent(const char *ns, int approxSize, bool capped = false, int loops = 0);
+
+ DataFileHeader *getHeader() { return header(); }
+
+ unsigned long long length() const { return mmf.length(); }
+
+ /* return max size an extent may be */
+ static int maxSize();
+
+ /** fsync */
+ void flush( bool sync );
+
+ /** only use for debugging */
+ Extent* debug_getExtent(DiskLoc loc) { return _getExtent( loc ); }
+ private:
+ void badOfs(int) const;
+ void badOfs2(int) const;
+ int defaultSize( const char *filename ) const;
+
+ Extent* getExtent(DiskLoc loc) const;
+ Extent* _getExtent(DiskLoc loc) const;
+ Record* recordAt(DiskLoc dl);
+ Record* makeRecord(DiskLoc dl, int size);
+ void grow(DiskLoc dl, int size);
+
+ char* p() const { return (char *) _mb; }
+ DataFileHeader* header() { return (DataFileHeader*) _mb; }
+
+ MongoMMF mmf;
+ void *_mb; // the memory mapped view
+ int fileNo;
+ };
+
+ class DataFileMgr {
+ friend class BasicCursor;
+ public:
+ void init(const string& path );
+
+ /* see if we can find an extent of the right size in the freelist. */
+ static Extent* allocFromFreeList(const char *ns, int approxSize, bool capped = false);
+
+ /** @return DiskLoc where item ends up */
+ // changedId should be initialized to false
+ const DiskLoc updateRecord(
+ const char *ns,
+ NamespaceDetails *d,
+ NamespaceDetailsTransient *nsdt,
+ Record *toupdate, const DiskLoc& dl,
+ const char *buf, int len, OpDebug& debug, bool god=false);
+
+ // The object o may be updated if modified on insert.
+ void insertAndLog( const char *ns, const BSONObj &o, bool god = false );
+
+ /** insert will add an _id to the object if not present. if you would like to see the final object
+ after such an addition, use this method.
+ @param o both an in and out param
+ */
+ DiskLoc insertWithObjMod(const char *ns, BSONObj & /*out*/o, bool god = false);
+
+ /** @param obj in value only for this version. */
+ void insertNoReturnVal(const char *ns, BSONObj o, bool god = false);
+
+ DiskLoc insert(const char *ns, const void *buf, int len, bool god = false, bool mayAddIndex = true, bool *addedID = 0);
+ static shared_ptr<Cursor> findAll(const char *ns, const DiskLoc &startLoc = DiskLoc());
+
+ /* special version of insert for transaction logging -- streamlined a bit.
+ assumes ns is capped and no indexes
+ no _id field check
+ */
+ Record* fast_oplog_insert(NamespaceDetails *d, const char *ns, int len);
+
+ static Extent* getExtent(const DiskLoc& dl);
+ static Record* getRecord(const DiskLoc& dl);
+ static DeletedRecord* makeDeletedRecord(const DiskLoc& dl, int len);
+
+ void deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK = false, bool noWarn = false, bool logOp=false);
+
+ /* does not clean up indexes, etc. : just deletes the record in the pdfile. use deleteRecord() to unindex */
+ void _deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl);
+
+ private:
+ vector<MongoDataFile *> files;
+ };
+
+ extern DataFileMgr theDataFileMgr;
+
+#pragma pack(1)
+
+ class DeletedRecord {
+ public:
+ int lengthWithHeaders;
+ int extentOfs;
+ DiskLoc nextDeleted;
+ DiskLoc myExtentLoc(const DiskLoc& myLoc) const {
+ return DiskLoc(myLoc.a(), extentOfs);
+ }
+ Extent* myExtent(const DiskLoc& myLoc) {
+ return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs));
+ }
+ };
+
+ /* Record is a record in a datafile. DeletedRecord is similar but for deleted space.
+
+ (11:03:20 AM) dm10gen: regarding extentOfs...
+ (11:03:42 AM) dm10gen: an extent is a contiguous disk area, which contains many Records and DeletedRecords
+ (11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. (64 bit total)
+ (11:04:16 AM) dm10gen: to keep the headers small, instead of storing a 64 bit ptr to the full extent address, we keep just the offset
+ (11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo
+ (11:04:33 AM) dm10gen: see class DiskLoc for more info
+ (11:04:43 AM) dm10gen: so that is how Record::myExtent() works
+ (11:04:53 AM) dm10gen: on an alloc(), when we build a new Record, we must populate its extentOfs then
+ */
+ class Record {
+ public:
+ enum HeaderSizeValue { HeaderSize = 16 };
+ int lengthWithHeaders;
+ int extentOfs;
+ int nextOfs;
+ int prevOfs;
+
+ /** be careful when referencing this that your write intent was correct */
+ char data[4];
+
+ int netLength() {
+ return lengthWithHeaders - HeaderSize;
+ }
+ //void setNewLength(int netlen) { lengthWithHeaders = netlen + HeaderSize; }
+
+ /* use this when a record is deleted. basically a union with next/prev fields */
+ DeletedRecord& asDeleted() { return *((DeletedRecord*) this); }
+
+ Extent* myExtent(const DiskLoc& myLoc) { return DataFileMgr::getExtent(DiskLoc(myLoc.a(), extentOfs)); }
+
+ /* get the next record in the namespace, traversing extents as necessary */
+ DiskLoc getNext(const DiskLoc& myLoc);
+ DiskLoc getPrev(const DiskLoc& myLoc);
+
+ DiskLoc nextInExtent(const DiskLoc& myLoc) {
+ if ( nextOfs == DiskLoc::NullOfs )
+ return DiskLoc();
+ assert( nextOfs );
+ return DiskLoc(myLoc.a(), nextOfs);
+ }
+
+ struct NP {
+ int nextOfs;
+ int prevOfs;
+ };
+ NP* np() { return (NP*) &nextOfs; }
+
+ // ---------------------
+ // memory cache
+ // ---------------------
+
+ /**
+ * touches the data so that it is in physical memory
+ * @param entireRecrd if false, only the header and first byte is touched
+ * if true, the entire record is touched
+ * */
+ void touch( bool entireRecrd = false );
+
+ /**
+ * @return if this record is likely in physical memory
+ * it's not guaranteed because it's possible it gets swapped out in a very unlucky window
+ */
+ bool likelyInPhysicalMemory();
+
+ /**
+ * tell the cache this Record was accessed
+ * @return this, for simple chaining
+ */
+ Record* accessed();
+
+ static bool MemoryTrackingEnabled;
+ };
+
+ /* extents are datafile regions where all the records within the region
+ belong to the same namespace.
+
+ (11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big DeletedRecord
+ (11:12:55 AM) dm10gen: and that is placed on the free list
+ */
+ class Extent {
+ public:
+ unsigned magic;
+ DiskLoc myLoc;
+ DiskLoc xnext, xprev; /* next/prev extent for this namespace */
+
+ /* which namespace this extent is for. this is just for troubleshooting really
+ and won't even be correct if the collection were renamed!
+ */
+ Namespace nsDiagnostic;
+
+ int length; /* size of the extent, including these fields */
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ char _extentData[4];
+
+ static int HeaderSize() { return sizeof(Extent)-4; }
+
+ bool validates() {
+ return !(firstRecord.isNull() ^ lastRecord.isNull()) &&
+ length >= 0 && !myLoc.isNull();
+ }
+
+ BSONObj dump() {
+ return BSON( "loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" << xprev.toString()
+ << "nsdiag" << nsDiagnostic.toString()
+ << "size" << length << "firstRecord" << firstRecord.toString() << "lastRecord" << lastRecord.toString());
+ }
+
+ void dump(iostream& s) {
+ s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() << " xprev:" << xprev.toString() << '\n';
+ s << " nsdiag:" << nsDiagnostic.toString() << '\n';
+ s << " size:" << length << " firstRecord:" << firstRecord.toString() << " lastRecord:" << lastRecord.toString() << '\n';
+ }
+
+ /* assumes already zeroed -- insufficient for block 'reuse' perhaps
+ Returns a DeletedRecord location which is the data in the extent ready for us.
+ Caller will need to add that to the freelist structure in namespacedetail.
+ */
+ DiskLoc init(const char *nsname, int _length, int _fileNo, int _offset, bool capped);
+
+ /* like init(), but for a reuse case */
+ DiskLoc reuse(const char *nsname, bool newUseIsAsCapped);
+
+ bool isOk() const { return magic == 0x41424344; }
+ void assertOk() const { assert(isOk()); }
+
+ Record* newRecord(int len);
+
+ Record* getRecord(DiskLoc dl) {
+ assert( !dl.isNull() );
+ assert( dl.sameFile(myLoc) );
+ int x = dl.getOfs() - myLoc.getOfs();
+ assert( x > 0 );
+ return (Record *) (((char *) this) + x);
+ }
+
+ Extent* getNextExtent() { return xnext.isNull() ? 0 : DataFileMgr::getExtent(xnext); }
+ Extent* getPrevExtent() { return xprev.isNull() ? 0 : DataFileMgr::getExtent(xprev); }
+
+ static int maxSize();
+ static int minSize() { return 0x100; }
+ /**
+ * @param len length of record we need
+ * @param lastExtentLen size of the last extent, which is a factor in the next extent size
+ */
+ static int followupSize(int len, int lastExtentLen);
+
+ /** get a suggested size for the first extent in a namespace
+ * @param len length of record we need to insert
+ */
+ static int initialSize(int len);
+
+ struct FL {
+ DiskLoc firstRecord;
+ DiskLoc lastRecord;
+ };
+ /** often we want to update just the firstRecord and lastRecord fields.
+ this helper is for that -- for use with getDur().writing() method
+ */
+ FL* fl() { return (FL*) &firstRecord; }
+
+ /** caller must declare write intent first */
+ void markEmpty();
+ private:
+ DiskLoc _reuse(const char *nsname, bool newUseIsAsCapped); // recycle an extent and reuse it for a different ns
+ };
+
+ /* a datafile - i.e. the "dbname.<#>" files :
+
+ ----------------------
+ DataFileHeader
+ ----------------------
+ Extent (for a particular namespace)
+ Record
+ ...
+ Record (some chained for unused space)
+ ----------------------
+ more Extents...
+ ----------------------
+ */
+ class DataFileHeader {
+ public:
+ int version;
+ int versionMinor;
+ int fileLength;
+ DiskLoc unused; /* unused is the portion of the file that doesn't belong to any allocated extents.
-1 = no more */ + int unusedLength; + char reserved[8192 - 4*4 - 8]; + + char data[4]; // first extent starts here + + enum { HeaderSize = 8192 }; + + bool isCurrentVersion() const { return ( version == PDFILE_VERSION ) && ( versionMinor == PDFILE_VERSION_MINOR ); } + + bool uninitialized() const { return version == 0; } + + void init(int fileno, int filelength, const char* filename) { + if ( uninitialized() ) { + DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl; + if( !(filelength > 32768 ) ) { + massert(13640, str::stream() << "DataFileHeader looks corrupt at file open filelength:" << filelength << " fileno:" << fileno, false); + } + + { + if( !d.dbMutex.isWriteLocked() ) { + log() << "*** TEMP NOT INITIALIZING FILE " << filename << ", not in a write lock." << endl; + log() << "temp bypass until more elaborate change - case that is manifesting is benign anyway" << endl; + return; +/** + log() << "ERROR can't create outside a write lock" << endl; + printStackTrace(); + ::abort(); +**/ + } + } + + getDur().createdFile(filename, filelength); + assert( HeaderSize == 8192 ); + DataFileHeader *h = getDur().writing(this); + h->fileLength = filelength; + h->version = PDFILE_VERSION; + h->versionMinor = PDFILE_VERSION_MINOR; + h->unused.set( fileno, HeaderSize ); + assert( (data-(char*)this) == HeaderSize ); + h->unusedLength = fileLength - HeaderSize - 16; + } + } + + bool isEmpty() const { + return uninitialized() || ( unusedLength == fileLength - HeaderSize - 16 ); + } + }; + +#pragma pack() + + inline Extent* MongoDataFile::_getExtent(DiskLoc loc) const { + loc.assertOk(); + Extent *e = (Extent *) (p()+loc.getOfs()); + return e; + } + + inline Extent* MongoDataFile::getExtent(DiskLoc loc) const { + Extent *e = _getExtent(loc); + e->assertOk(); + return e; + } + +} // namespace mongo + +#include "cursor.h" + +namespace mongo { + + inline Record* MongoDataFile::recordAt(DiskLoc dl) { + int ofs = dl.getOfs(); + if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path + return (Record*) (p()+ofs); + } + + inline Record* MongoDataFile::makeRecord(DiskLoc dl, int size) { + int ofs = dl.getOfs(); + if( ofs < DataFileHeader::HeaderSize ) badOfs(ofs); // will uassert - external call to keep out of the normal code path + return (Record*) (p()+ofs); + } + + inline DiskLoc Record::getNext(const DiskLoc& myLoc) { + if ( nextOfs != DiskLoc::NullOfs ) { + /* defensive */ + if ( nextOfs >= 0 && nextOfs < 10 ) { + sayDbContext("Assertion failure - Record::getNext() referencing a deleted record?"); + return DiskLoc(); + } + + return DiskLoc(myLoc.a(), nextOfs); + } + Extent *e = myExtent(myLoc); + while ( 1 ) { + if ( e->xnext.isNull() ) + return DiskLoc(); // end of table. 
+ e = e->xnext.ext(); + if ( !e->firstRecord.isNull() ) + break; + // entire extent could be empty, keep looking + } + return e->firstRecord; + } + inline DiskLoc Record::getPrev(const DiskLoc& myLoc) { + if ( prevOfs != DiskLoc::NullOfs ) + return DiskLoc(myLoc.a(), prevOfs); + Extent *e = myExtent(myLoc); + if ( e->xprev.isNull() ) + return DiskLoc(); + return e->xprev.ext()->lastRecord; + } + + inline BSONObj DiskLoc::obj() const { + return BSONObj(rec()->accessed()); + } + inline DeletedRecord* DiskLoc::drec() const { + assert( _a != -1 ); + return (DeletedRecord*) rec(); + } + inline Extent* DiskLoc::ext() const { + return DataFileMgr::getExtent(*this); + } + + template< class V > + inline + const BtreeBucket<V> * DiskLoc::btree() const { + assert( _a != -1 ); + return (const BtreeBucket<V> *) rec()->data; + } + +} // namespace mongo + +#include "database.h" + +namespace mongo { + + boost::intmax_t dbSize( const char *database ); + + inline NamespaceIndex* nsindex(const char *ns) { + Database *database = cc().database(); + assert( database ); + DEV { + char buf[256]; + nsToDatabase(ns, buf); + if ( database->name != buf ) { + out() << "ERROR: attempt to write to wrong database\n"; + out() << " ns:" << ns << '\n'; + out() << " database->name:" << database->name << endl; + assert( database->name == buf ); + } + } + return &database->namespaceIndex; + } + + inline NamespaceDetails* nsdetails(const char *ns) { + // if this faults, did you set the current db first? (Client::Context + dblock) + return nsindex(ns)->details(ns); + } + + inline Extent* DataFileMgr::getExtent(const DiskLoc& dl) { + assert( dl.a() != -1 ); + return cc().database()->getFile(dl.a())->getExtent(dl); + } + + inline Record* DataFileMgr::getRecord(const DiskLoc& dl) { + assert( dl.a() != -1 ); + return cc().database()->getFile(dl.a())->recordAt(dl); + } + + BOOST_STATIC_ASSERT( 16 == sizeof(DeletedRecord) ); + + inline DeletedRecord* DataFileMgr::makeDeletedRecord(const DiskLoc& dl, int len) { + assert( dl.a() != -1 ); + return (DeletedRecord*) cc().database()->getFile(dl.a())->makeRecord(dl, sizeof(DeletedRecord)); + } + + void ensureHaveIdIndex(const char *ns); + + bool dropIndexes( NamespaceDetails *d, const char *ns, const char *name, string &errmsg, BSONObjBuilder &anObjBuilder, bool maydeleteIdIndex ); + + inline BSONObj::BSONObj(const Record *r) { + init(r->data); + } + +} // namespace mongo diff --git a/src/mongo/db/pipeline/accumulator.cpp b/src/mongo/db/pipeline/accumulator.cpp new file mode 100755 index 00000000000..9ef8aa39470 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator.cpp @@ -0,0 +1,92 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/accumulator.h"
+
+#include "db/jsobj.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ void Accumulator::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ uassert(15943, str::stream() << "group accumulator " <<
+ getOpName() << " only accepts one operand",
+ vpOperand.size() < 1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ Accumulator::Accumulator():
+ ExpressionNary() {
+ }
+
+ void Accumulator::opToBson(
+ BSONObjBuilder *pBuilder, string opName,
+ string fieldName, unsigned depth) const {
+ assert(vpOperand.size() == 1);
+ BSONObjBuilder builder;
+ vpOperand[0]->addToBsonObj(&builder, opName, depth);
+ pBuilder->append(fieldName, builder.done());
+ }
+
+ void Accumulator::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ opToBson(pBuilder, getOpName(), fieldName, depth);
+ }
+
+ void Accumulator::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false); // these can't appear in arrays
+ }
+
+ void agg_framework_reservedErrors() {
+ uassert(16017, "reserved error", false);
+ uassert(16018, "reserved error", false);
+ uassert(16019, "reserved error", false);
+ uassert(16020, "reserved error", false);
+ uassert(16021, "reserved error", false);
+ uassert(16022, "reserved error", false);
+ uassert(16023, "reserved error", false);
+ uassert(16024, "reserved error", false);
+ uassert(16025, "reserved error", false);
+ uassert(16026, "reserved error", false);
+ uassert(16027, "reserved error", false);
+ uassert(16028, "reserved error", false);
+ uassert(16029, "reserved error", false);
+ uassert(16030, "reserved error", false);
+ uassert(16031, "reserved error", false);
+ uassert(16032, "reserved error", false);
+ uassert(16033, "reserved error", false);
+
+ uassert(16036, "reserved error", false);
+ uassert(16037, "reserved error", false);
+ uassert(16038, "reserved error", false);
+ uassert(16039, "reserved error", false);
+ uassert(16040, "reserved error", false);
+ uassert(16041, "reserved error", false);
+ uassert(16042, "reserved error", false);
+ uassert(16043, "reserved error", false);
+ uassert(16044, "reserved error", false);
+ uassert(16045, "reserved error", false);
+ uassert(16046, "reserved error", false);
+ uassert(16047, "reserved error", false);
+ uassert(16048, "reserved error", false);
+ uassert(16049, "reserved error", false);
+ }
+}
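
The addToBsonObj()/opToBson() pair above re-serializes an accumulator into the fieldName: { $op: operand } form it takes inside a $group specification, for example when a parsed pipeline has to be written back out and forwarded to another node. A minimal sketch of that shape, built with the BSON() builder macro from db/jsobj.h (the field names here are illustrative, not taken from this change):

    // { $group: { _id: "$state", total: { $sum: "$pop" } } }
    BSONObj groupSpec = BSON(
        "$group" << BSON(
            "_id" << "$state"
            << "total" << BSON( "$sum" << "$pop" ) ) );

opToBson() regenerates the inner { "$sum" : "$pop" } document under the projected field name when such a spec is rebuilt from the parsed form.
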
diff --git a/src/mongo/db/pipeline/accumulator.h b/src/mongo/db/pipeline/accumulator.h new file mode 100755 index 00000000000..a75b2c9abaa --- /dev/null +++ b/src/mongo/db/pipeline/accumulator.h @@ -0,0 +1,259 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_set.hpp>
+#include "db/pipeline/value.h"
+#include "db/pipeline/expression.h"
+#include "bson/bsontypes.h"
+
+namespace mongo {
+ class ExpressionContext;
+
+ class Accumulator :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Get the accumulated value.
+
+ @returns the accumulated value
+ */
+ virtual intrusive_ptr<const Value> getValue() const = 0;
+
+ protected:
+ Accumulator();
+
+ /*
+ Convenience method for serializing an accumulator to BSON. The pattern
+ is always the same, so a common implementation works, but it requires
+ knowing the operator name.
+
+ @param pBuilder the builder to add to
+ @param opName the operator name
+ @param fieldName the projected name
+ */
+ void opToBson(
+ BSONObjBuilder *pBuilder, string opName, string fieldName,
+ unsigned depth) const;
+ };
+
+
+ class AccumulatorAddToSet :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorAddToSet(const intrusive_ptr<ExpressionContext> &pTheCtx);
+ typedef boost::unordered_set<intrusive_ptr<const Value>, Value::Hash > SetType;
+ mutable SetType set;
+ mutable SetType::iterator itr;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ /*
+ This isn't a finished accumulator, but rather a convenient base class
+ for others such as $first, $last, $max, $min, and similar. It just
+ provides a holder for a single Value, and the getter for that. The
+ holder is protected so derived classes can manipulate it.
+ */
+ class AccumulatorSingleValue :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> getValue() const;
+
+ protected:
+ AccumulatorSingleValue();
+
+ mutable intrusive_ptr<const Value> pValue; /* current value ($first, $last, $min, or $max) */
+ };
+
+
+ class AccumulatorFirst :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorFirst();
+ };
+
+
+ class AccumulatorLast :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create the accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorLast();
+ };
+
+
+ class AccumulatorSum :
+ public Accumulator {
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create a summing accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ protected: /* reused by AccumulatorAvg */
+ AccumulatorSum();
+
+ mutable BSONType totalType;
+ mutable long long longTotal;
+ mutable double doubleTotal;
+ };
+
+
+ class AccumulatorMinMax :
+ public AccumulatorSingleValue {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create either the max or min accumulator.
+
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+ static intrusive_ptr<Accumulator> createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorMinMax(int theSense);
+
+ int sense; /* 1 for min, -1 for max; used to "scale" comparison */
+ };
+
+
+ class AccumulatorPush :
+ public Accumulator {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an appending accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ AccumulatorPush(const intrusive_ptr<ExpressionContext> &pTheCtx);
+
+ mutable vector<intrusive_ptr<const Value> > vpValue;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class AccumulatorAvg :
+ public AccumulatorSum {
+ typedef AccumulatorSum Super;
+ public:
+ // virtuals from Accumulator
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual intrusive_ptr<const Value> getValue() const;
+ virtual const char *getOpName() const;
+
+ /*
+ Create an averaging accumulator.
+
+ @param pCtx the expression context
+ @returns the created accumulator
+ */
+ static intrusive_ptr<Accumulator> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ private:
+ static const char subTotalName[];
+ static const char countName[];
+
+ AccumulatorAvg(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ mutable long long count;
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+}
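
All of the concrete accumulators above share one lifecycle: the caller wires up a single operand with addOperand(), calls evaluate() once per input document, and reads the result with getValue() once the group is exhausted. A standalone model of that contract (a simplified stand-in, not the classes above):

    #include <iostream>

    // Simplified double-only "sum" mirroring the evaluate()/getValue() contract.
    class MiniSum {
    public:
        MiniSum() : total(0) {}
        void evaluate(double operand) { total += operand; } // once per document
        double getValue() const { return total; }           // once per group
    private:
        double total;
    };

    int main() {
        double docs[3] = { 1, 2, 0.5 };
        MiniSum sum;
        for (int i = 0; i < 3; ++i)
            sum.evaluate(docs[i]);
        std::cout << sum.getValue() << std::endl; // 3.5
        return 0;
    }
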
diff --git a/src/mongo/db/pipeline/accumulator_add_to_set.cpp b/src/mongo/db/pipeline/accumulator_add_to_set.cpp new file mode 100755 index 00000000000..94df0293de4 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_add_to_set.cpp @@ -0,0 +1,79 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorAddToSet::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ set.insert(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ set.insert(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
+
+ intrusive_ptr<const Value> AccumulatorAddToSet::getValue() const {
+ vector<intrusive_ptr<const Value> > valVec;
+
+ for (itr = set.begin(); itr != set.end(); ++itr) {
+ valVec.push_back(*itr);
+ }
+ /* there is no issue of scope since createArray copy constructs */
+ return Value::createArray(valVec);
+ }
+
+ AccumulatorAddToSet::AccumulatorAddToSet(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ set(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAddToSet::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAddToSet> pAccumulator(
+ new AccumulatorAddToSet(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorAddToSet::getOpName() const {
+ return "$addToSet";
+ }
+}
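
The router-side branch of evaluate() above flattens the per-shard arrays instead of inserting them whole; and because the target is a set, duplicates that appear on more than one shard also collapse. A self-contained sketch of that merge, with plain ints standing in for Value objects:

    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
        std::vector<int> shardA, shardB;              // per-shard $addToSet results
        shardA.push_back(1); shardA.push_back(2);
        shardB.push_back(2); shardB.push_back(3);

        // insert the elements, not the arrays, so the result stays flat
        std::set<int> merged;
        merged.insert(shardA.begin(), shardA.end());
        merged.insert(shardB.begin(), shardB.end());

        for (std::set<int>::iterator it = merged.begin(); it != merged.end(); ++it)
            std::cout << *it << ' ';                  // prints: 1 2 3
        std::cout << std::endl;
        return 0;
    }
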
diff --git a/src/mongo/db/pipeline/accumulator_avg.cpp b/src/mongo/db/pipeline/accumulator_avg.cpp new file mode 100755 index 00000000000..9f18b1820c8 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_avg.cpp @@ -0,0 +1,123 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char AccumulatorAvg::subTotalName[] = "subTotal";
+ const char AccumulatorAvg::countName[] = "count";
+
+ intrusive_ptr<const Value> AccumulatorAvg::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ if (!pCtx->getInRouter()) {
+ Super::evaluate(pDocument);
+ ++count;
+ }
+ else {
+ /*
+ If we're in the router, we expect an object that contains
 + both a subtotal and a count. This is what getValue() produces
 + below.
+ */
+ intrusive_ptr<const Value> prhs(
+ vpOperand[0]->evaluate(pDocument));
+ assert(prhs->getType() == Object);
+ intrusive_ptr<Document> pShardDoc(prhs->getDocument());
+
+ intrusive_ptr<const Value> pSubTotal(
+ pShardDoc->getValue(subTotalName));
+ assert(pSubTotal.get());
+ BSONType subTotalType = pSubTotal->getType();
+ if ((totalType == NumberLong) || (subTotalType == NumberLong))
+ totalType = NumberLong;
+ if ((totalType == NumberDouble) || (subTotalType == NumberDouble))
+ totalType = NumberDouble;
+
+ if (subTotalType == NumberInt) {
+ int v = pSubTotal->getInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (subTotalType == NumberLong) {
+ long long v = pSubTotal->getLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else {
+ double v = pSubTotal->getDouble();
+ doubleTotal += v;
+ }
+
+ intrusive_ptr<const Value> pCount(pShardDoc->getValue(countName));
+ count += pCount->getLong();
+ }
+
+ return Value::getZero();
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorAvg::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorAvg> pA(new AccumulatorAvg(pCtx));
+ return pA;
+ }
+
+ intrusive_ptr<const Value> AccumulatorAvg::getValue() const {
+ if (!pCtx->getInShard()) {
+ double avg = 0;
+ if (count) {
+ if (totalType != NumberDouble)
 + avg = static_cast<double>(longTotal) / count; // divide as double so integral averages aren't truncated
+ else
+ avg = doubleTotal / count;
+ }
+
+ return Value::createDouble(avg);
+ }
+
+ intrusive_ptr<Document> pDocument(Document::create());
+
+ intrusive_ptr<const Value> pSubTotal;
+ if (totalType == NumberInt)
+ pSubTotal = Value::createInt((int)longTotal);
+ else if (totalType == NumberLong)
+ pSubTotal = Value::createLong(longTotal);
+ else
+ pSubTotal = Value::createDouble(doubleTotal);
+ pDocument->addField(subTotalName, pSubTotal);
+
+ intrusive_ptr<const Value> pCount(Value::createLong(count));
+ pDocument->addField(countName, pCount);
+
+ return Value::createDocument(pDocument);
+ }
+
+ AccumulatorAvg::AccumulatorAvg(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ AccumulatorSum(),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ const char *AccumulatorAvg::getOpName() const {
+ return "$avg";
+ }
+}
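
getValue() above deliberately ships { subTotal, count } from a shard rather than the average itself: per-shard averages cannot be combined without their weights. A self-contained sketch of the router-side merge, with a plain struct standing in for the intermediate document:

    #include <iostream>

    struct Partial { double subTotal; long long count; };

    int main() {
        // shard averages are 2.0 and 4.0; averaging those would give 3.0
        Partial shards[2] = { { 4.0, 2 }, { 12.0, 3 } };

        double subTotal = 0;
        long long count = 0;
        for (int i = 0; i < 2; ++i) {   // what the router-side evaluate() does
            subTotal += shards[i].subTotal;
            count += shards[i].count;
        }
        std::cout << subTotal / count << std::endl; // 3.2, the correct result
        return 0;
    }
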
diff --git a/src/mongo/db/pipeline/accumulator_first.cpp b/src/mongo/db/pipeline/accumulator_first.cpp new file mode 100755 index 00000000000..c947aa83996 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_first.cpp @@ -0,0 +1,49 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorFirst::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* only remember the first value seen */
+ if (!pValue.get())
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorFirst::AccumulatorFirst():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorFirst::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorFirst> pAccumulator(
+ new AccumulatorFirst());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorFirst::getOpName() const {
+ return "$first";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_last.cpp b/src/mongo/db/pipeline/accumulator_last.cpp new file mode 100755 index 00000000000..c134fc83159 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_last.cpp @@ -0,0 +1,48 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorLast::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+
+ /* always remember the last value seen */
+ pValue = vpOperand[0]->evaluate(pDocument);
+
+ return pValue;
+ }
+
+ AccumulatorLast::AccumulatorLast():
+ AccumulatorSingleValue() {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorLast::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorLast> pAccumulator(
+ new AccumulatorLast());
+ return pAccumulator;
+ }
+
+ const char *AccumulatorLast::getOpName() const {
+ return "$last";
+ }
+}
diff --git a/src/mongo/db/pipeline/accumulator_min_max.cpp b/src/mongo/db/pipeline/accumulator_min_max.cpp new file mode 100755 index 00000000000..6f078187b44 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_min_max.cpp @@ -0,0 +1,67 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorMinMax::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* if this is the first value, just use it */
+ if (!pValue.get())
+ pValue = prhs;
+ else {
+ /* compare with the current value; swap if appropriate */
+ int cmp = Value::compare(pValue, prhs) * sense;
+ if (cmp > 0)
+ pValue = prhs;
+ }
+
+ return pValue;
+ }
+
+ AccumulatorMinMax::AccumulatorMinMax(int theSense):
+ AccumulatorSingleValue(),
+ sense(theSense) {
+ assert((sense == 1) || (sense == -1));
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMin(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(1));
+ return pAccumulator;
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorMinMax::createMax(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorMinMax> pAccumulator(
+ new AccumulatorMinMax(-1));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorMinMax::getOpName() const {
+ if (sense == 1)
+ return "$min";
+ return "$max";
+ }
+}
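
One comparison routine serves both operators because evaluate() multiplies the three-way comparison by sense before testing it. A self-contained illustration of that trick:

    #include <iostream>

    int compare(int a, int b) { return a < b ? -1 : (a > b ? 1 : 0); }

    // mirrors evaluate(): replace the held value when cmp * sense > 0
    int keepBest(int current, int candidate, int sense /* 1 = min, -1 = max */) {
        return compare(current, candidate) * sense > 0 ? candidate : current;
    }

    int main() {
        std::cout << keepBest(5, 3, 1) << std::endl;  // 3  ($min keeps the smaller)
        std::cout << keepBest(5, 3, -1) << std::endl; // 5  ($max keeps the larger)
        return 0;
    }
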
diff --git a/src/mongo/db/pipeline/accumulator_push.cpp b/src/mongo/db/pipeline/accumulator_push.cpp new file mode 100755 index 00000000000..2640bc4ecfd --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_push.cpp @@ -0,0 +1,73 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ intrusive_ptr<const Value> AccumulatorPush::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ if (prhs->getType() == Undefined)
+ ; /* nothing to add to the array */
+ else if (!pCtx->getInRouter())
+ vpValue.push_back(prhs);
+ else {
+ /*
+ If we're in the router, we need to take apart the arrays we
+ receive and put their elements into the array we are collecting.
+ If we didn't, then we'd get an array of arrays, with one array
+ from each shard that responds.
+ */
+ assert(prhs->getType() == Array);
+
+ intrusive_ptr<ValueIterator> pvi(prhs->getArray());
+ while(pvi->more()) {
+ intrusive_ptr<const Value> pElement(pvi->next());
+ vpValue.push_back(pElement);
+ }
+ }
+
+ return Value::getNull();
+ }
+
+ intrusive_ptr<const Value> AccumulatorPush::getValue() const {
+ return Value::createArray(vpValue);
+ }
+
+ AccumulatorPush::AccumulatorPush(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ Accumulator(),
+ vpValue(),
+ pCtx(pTheCtx) {
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorPush::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorPush> pAccumulator(
+ new AccumulatorPush(pCtx));
+ return pAccumulator;
+ }
+
+ const char *AccumulatorPush::getOpName() const {
+ return "$push";
+ }
+}
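
$push is structurally identical to $addToSet above, including the router-side flattening; the only behavioral difference is the container, a vector, which preserves duplicates and arrival order. A small contrast, again with ints standing in for Value objects:

    #include <iostream>
    #include <set>
    #include <vector>

    int main() {
        int input[4] = { 2, 1, 2, 3 };
        std::vector<int> pushed(input, input + 4); // $push     -> [2, 1, 2, 3]
        std::set<int> added(input, input + 4);     // $addToSet -> {1, 2, 3}
        std::cout << pushed.size() << " vs " << added.size() << std::endl; // 4 vs 3
        return 0;
    }
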
diff --git a/src/mongo/db/pipeline/accumulator_single_value.cpp b/src/mongo/db/pipeline/accumulator_single_value.cpp new file mode 100755 index 00000000000..bfec80387d3 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_single_value.cpp @@ -0,0 +1,32 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSingleValue::getValue() const {
+ return pValue;
+ }
+
+ AccumulatorSingleValue::AccumulatorSingleValue():
+ pValue(intrusive_ptr<const Value>()) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/accumulator_sum.cpp b/src/mongo/db/pipeline/accumulator_sum.cpp new file mode 100755 index 00000000000..e6526ac254a --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_sum.cpp @@ -0,0 +1,74 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "accumulator.h"
+
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ intrusive_ptr<const Value> AccumulatorSum::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ assert(vpOperand.size() == 1);
+ intrusive_ptr<const Value> prhs(vpOperand[0]->evaluate(pDocument));
+
+ /* upgrade to the widest type required to hold the result */
+ totalType = Value::getWidestNumeric(totalType, prhs->getType());
+
+ if (totalType == NumberInt) {
+ int v = prhs->coerceToInt();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else if (totalType == NumberLong) {
+ long long v = prhs->coerceToLong();
+ longTotal += v;
+ doubleTotal += v;
+ }
+ else { /* (totalType == NumberDouble) */
+ double v = prhs->coerceToDouble();
+ doubleTotal += v;
+ }
+
+ return Value::getZero();
+ }
+
+ intrusive_ptr<Accumulator> AccumulatorSum::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<AccumulatorSum> pSummer(new AccumulatorSum());
+ return pSummer;
+ }
+
+ intrusive_ptr<const Value> AccumulatorSum::getValue() const {
+ if (totalType == NumberInt)
+ return Value::createInt((int)longTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createDouble(doubleTotal);
+ }
+
+ AccumulatorSum::AccumulatorSum():
+ Accumulator(),
+ totalType(NumberInt),
+ longTotal(0),
+ doubleTotal(0) {
+ }
+
+ const char *AccumulatorSum::getOpName() const {
+ return "$sum";
+ }
+}
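
The running type in evaluate() above only ever widens, NumberInt to NumberLong to NumberDouble, and both an integer and a double total are carried along so that getValue() can report the narrowest type that held every input. A self-contained sketch of that promotion:

    #include <iostream>

    enum NumType { TInt, TLong, TDouble }; // ordered narrowest to widest

    NumType widest(NumType a, NumType b) { return a > b ? a : b; }

    int main() {
        NumType totalType = TInt;
        long long longTotal = 0;
        double doubleTotal = 0;

        double values[3] = { 1, 2, 0.5 };
        NumType types[3] = { TInt, TInt, TDouble };
        for (int i = 0; i < 3; ++i) {
            totalType = widest(totalType, types[i]); // monotonic widening
            longTotal += (long long)values[i];
            doubleTotal += values[i];
        }

        // the widest type seen decides which total is authoritative
        std::cout << (totalType == TDouble ? doubleTotal : (double)longTotal)
                  << std::endl; // 3.5
        return 0;
    }
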
diff --git a/src/mongo/db/pipeline/builder.cpp b/src/mongo/db/pipeline/builder.cpp new file mode 100755 index 00000000000..cbde3705656 --- /dev/null +++ b/src/mongo/db/pipeline/builder.cpp @@ -0,0 +1,117 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+
+
+namespace mongo {
+
+ void BuilderObj::append() {
+ pBuilder->appendNull(fieldName);
+ }
+
+ void BuilderObj::append(bool b) {
+ pBuilder->append(fieldName, b);
+ }
+
+ void BuilderObj::append(int i) {
+ pBuilder->append(fieldName, i);
+ }
+
+ void BuilderObj::append(long long ll) {
+ pBuilder->append(fieldName, ll);
+ }
+
+ void BuilderObj::append(double d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(string s) {
+ pBuilder->append(fieldName, s);
+ }
+
+ void BuilderObj::append(const OID &o) {
+ pBuilder->append(fieldName, o);
+ }
+
+ void BuilderObj::append(const Date_t &d) {
+ pBuilder->append(fieldName, d);
+ }
+
+ void BuilderObj::append(BSONObjBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->done());
+ }
+
+ void BuilderObj::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(fieldName, pDone->arr());
+ }
+
+ BuilderObj::BuilderObj(
+ BSONObjBuilder *pObjBuilder, string theFieldName):
+ pBuilder(pObjBuilder),
+ fieldName(theFieldName) {
+ }
+
+
+ void BuilderArray::append() {
+ pBuilder->appendNull();
+ }
+
+ void BuilderArray::append(bool b) {
+ pBuilder->append(b);
+ }
+
+ void BuilderArray::append(int i) {
+ pBuilder->append(i);
+ }
+
+ void BuilderArray::append(long long ll) {
+ pBuilder->append(ll);
+ }
+
+ void BuilderArray::append(double d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(string s) {
+ pBuilder->append(s);
+ }
+
+ void BuilderArray::append(const OID &o) {
+ pBuilder->append(o);
+ }
+
+ void BuilderArray::append(const Date_t &d) {
+ pBuilder->append(d);
+ }
+
+ void BuilderArray::append(BSONObjBuilder *pDone) {
+ pBuilder->append(pDone->done());
+ }
+
+ void BuilderArray::append(BSONArrayBuilder *pDone) {
+ pBuilder->append(pDone->arr());
+ }
+
+ BuilderArray::BuilderArray(
+ BSONArrayBuilder *pArrayBuilder):
+ pBuilder(pArrayBuilder) {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/builder.h b/src/mongo/db/pipeline/builder.h new file mode 100755 index 00000000000..bdf71cd784c --- /dev/null +++ b/src/mongo/db/pipeline/builder.h @@ -0,0 +1,95 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class BSONArrayBuilder;
+ class BSONObjBuilder;
+
+ /*
+ Generic Builder.
+
+ The methods to append items to an object (on BSONObjBuilder) and an array
+ (on BSONArrayBuilder) differ only by their inclusion of a field name.
+ For more complicated implementations of addToBsonObj() and
+ addToBsonArray(), it makes sense to abstract that difference away:
+ write the serialization once against this generic builder, and then
+ implement addToBsonObj() and addToBsonArray() with one common method.
+ */
+ class Builder :
+ boost::noncopyable {
+ public:
+ virtual ~Builder() {};
+
+ virtual void append() = 0; // append a null
+ virtual void append(bool b) = 0;
+ virtual void append(int i) = 0;
+ virtual void append(long long ll) = 0;
+ virtual void append(double d) = 0;
+ virtual void append(string s) = 0;
+ virtual void append(const OID &o) = 0;
+ virtual void append(const Date_t &d) = 0;
+ virtual void append(BSONObjBuilder *pDone) = 0;
+ virtual void append(BSONArrayBuilder *pDone) = 0;
+ };
+
+ class BuilderObj :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderObj(BSONObjBuilder *pBuilder, string fieldName);
+
+ private:
+ BSONObjBuilder *pBuilder;
+ string fieldName;
+ };
+
+ class BuilderArray :
+ public Builder {
+ public:
+ // virtuals from Builder
+ virtual void append();
+ virtual void append(bool b);
+ virtual void append(int i);
+ virtual void append(long long ll);
+ virtual void append(double d);
+ virtual void append(string s);
+ virtual void append(const OID &o);
+ virtual void append(const Date_t &d);
+ virtual void append(BSONObjBuilder *pDone);
+ virtual void append(BSONArrayBuilder *pDone);
+
+ BuilderArray(BSONArrayBuilder *pBuilder);
+
+ private:
+ BSONArrayBuilder *pBuilder;
+ };
+}
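To illustrate the pattern this abstraction enables (a sketch, not part of the patch; the helper name appendAnswer is hypothetical): one serialization routine written against Builder can fill either an object field or an array element, depending on the concrete builder it is handed.

    #include "db/jsobj.h"
    #include "db/pipeline/builder.h"

    namespace mongo {
        // one routine serves both BSON objects and BSON arrays
        static void appendAnswer(Builder *pBuilder) {
            pBuilder->append(42);
        }

        void builderSketch() {
            BSONObjBuilder objBuilder;
            BuilderObj obj(&objBuilder, "answer");
            appendAnswer(&obj);           // produces { "answer" : 42 }

            BSONArrayBuilder arrBuilder;
            BuilderArray arr(&arrBuilder);
            appendAnswer(&arr);           // produces [ 42 ]
        }
    }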
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.cpp b/src/mongo/db/pipeline/doc_mem_monitor.cpp new file mode 100755 index 00000000000..ffbe9c88854 --- /dev/null +++ b/src/mongo/db/pipeline/doc_mem_monitor.cpp @@ -0,0 +1,68 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "util/systeminfo.h"
+
+namespace mongo {
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW) {
+ /*
+ Use the default values.
+
+ Currently, we warn in log at 5%, and assert at 10%.
+ */
+ size_t errorRam = SystemInfo::getPhysicalRam() / 10;
+ size_t warnRam = errorRam / 2;
+
+ init(pW, warnRam, errorRam);
+ }
+
+ DocMemMonitor::DocMemMonitor(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ init(pW, warnLimit, errorLimit);
+ }
+
+ void DocMemMonitor::addToTotal(size_t amount) {
+ totalUsed += amount;
+
+ if (!warned) {
+ if (warnLimit && (totalUsed > warnLimit)) {
+ stringstream ss;
+ ss << "warning, 5% of physical RAM used for ";
+ pWriter->writeString(ss);
+ ss << endl;
+ warning() << ss.str();
+ warned = true;
+ }
+ }
+
+ if (errorLimit) {
+ uassert(15944, "terminating request: request heap use exceeded 10% of physical RAM", (totalUsed <= errorLimit));
+ }
+ }
+
+ void DocMemMonitor::init(StringWriter *pW,
+ size_t warnLimit, size_t errorLimit) {
+ this->pWriter = pW;
+ this->warnLimit = warnLimit;
+ this->errorLimit = errorLimit;
+
+ warned = false;
+ totalUsed = 0;
+ }
+}
diff --git a/src/mongo/db/pipeline/doc_mem_monitor.h b/src/mongo/db/pipeline/doc_mem_monitor.h new file mode 100755 index 00000000000..e368acc906a --- /dev/null +++ b/src/mongo/db/pipeline/doc_mem_monitor.h @@ -0,0 +1,94 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "util/string_writer.h"
+
+
+namespace mongo {
+
+ /*
+ This utility class provides an easy way to total up, monitor, warn, and
+ signal an error when the amount of memory used for an operation exceeds
+ given thresholds.
+
+ Create a local instance of this class, and then inform it of any memory
+ that you consume using addToTotal().
+
+ Warnings or errors are issued as usage exceeds certain fractions of
+ physical memory on the host, as determined by SystemInfo.
+
+ This class is not guaranteed to warn or signal errors if the host system
+ does not support the ability to report its memory, as per the warnings
+ for SystemInfo in systeminfo.h.
+ */
+ class DocMemMonitor {
+ public:
+ /*
+ Constructor.
+
+ Uses default limits for warnings and errors.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+ */
+ DocMemMonitor(StringWriter *pWriter);
+
+ /*
+ Constructor.
+
+ This variant allows explicit selection of the limits. Note that
+ limits of zero are treated as infinite.
+
+ The StringWriter parameter must outlive the DocMemMonitor instance.
+
+ @param pWriter string writer that provides information about the
+ operation being monitored
+ @param warnLimit the amount of RAM use at which to log a warning
+ @param errorLimit the amount of RAM use at which to throw an error
+ */
+ DocMemMonitor(StringWriter *pWriter, size_t warnLimit,
+ size_t errorLimit);
+
+ /*
+ Increment the total amount of memory used by the given amount. If
+ the warning threshold is exceeded, a warning will be logged. If the
+ error threshold is exceeded, an error will be thrown.
+
+ @param amount the amount of memory to add to the current total
+ */
+ void addToTotal(size_t amount);
+
+ private:
+ /*
+ Real constructor body.
+
+ Provides common construction for all the variant constructors.
+ */
+ void init(StringWriter *pW, size_t warnLimit, size_t errorLimit);
+
+ bool warned;
+ size_t totalUsed;
+ size_t warnLimit;
+ size_t errorLimit;
+ StringWriter *pWriter;
+ };
+
+}
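A minimal usage sketch against the interface above (OpNameWriter is a hypothetical StringWriter that labels the operation in the warning message). The default constructor derives its thresholds from SystemInfo::getPhysicalRam(); this variant picks explicit ones.

    #include "db/pipeline/doc_mem_monitor.h"

    namespace mongo {
        class OpNameWriter : public StringWriter {
        public:
            virtual void writeString(stringstream &ss) const {
                ss << "$group"; // names the monitored operation
            }
        };

        void monitorSketch() {
            OpNameWriter writer; // must outlive the monitor
            /* warn at 64MB, uassert at 128MB; zero means no limit */
            DocMemMonitor monitor(&writer,
                                  64 * 1024 * 1024, 128 * 1024 * 1024);
            monitor.addToTotal(1024); // charge each allocation as it happens
        }
    }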
diff --git a/src/mongo/db/pipeline/document.cpp b/src/mongo/db/pipeline/document.cpp new file mode 100755 index 00000000000..a49c7e303c1 --- /dev/null +++ b/src/mongo/db/pipeline/document.cpp @@ -0,0 +1,219 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ string Document::idName("_id");
+
+ intrusive_ptr<Document> Document::createFromBsonObj(BSONObj *pBsonObj) {
+ intrusive_ptr<Document> pDocument(new Document(pBsonObj));
+ return pDocument;
+ }
+
+ Document::Document(BSONObj *pBsonObj):
+ vFieldName(),
+ vpValue() {
+ BSONObjIterator bsonIterator(pBsonObj->begin());
+ while(bsonIterator.more()) {
+ BSONElement bsonElement(bsonIterator.next());
+ string fieldName(bsonElement.fieldName());
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&bsonElement));
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+ }
+
+ void Document::toBson(BSONObjBuilder *pBuilder) {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i)
+ vpValue[i]->addToBsonObj(pBuilder, vFieldName[i]);
+ }
+
+ intrusive_ptr<Document> Document::create(size_t sizeHint) {
+ intrusive_ptr<Document> pDocument(new Document(sizeHint));
+ return pDocument;
+ }
+
+ Document::Document(size_t sizeHint):
+ vFieldName(),
+ vpValue() {
+ if (sizeHint) {
+ vFieldName.reserve(sizeHint);
+ vpValue.reserve(sizeHint);
+ }
+ }
+
+ intrusive_ptr<Document> Document::clone() {
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pNew(Document::create(n));
+ for(size_t i = 0; i < n; ++i)
+ pNew->addField(vFieldName[i], vpValue[i]);
+
+ return pNew;
+ }
+
+ Document::~Document() {
+ }
+
+ FieldIterator *Document::createFieldIterator() {
+ return new FieldIterator(intrusive_ptr<Document>(this));
+ }
+
+ intrusive_ptr<const Value> Document::getValue(const string &fieldName) {
+ /*
+ For now, assume the number of fields is small enough that iteration
+ is ok. Later, if this gets large, we can create a map into the
+ vector for these lookups.
+
+ Note that because of the schema-less nature of this data, we always
+ have to look, and can't assume that the requested field is always
+ in a particular place as we would with a statically compilable
+ reference.
+ */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ return(intrusive_ptr<const Value>());
+ }
+
+ void Document::addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ uassert(15945, str::stream() << "cannot add undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ vFieldName.push_back(fieldName);
+ vpValue.push_back(pValue);
+ }
+
+ void Document::setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue) {
+ /* special case: should this field be removed? */
+ if (!pValue.get()) {
+ vFieldName.erase(vFieldName.begin() + index);
+ vpValue.erase(vpValue.begin() + index);
+ return;
+ }
+
+ /* make sure we have a valid value */
+ uassert(15968, str::stream() << "cannot set undefined field " <<
+ fieldName << " to document", pValue->getType() != Undefined);
+
+ /* set the indicated field */
+ vFieldName[index] = fieldName;
+ vpValue[index] = pValue;
+ }
+
+ intrusive_ptr<const Value> Document::getField(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpValue[i];
+ }
+
+ /* if we got here, there's no such field */
+ return intrusive_ptr<const Value>();
+ }
+
+ size_t Document::getApproximateSize() const {
+ size_t size = sizeof(Document);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i)
+ size += vpValue[i]->getApproximateSize();
+
+ return size;
+ }
+
+ size_t Document::getFieldIndex(const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ size_t i = 0;
+ for(; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ return i;
+ }
+
+ void Document::hash_combine(size_t &seed) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ boost::hash_combine(seed, vFieldName[i]);
+ vpValue[i]->hash_combine(seed);
+ }
+ }
+
+ int Document::compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR) {
+ const size_t lSize = rL->vFieldName.size();
+ const size_t rSize = rR->vFieldName.size();
+
+ for(size_t i = 0; true; ++i) {
+ if (i >= lSize) {
+ if (i >= rSize)
+ return 0; // documents are the same length
+
+ return -1; // left document is shorter
+ }
+
+ if (i >= rSize)
+ return 1; // right document is shorter
+
+ const int nameCmp = rL->vFieldName[i].compare(rR->vFieldName[i]);
+ if (nameCmp)
+ return nameCmp; // field names are unequal
+
+ const int valueCmp = Value::compare(rL->vpValue[i], rR->vpValue[i]);
+ if (valueCmp)
+ return valueCmp; // fields are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ return 0;
+ }
+
+ /* ----------------------- FieldIterator ------------------------------- */
+
+ FieldIterator::FieldIterator(const intrusive_ptr<Document> &pTheDocument):
+ pDocument(pTheDocument),
+ index(0) {
+ }
+
+ bool FieldIterator::more() const {
+ return (index < pDocument->vFieldName.size());
+ }
+
+ pair<string, intrusive_ptr<const Value> > FieldIterator::next() {
+ assert(more());
+ pair<string, intrusive_ptr<const Value> > result(
+ pDocument->vFieldName[index], pDocument->vpValue[index]);
+ ++index;
+ return result;
+ }
+}
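A short sketch of the Document API (assuming the pipeline headers above and the BSON() construction macro from db/jsobj.h): construction from a BSONObj, the linear-scan lookup, and field iteration. The iterator keeps an intrusive_ptr to the document, so the fields stay alive while it is in use.

    #include "db/jsobj.h"
    #include "db/pipeline/document.h"
    #include "db/pipeline/value.h"

    namespace mongo {
        void documentSketch() {
            BSONObj obj(BSON("_id" << 1 << "name" << "alice"));
            intrusive_ptr<Document> pDoc(Document::createFromBsonObj(&obj));

            /* lookups walk the parallel field vectors */
            intrusive_ptr<const Value> pName(pDoc->getValue("name"));

            /* the caller owns the iterator */
            scoped_ptr<FieldIterator> pIter(pDoc->createFieldIterator());
            while (pIter->more()) {
                Document::FieldPair field(pIter->next());
                /* field.first is the name, field.second the Value */
            }
        }
    }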
diff --git a/src/mongo/db/pipeline/document.h b/src/mongo/db/pipeline/document.h new file mode 100755 index 00000000000..f11a825151e --- /dev/null +++ b/src/mongo/db/pipeline/document.h @@ -0,0 +1,246 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONObj;
+ class FieldIterator;
+ class Value;
+
+ class Document :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Document();
+
+ /*
+ Create a new Document from the given BSONObj.
+
+ Document field values may be pointed to in the BSONObj, so it
+ must live at least as long as the resulting Document.
+
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> createFromBsonObj(BSONObj *pBsonObj);
+
+ /*
+ Create a new empty Document.
+
+ @param sizeHint a hint at what the number of fields will be; if
+ known, this can be used to increase memory allocation efficiency
+ @returns shared pointer to the newly created Document
+ */
+ static intrusive_ptr<Document> create(size_t sizeHint = 0);
+
+ /*
+ Clone a document.
+
+ The new document shares all the fields' values with the original.
+
+ This is not a deep copy. Only the fields on the top-level document
+ are cloned.
+
+ @returns the shallow clone of the document
+ */
+ intrusive_ptr<Document> clone();
+
+ /*
+ Add this document to the BSONObj under construction with the
+ given BSONObjBuilder.
+ */
+ void toBson(BSONObjBuilder *pBsonObjBuilder);
+
+ /*
+ Create a new FieldIterator that can be used to examine the
+ Document's fields.
+ */
+ FieldIterator *createFieldIterator();
+
+ /*
+ Get the value of the specified field.
+
+ @param fieldName the name of the field
+ @return pointer to the requested field
+ */
+ intrusive_ptr<const Value> getValue(const string &fieldName);
+
+ /*
+ Add the given field to the Document.
+
+ BSON documents' fields are ordered; the new Field will be
+ appended to the current list of fields.
+
+ It is an error to add a field that has the same name as another
+ field.
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Set the given field to be at the specified position in the
+ Document. This will replace any field that is currently in that
+ position. The index must be within the current range of field
+ indices.
+
+ pValue.get() may be NULL, in which case the field will be
+ removed. fieldName is ignored in this case.
+
+ @param index the field index in the list of fields
+ @param fieldName the new field name
+ @param pValue the new Value
+ */
+ void setField(size_t index,
+ const string &fieldName,
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Convenience type for dealing with fields.
+ */
+ typedef pair<string, intrusive_ptr<const Value> > FieldPair;
+
+ /*
+ Get the indicated field.
+
+ @param index the field index in the list of fields
+ @returns the field name and value of the field
+ */
+ FieldPair getField(size_t index) const;
+
+ /*
+ Get the number of fields in the Document.
+
+ @returns the number of fields in the Document
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get the index of the given field.
+
+ @param fieldName the name of the field
+ @returns the index of the field, or if it does not exist, the number
+ of fields (getFieldCount())
+ */
+ size_t getFieldIndex(const string &fieldName) const;
+
+ /*
+ Get a field by name.
+
+ @param fieldName the name of the field
+ @returns the value of the field
+ */
+ intrusive_ptr<const Value> getField(const string &fieldName) const;
+
+ /*
+ Get the approximate storage size of the document, in bytes.
+
+ Under the assumption that field name strings are shared, they are
+ not included in the total.
+
+ @returns the approximate storage size, in bytes
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Compare two documents.
+
+ BSON document field order is significant, so this just goes through
+ the fields in order. The comparison is done in roughly the same way
+ as strings are compared, but comparing one field at a time instead
+ of one character at a time.
+ */
+ static int compare(const intrusive_ptr<Document> &rL,
+ const intrusive_ptr<Document> &rR);
+
+ static string idName; // shared "_id"
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+ @param seed value to augment with this document's hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ private:
+ friend class FieldIterator;
+
+ Document(size_t sizeHint);
+ Document(BSONObj *pBsonObj);
+
+ /* these two vectors parallel each other */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<const Value> > vpValue;
+ };
+
+
+ class FieldIterator :
+ boost::noncopyable {
+ public:
+ /*
+ Ask if there are more fields to return.
+
+ @return true if there are more fields, false otherwise
+ */
+ bool more() const;
+
+ /*
+ Move the iterator to point to the next field and return it.
+
+ @return the next field's <name, Value>
+ */
+ Document::FieldPair next();
+
+ private:
+ friend class Document;
+
+ /*
+ Constructor.
+
+ @param pDocument points to the document whose fields are being
+ iterated
+ */
+ FieldIterator(const intrusive_ptr<Document> &pDocument);
+
+ /*
+ We'll hang on to the original document to ensure we keep the
+ field name and value vectors alive.
+ */
+ intrusive_ptr<Document> pDocument;
+ size_t index; // current field in iteration
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t Document::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline Document::FieldPair Document::getField(size_t index) const {
+ assert( index < vFieldName.size() );
+ return FieldPair(vFieldName[index], vpValue[index]);
+ }
+
+}
diff --git a/src/mongo/db/pipeline/document_source.cpp b/src/mongo/db/pipeline/document_source.cpp new file mode 100755 index 00000000000..813852e35c6 --- /dev/null +++ b/src/mongo/db/pipeline/document_source.cpp @@ -0,0 +1,52 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+ DocumentSource::~DocumentSource() {
+ }
+
+ void DocumentSource::setSource(
+ const intrusive_ptr<DocumentSource> &pTheSource) {
+ assert(!pSource.get());
+ pSource = pTheSource;
+ }
+
+ bool DocumentSource::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+ return false;
+ }
+
+ void DocumentSource::optimize() {
+ }
+
+ void DocumentSource::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sourceToBson(&insides);
+ pBuilder->append(insides.done());
+ }
+
+ void DocumentSource::writeString(stringstream &ss) const {
+ BSONArrayBuilder bab;
+ addToBsonArray(&bab);
+ BSONArray ba(bab.arr());
+ ss << ba.toString(/* isArray */true);
+ // our toString should use standard string types.....
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source.h b/src/mongo/db/pipeline/document_source.h new file mode 100755 index 00000000000..8d5f0f70847 --- /dev/null +++ b/src/mongo/db/pipeline/document_source.h @@ -0,0 +1,985 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include <boost/unordered_map.hpp>
+#include "util/intrusive_counter.h"
+#include "client/parallel.h"
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+#include "util/string_writer.h"
+
+namespace mongo {
+ class Accumulator;
+ class Cursor;
+ class Document;
+ class Expression;
+ class ExpressionContext;
+ class ExpressionFieldPath;
+ class ExpressionObject;
+ class Matcher;
+
+ class DocumentSource :
+ public IntrusiveCounterUnsigned,
+ public StringWriter {
+ public:
+ virtual ~DocumentSource();
+
+ // virtuals from StringWriter
+ /*
+ Write out a string representation of this pipeline operator.
+
+ @param ss string stream to write the string representation to
+ */
+ virtual void writeString(stringstream &ss) const;
+
+
+ /*
+ Is the source at EOF?
+
+ @returns true if the source has no more Documents to return.
+ */
+ virtual bool eof() = 0;
+
+ /*
+ Advance the state of the DocumentSource so that it will return the
+ next Document.
+
+ @returns whether there is another document to fetch, i.e., whether or
+ not getCurrent() will succeed.
+ */
+ virtual bool advance() = 0;
+
+ /*
+ Get the Document at the source's current position.
+
+ @returns the current Document
+ TODO throws an exception if there are no more Documents to return.
+ */
+ virtual intrusive_ptr<Document> getCurrent() = 0;
+
+ /*
+ Set the underlying source this source should use to get Documents
+ from.
+
+ It is an error to set the source more than once. This is to
+ prevent changing sources once the original source has been started;
+ this could break the state maintained by the DocumentSource.
+
+ @param pSource the underlying source to use
+ */
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Attempt to coalesce this DocumentSource with its successor in the
+ document processing pipeline. If successful, the successor
+ DocumentSource should be removed from the pipeline and discarded.
+
+ If successful, this operation can be applied repeatedly, in an
+ attempt to coalesce several sources together.
+
+ The default implementation is to do nothing, and return false.
+
+ @param pNextSource the next source in the document processing chain.
+ @returns whether or not the attempt to coalesce was successful;
+ if the attempt was not successful, nothing has been changed
+ */
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+
+ /*
+ Optimize the pipeline operation, if possible. This is a local
+ optimization that only looks within this DocumentSource. For best
+ results, first coalesce compatible sources using coalesce().
+
+ This is intended for any operations that include expressions, and
+ provides a hook for those to optimize those operations.
+
+ The default implementation is to do nothing.
+ */
+ virtual void optimize();
+
+ /*
+ Add the DocumentSource to the array builder.
+
+ The default implementation calls sourceToBson() in order to
+ convert the inner part of the object which will be added to the
+ array being built here.
+
+ @param pBuilder the array builder to add the operation to.
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder) const;
+
+ protected:
+ /*
+ Create an object that represents the document source. The object
+ will have a single field whose name is the source's name. This
+ will be used by the default implementation of addToBsonArray()
+ to add this object to a pipeline being represented in BSON.
+
+ @param pBuilder a blank object builder to write to
+ */
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const = 0;
+
+ /*
+ Most DocumentSources have an underlying source they get their data
+ from. This is a convenience for them.
+
+ The default implementation of setSource() sets this; if you don't
+ need a source, override that to assert(). The default is to
+ assert() if this has already been set.
+ */
+ intrusive_ptr<DocumentSource> pSource;
+ };
+
+
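The eof()/getCurrent()/advance() trio defines a pull-style protocol. A consumer drains any source with a loop like this sketch (the same shape DocumentSourceFilterBase::findNext() uses later in this patch):

    #include "db/pipeline/document.h"
    #include "db/pipeline/document_source.h"

    namespace mongo {
        static void drain(const intrusive_ptr<DocumentSource> &pSource) {
            for (bool hasNext = !pSource->eof(); hasNext;
                 hasNext = pSource->advance()) {
                intrusive_ptr<Document> pCurrent(pSource->getCurrent());
                /* ... consume pCurrent ... */
            }
        }
    }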
+ class DocumentSourceBsonArray :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceBsonArray();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a BSON array.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ CAUTION: the BSON is not read until the source is used. Any
+ elements that appear after these documents must not be read until
+ this source is exhausted.
+
+ @param pBsonElement the BSON array to treat as a document source
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceBsonArray> create(
+ BSONElement *pBsonElement);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceBsonArray(BSONElement *pBsonElement);
+
+ BSONObj embeddedObject;
+ BSONObjIterator arrayIterator;
+ BSONElement currentElement;
+ bool haveCurrent;
+ };
+
+
+ class DocumentSourceCommandFutures :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCommandFutures();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /* convenient shorthand for a commonly used type */
+ typedef list<shared_ptr<Future::CommandResult> > FuturesList;
+
+ /*
+ Create a DocumentSource that wraps a list of Command::Futures.
+
+ @param errmsg place to write error messages to; must exist for the
+ lifetime of the created DocumentSourceCommandFutures
+ @param pList the list of futures
+ */
+ static intrusive_ptr<DocumentSourceCommandFutures> create(
+ string &errmsg, FuturesList *pList);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCommandFutures(string &errmsg, FuturesList *pList);
+
+ /*
+ Advance to the next document, setting pCurrent appropriately.
+
+ Adjusts pCurrent, pBsonSource, and iterator, as needed. On exit,
+ pCurrent is the Document to return, or NULL. If NULL, this
+ indicates there is nothing more to return.
+ */
+ void getNextDocument();
+
+ bool newSource; // set to true for the first item of a new source
+ intrusive_ptr<DocumentSourceBsonArray> pBsonSource;
+ intrusive_ptr<Document> pCurrent;
+ FuturesList::iterator iterator;
+ FuturesList::iterator listEnd;
+ string &errmsg;
+ };
+
+
+ class DocumentSourceCursor :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceCursor();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void setSource(const intrusive_ptr<DocumentSource> &pSource);
+
+ /*
+ Create a document source based on a cursor.
+
+ This is usually put at the beginning of a chain of document sources
+ in order to fetch data from the database.
+
+ @param pCursor the cursor to use to fetch data
+ */
+ static intrusive_ptr<DocumentSourceCursor> create(
+ const shared_ptr<Cursor> &pCursor);
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceCursor(const shared_ptr<Cursor> &pTheCursor);
+
+ void findNext();
+ shared_ptr<Cursor> pCursor;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ /*
+ This contains all the basic mechanics for filtering a stream of
+ Documents, except for the actual predicate evaluation itself. This was
+ factored out so we could create DocumentSources that use both Matcher
+ style predicates as well as full Expressions.
+ */
+ class DocumentSourceFilterBase :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilterBase();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ virtual void toMatcherBson(BSONObjBuilder *pBuilder) const = 0;
+
+ protected:
+ DocumentSourceFilterBase();
+
+ /*
+ Test the given document against the predicate and report if it
+ should be accepted or not.
+
+ @param pDocument the document to test
+ @returns true if the document matches the filter, false otherwise
+ */
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const = 0;
+
+ private:
+
+ void findNext();
+
+ bool unstarted;
+ bool hasNext;
+ intrusive_ptr<Document> pCurrent;
+ };
+
+
+ class DocumentSourceFilter :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceFilter();
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ virtual void optimize();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a filter.
+
+ @param pFilter the expression to use to filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSourceFilter> create(
+ const intrusive_ptr<Expression> &pFilter);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char filterName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceFilter(const intrusive_ptr<Expression> &pFilter);
+
+ intrusive_ptr<Expression> pFilter;
+ };
+
+
+ class DocumentSourceGroup :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceGroup();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new grouping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceGroup> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Set the Id Expression.
+
+ Documents that pass through the grouping DocumentSource are grouped
+ according to this key. This will generate the _id field in the
+ result documents.
+
+ @param pExpression the group key
+ */
+ void setIdExpression(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add an accumulator.
+
+ Accumulators become fields in the Documents that result from
+ grouping. Each unique group document must have its own
+ accumulator; the accumulator factory is used to create that.
+
+ @param fieldName the name the accumulator result will have in the
+ result documents
+ @param pAccumulatorFactory used to create the accumulator for the
+ group field
+ */
+ void addAccumulator(string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a grouping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+ a BSONElement that has been determined to be an Object with an
+ element named $group.
+
+ @param pBsonElement the BSONElement that defines the group
+ @param pCtx the expression context
+ @returns the grouping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ /*
+ Create a unifying group that can be used to combine group results
+ from shards.
+
+ @returns the grouping DocumentSource
+ */
+ intrusive_ptr<DocumentSource> createMerger();
+
+ static const char groupName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceGroup(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+ the underlying source and group it. populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+
+ intrusive_ptr<Expression> pIdExpression;
+
+ typedef boost::unordered_map<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> >, Value::Hash> GroupsType;
+ GroupsType groups;
+
+ /*
+ The field names for the result documents and the accumulator
+ factories for the result documents. The Expressions are the
+ common expressions used by each instance of each accumulator
+ in order to find the right-hand side of what gets added to the
+ accumulator. Note that each of those is the same for each group,
+ so we can share them across all groups by adding them to the
+ accumulators after we use the factories to make a new set of
+ accumulators for each new group.
+
+ These three vectors parallel each other.
+ */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Accumulator> (*)(
+ const intrusive_ptr<ExpressionContext> &)> vpAccumulatorFactory;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+
+ intrusive_ptr<Document> makeDocument(
+ const GroupsType::iterator &rIter);
+
+ GroupsType::iterator groupsIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
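A sketch of assembling the equivalent of { $group: { _id: "$city", total: { $sum: "$population" } } } through the methods above. ExpressionFieldPath::create and AccumulatorSum::create are assumptions here: they live in other files of this patch (expression.h, accumulator.h), and are taken to match the factory signature addAccumulator() expects.

    #include "db/pipeline/accumulator.h"
    #include "db/pipeline/document_source.h"
    #include "db/pipeline/expression.h"

    namespace mongo {
        intrusive_ptr<DocumentSource> makeGroup(
            const intrusive_ptr<ExpressionContext> &pCtx) {
            intrusive_ptr<DocumentSourceGroup> pGroup(
                DocumentSourceGroup::create(pCtx));

            /* the group key becomes the _id of each result document */
            pGroup->setIdExpression(ExpressionFieldPath::create("city"));

            /* total: { $sum: "$population" } */
            pGroup->addAccumulator("total", AccumulatorSum::create,
                                   ExpressionFieldPath::create("population"));

            return pGroup;
        }
    }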
+ class DocumentSourceMatch :
+ public DocumentSourceFilterBase {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceMatch();
+
+ /*
+ Create a filter.
+
+ @param pBsonElement the raw BSON specification for the filter
+ @returns the filter
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a BSONObj suitable for Matcher construction.
+
+ This is used after filter analysis has moved as many filters to
+ as early a point as possible in the document processing pipeline.
+ See db/Matcher.h and the associated wiki documentation for the
+ format. This conversion is used to move back to the low-level
+ find() Cursor mechanism.
+
+ @param pBuilder the builder to write to
+ */
+ void toMatcherBson(BSONObjBuilder *pBuilder) const;
+
+ static const char matchName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ // virtuals from DocumentSourceFilterBase
+ virtual bool accept(const intrusive_ptr<Document> &pDocument) const;
+
+ private:
+ DocumentSourceMatch(const BSONObj &query);
+
+ Matcher matcher;
+ };
+
+
+ class DocumentSourceOut :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceOut();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a document source for output and pass-through.
+
+ This can be put anywhere in a pipeline and will store content as
+ well as pass it on.
+
+ @returns the newly created document source
+ */
+ static intrusive_ptr<DocumentSourceOut> createFromBson(
+ BSONElement *pBsonElement);
+
+ static const char outName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceOut(BSONElement *pBsonElement);
+ };
+
+
+ class DocumentSourceProject :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceProject> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceProject();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ virtual void optimize();
+
+ /*
+ Create a new DocumentSource that can implement projection.
+
+ @returns the projection DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceProject> create();
+
+ /*
+ Include a field path in a projection.
+
+ @param fieldPath the path of the field to include
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Exclude a field path from the projection.
+
+ @param fieldPath the path of the field to exclude
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Add an output Expression in the projection.
+
+ BSON document fields are ordered, so the new field will be
+ appended to the existing set.
+
+ @param fieldName the name of the field as it will appear
+ @param pExpression the expression used to compute the field
+ */
+ void addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Create a new projection DocumentSource from BSON.
+
+ This is a convenience for directly handling BSON, and relies on the
+ above methods.
+
+ @param pBsonElement the BSONElement with an object named $project
+ @returns the created projection
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char projectName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceProject();
+
+ // configuration state
+ bool excludeId;
+ intrusive_ptr<ExpressionObject> pEO;
+ };
+
+
+ class DocumentSourceSort :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSort();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+ /*
+ TODO
+ Adjacent sorts should reduce to the last sort.
+ virtual bool coalesce(const intrusive_ptr<DocumentSource> &pNextSource);
+ */
+
+ /*
+ Create a new sorting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSort> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Add sort key field.
+
+ Adds a sort key field to the key being built up. A concatenated
+ key is built up by calling this repeatedly.
+
+ @param fieldPath the field path to the key component
+ @param ascending if true, use the key for an ascending sort,
+ otherwise, use it for descending
+ */
+ void addKey(const string &fieldPath, bool ascending);
+
+ /*
+ Write out an object whose contents are the sort key.
+
+ @param pBuilder initialized object builder.
+ @param fieldPrefix specify whether or not to include the field prefix
+ */
+ void sortKeyToBson(BSONObjBuilder *pBuilder, bool usePrefix) const;
+
+ /*
+ Create a sorting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+ a BSONElement that has been determined to be an Object with an
+ element named $sort.
+
+ @param pBsonElement the BSONElement that defines the sort
+ @param pCtx the expression context
+ @returns the sorting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char sortName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSort(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Before returning anything, this source must fetch everything from
+ the underlying source and sort it. populate() is used to do that
+ on the first call to any method on this source. The populated
+ boolean indicates that this has been done.
+ */
+ void populate();
+ bool populated;
+ long long count;
+
+ /* these two parallel each other */
+ vector<intrusive_ptr<ExpressionFieldPath> > vSortKey;
+ vector<bool> vAscending;
+
+ class Carrier {
+ public:
+ /*
+ We need access to the key for compares, so we have to carry
+ this around.
+ */
+ DocumentSourceSort *pSort;
+
+ intrusive_ptr<Document> pDocument;
+
+ Carrier(DocumentSourceSort *pSort,
+ const intrusive_ptr<Document> &pDocument);
+
+ static bool lessThan(const Carrier &rL, const Carrier &rR);
+ };
+
+ /*
+ Compare two documents according to the specified sort key.
+
+ @param pL reference to the left document
+ @param pR reference to the right document
+ @returns a number less than, equal to, or greater than zero,
+ indicating pL < pR, pL == pR, or pL > pR, respectively
+ */
+ int compare(const intrusive_ptr<Document> &pL,
+ const intrusive_ptr<Document> &pR);
+
+ typedef list<Carrier> ListType;
+ ListType documents;
+
+ ListType::iterator listIterator;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
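Building a sort stage with the public methods above; this sketch is the programmatic equivalent of { $sort: { age: 1, name: -1 } }.

    #include "db/pipeline/document_source.h"

    namespace mongo {
        intrusive_ptr<DocumentSource> makeSort(
            const intrusive_ptr<ExpressionContext> &pCtx) {
            intrusive_ptr<DocumentSourceSort> pSort(
                DocumentSourceSort::create(pCtx));
            pSort->addKey("age", true);   /* ascending */
            pSort->addKey("name", false); /* descending */
            return pSort;
        }
    }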
+ class DocumentSourceLimit :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceLimit();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new limiting DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceLimit> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a limiting DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+ a BSONElement that has been determined to be an Object with an
+ element named $limit.
+
+ @param pBsonElement the BSONElement that defines the limit
+ @param pCtx the expression context
+ @returns the limiting DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char limitName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ long long limit;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+ class DocumentSourceSkip :
+ public DocumentSource {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceSkip();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new skipping DocumentSource.
+
+ @param pCtx the expression context
+ @returns the DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceSkip> create(
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Create a skipping DocumentSource from BSON.
+
+ This is a convenience method that uses the above, and operates on
+ a BSONElement that has been determined to be an Object with an
+ element named $skip.
+
+ @param pBsonElement the BSONElement that defines the skip
+ @param pCtx the expression context
+ @returns the skipping DocumentSource
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+
+ static const char skipName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pCtx);
+
+ /*
+ Skips initial documents.
+ */
+ void skipper();
+
+ long long skip;
+ long long count;
+ intrusive_ptr<Document> pCurrent;
+
+ intrusive_ptr<ExpressionContext> pCtx;
+ };
+
+
+ class DocumentSourceUnwind :
+ public DocumentSource,
+ public boost::enable_shared_from_this<DocumentSourceUnwind> {
+ public:
+ // virtuals from DocumentSource
+ virtual ~DocumentSourceUnwind();
+ virtual bool eof();
+ virtual bool advance();
+ virtual intrusive_ptr<Document> getCurrent();
+
+ /*
+ Create a new DocumentSource that can implement unwind.
+
+ @returns the unwind DocumentSource
+ */
+ static intrusive_ptr<DocumentSourceUnwind> create();
+
+ /*
+ Specify the field to unwind. There must be exactly one before
+ the pipeline begins execution.
+
+ @param rFieldPath - path to the field to unwind
+ */
+ void unwindField(const FieldPath &rFieldPath);
+
+ /*
+ Create a new unwind DocumentSource from BSON.
+
+ This is a convenience for directly handling BSON, and relies on the
+ above methods.
+
+ @param pBsonElement the BSONElement with an object named $unwind
+ @returns the created unwind source
+ */
+ static intrusive_ptr<DocumentSource> createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx);
+
+ static const char unwindName[];
+
+ protected:
+ // virtuals from DocumentSource
+ virtual void sourceToBson(BSONObjBuilder *pBuilder) const;
+
+ private:
+ DocumentSourceUnwind();
+
+ // configuration state
+ FieldPath unwindPath;
+
+ vector<int> fieldIndex; /* for the current document, the indices
+ leading down to the field being unwound */
+
+ // iteration state
+ intrusive_ptr<Document> pNoUnwindDocument;
+ // document to return, pre-unwind
+ intrusive_ptr<const Value> pUnwindArray; // field being unwound
+ intrusive_ptr<ValueIterator> pUnwinder; // iterator used for unwinding
+ intrusive_ptr<const Value> pUnwindValue; // current value
+
+ /*
+ Clear all the state related to unwinding an array.
+ */
+ void resetArray();
+
+ /*
+ Clone the current document being unwound.
+
+ This is a partial deep clone. Because we're going to replace the
+ value at the end, we have to replace everything along the path
+ leading to that in order to not share that change with any other
+ clones (or the original) that we've made.
+
+ This expects pUnwindValue to have been set by a prior call to
+ advance(). However, pUnwindValue may also be NULL, in which case
+ the field will be removed -- this is the action for an empty
+ array.
+
+ @returns a partial deep clone of pNoUnwindDocument
+ */
+ intrusive_ptr<Document> clonePath() const;
+
+ };
+
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void DocumentSourceGroup::setIdExpression(
+ const intrusive_ptr<Expression> &pExpression) {
+ pIdExpression = pExpression;
+ }
+
+ inline void DocumentSourceUnwind::resetArray() {
+ pNoUnwindDocument.reset();
+ pUnwindArray.reset();
+ pUnwinder.reset();
+ pUnwindValue.reset();
+ }
+
+ inline DocumentSourceSort::Carrier::Carrier(
+ DocumentSourceSort *pTheSort,
+ const intrusive_ptr<Document> &pTheDocument):
+ pSort(pTheSort),
+ pDocument(pTheDocument) {
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_bson_array.cpp b/src/mongo/db/pipeline/document_source_bson_array.cpp new file mode 100755 index 00000000000..5d187b03ef9 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_bson_array.cpp @@ -0,0 +1,83 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/pipeline/document.h"
+
+namespace mongo {
+
+ DocumentSourceBsonArray::~DocumentSourceBsonArray() {
+ }
+
+ bool DocumentSourceBsonArray::eof() {
+ return !haveCurrent;
+ }
+
+ bool DocumentSourceBsonArray::advance() {
+ if (eof())
+ return false;
+
+ if (!arrayIterator.more()) {
+ haveCurrent = false;
+ return false;
+ }
+
+ currentElement = arrayIterator.next();
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceBsonArray::getCurrent() {
+ assert(haveCurrent);
+ BSONObj documentObj(currentElement.Obj());
+ intrusive_ptr<Document> pDocument(
+ Document::createFromBsonObj(&documentObj));
+ return pDocument;
+ }
+
+ void DocumentSourceBsonArray::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ DocumentSourceBsonArray::DocumentSourceBsonArray(
+ BSONElement *pBsonElement):
+ embeddedObject(pBsonElement->embeddedObject()),
+ arrayIterator(embeddedObject),
+ haveCurrent(false) {
+ if (arrayIterator.more()) {
+ currentElement = arrayIterator.next();
+ haveCurrent = true;
+ }
+ }
+
+ intrusive_ptr<DocumentSourceBsonArray> DocumentSourceBsonArray::create(
+ BSONElement *pBsonElement) {
+
+ assert(pBsonElement->type() == Array);
+ intrusive_ptr<DocumentSourceBsonArray> pSource(
+ new DocumentSourceBsonArray(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceBsonArray::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // this has no analog in the BSON world
+ }
+}
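A sketch of driving this source directly, assuming the BSON()/BSON_ARRAY() construction macros from db/jsobj.h. Per the caution in the header, the wrapper object owns the array's storage and must outlive the source.

    #include "db/jsobj.h"
    #include "db/pipeline/document.h"
    #include "db/pipeline/document_source.h"

    namespace mongo {
        void bsonArraySourceSketch() {
            /* wrap [ { a : 1 }, { a : 2 } ] in an owning object */
            BSONObj wrapper(BSON("docs" << BSON_ARRAY(
                                     BSON("a" << 1) << BSON("a" << 2))));
            BSONElement arrayElement(wrapper.firstElement());

            intrusive_ptr<DocumentSourceBsonArray> pSource(
                DocumentSourceBsonArray::create(&arrayElement));

            while (!pSource->eof()) {
                intrusive_ptr<Document> pDoc(pSource->getCurrent());
                /* ... use pDoc ... */
                pSource->advance();
            }
        }
    }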
diff --git a/src/mongo/db/pipeline/document_source_command_futures.cpp b/src/mongo/db/pipeline/document_source_command_futures.cpp new file mode 100755 index 00000000000..61a257cf16f --- /dev/null +++ b/src/mongo/db/pipeline/document_source_command_futures.cpp @@ -0,0 +1,132 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+namespace mongo {
+
+ DocumentSourceCommandFutures::~DocumentSourceCommandFutures() {
+ }
+
+ bool DocumentSourceCommandFutures::eof() {
+ /* if we haven't even started yet, do so */
+ if (!pCurrent.get())
+ getNextDocument();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceCommandFutures::advance() {
+ if (eof())
+ return false;
+
+ /* advance */
+ getNextDocument();
+
+ return (pCurrent.get() != NULL);
+ }
+
+ intrusive_ptr<Document> DocumentSourceCommandFutures::getCurrent() {
+ assert(!eof());
+ return pCurrent;
+ }
+
+ void DocumentSourceCommandFutures::setSource(
+ const intrusive_ptr<DocumentSource> &pSource) {
+ /* this doesn't take a source */
+ assert(false);
+ }
+
+ void DocumentSourceCommandFutures::sourceToBson(
+ BSONObjBuilder *pBuilder) const {
+ /* this has no BSON equivalent */
+ assert(false);
+ }
+
+ DocumentSourceCommandFutures::DocumentSourceCommandFutures(
+ string &theErrmsg, FuturesList *pList):
+ newSource(false),
+ pBsonSource(),
+ pCurrent(),
+ iterator(pList->begin()),
+ listEnd(pList->end()),
+ errmsg(theErrmsg) {
+ }
+
+ intrusive_ptr<DocumentSourceCommandFutures>
+ DocumentSourceCommandFutures::create(
+ string &errmsg, FuturesList *pList) {
+ intrusive_ptr<DocumentSourceCommandFutures> pSource(
+ new DocumentSourceCommandFutures(errmsg, pList));
+ return pSource;
+ }
+
+ void DocumentSourceCommandFutures::getNextDocument() {
+ while(true) {
+ if (!pBsonSource.get()) {
+ /* if there aren't any more futures, we're done */
+ if (iterator == listEnd) {
+ pCurrent.reset();
+ return;
+ }
+
+ /* grab the next command result */
+ shared_ptr<Future::CommandResult> pResult(*iterator);
+ ++iterator;
+
+ /* try to wait for it */
+ if (!pResult->join()) {
+ error() << "sharded pipeline failed on shard: " <<
+ pResult->getServer() << " error: " <<
+ pResult->result() << endl;
+ errmsg += "-- mongod pipeline failed: ";
+ errmsg += pResult->result().toString();
+
+ /* move on to the next command future */
+ continue;
+ }
+
+ /* grab the result array out of the shard server's response */
+ BSONObj shardResult(pResult->result());
+ BSONObjIterator objIterator(shardResult);
+ while(objIterator.more()) {
+ BSONElement element(objIterator.next());
+ const char *pFieldName = element.fieldName();
+
+ /* find the result array and quit this loop */
+ if (strcmp(pFieldName, "result") == 0) {
+ pBsonSource = DocumentSourceBsonArray::create(&element);
+ newSource = true;
+ break;
+ }
+ }
+ }
+
+ /* if we're done with this shard's results, try the next */
+ if (pBsonSource->eof() ||
+ (!newSource && !pBsonSource->advance())) {
+ pBsonSource.reset();
+ continue;
+ }
+
+ pCurrent = pBsonSource->getCurrent();
+ newSource = false;
+ return;
+ }
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_filter.cpp b/src/mongo/db/pipeline/document_source_filter.cpp new file mode 100755 index 00000000000..66e57ba2e93 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_filter.cpp @@ -0,0 +1,98 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceFilter::filterName[] = "$filter";
+
+ DocumentSourceFilter::~DocumentSourceFilter() {
+ }
+
+ bool DocumentSourceFilter::coalesce(
+ const intrusive_ptr<DocumentSource> &pNextSource) {
+
+ /* we only know how to coalesce other filters */
+ DocumentSourceFilter *pDocFilter =
+ dynamic_cast<DocumentSourceFilter *>(pNextSource.get());
+ if (!pDocFilter)
+ return false;
+
+ /*
+ Two adjacent filters can be combined by creating a conjunction of
+ their predicates.
+ */
+ intrusive_ptr<ExpressionNary> pAnd(ExpressionAnd::create());
+ pAnd->addOperand(pFilter);
+ pAnd->addOperand(pDocFilter->pFilter);
+ pFilter = pAnd;
+
+ return true;
+ }
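+
+ /*
+ Illustrative sketch (editor's addition): coalescing two adjacent
+ filters leaves one stage whose predicate is the AND of both, so the
+ pipeline makes a single filtering pass. pPredA and pPredB stand in
+ for already-parsed predicate expressions.
+ */
+#if 0
+ static void exampleCoalesceFilters(
+ const intrusive_ptr<Expression> &pPredA,
+ const intrusive_ptr<Expression> &pPredB) {
+ intrusive_ptr<DocumentSourceFilter> pFirst(
+ DocumentSourceFilter::create(pPredA));
+ intrusive_ptr<DocumentSourceFilter> pSecond(
+ DocumentSourceFilter::create(pPredB));
+
+ /* on success, pFirst now filters on (pPredA AND pPredB) */
+ bool merged = pFirst->coalesce(pSecond);
+ assert(merged);
+ }
+#endif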
+
+ void DocumentSourceFilter::optimize() {
+ pFilter = pFilter->optimize();
+ }
+
+ void DocumentSourceFilter::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pFilter->addToBsonObj(pBuilder, filterName, 0);
+ }
+
+ bool DocumentSourceFilter::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+ intrusive_ptr<const Value> pValue(pFilter->evaluate(pDocument));
+ return pValue->coerceToBool();
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceFilter::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15946, "a document filter expression must be an object",
+ pBsonElement->type() == Object);
+
+ Expression::ObjectCtx oCtx(0);
+ intrusive_ptr<Expression> pExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ intrusive_ptr<DocumentSourceFilter> pFilter(
+ DocumentSourceFilter::create(pExpression));
+
+ return pFilter;
+ }
+
+ intrusive_ptr<DocumentSourceFilter> DocumentSourceFilter::create(
+ const intrusive_ptr<Expression> &pFilter) {
+ intrusive_ptr<DocumentSourceFilter> pSource(
+ new DocumentSourceFilter(pFilter));
+ return pSource;
+ }
+
+ DocumentSourceFilter::DocumentSourceFilter(
+ const intrusive_ptr<Expression> &pTheFilter):
+ DocumentSourceFilterBase(),
+ pFilter(pTheFilter) {
+ }
+
+ void DocumentSourceFilter::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ pFilter->toMatcherBson(pBuilder, 0);
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_filter_base.cpp b/src/mongo/db/pipeline/document_source_filter_base.cpp new file mode 100755 index 00000000000..dbda34b7151 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_filter_base.cpp @@ -0,0 +1,85 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ DocumentSourceFilterBase::~DocumentSourceFilterBase() {
+ }
+
+ void DocumentSourceFilterBase::findNext() {
+ /* only do this the first time */
+ if (unstarted) {
+ hasNext = !pSource->eof();
+ unstarted = false;
+ }
+
+ while(hasNext) {
+ boost::intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ hasNext = pSource->advance();
+
+ if (accept(pDocument)) {
+ pCurrent = pDocument;
+ return;
+ }
+ }
+
+ pCurrent.reset();
+ }
+
+ bool DocumentSourceFilterBase::eof() {
+ if (unstarted)
+ findNext();
+
+ return (pCurrent.get() == NULL);
+ }
+
+ bool DocumentSourceFilterBase::advance() {
+ if (unstarted)
+ findNext();
+
+ /*
+ This looks weird after the above, but is correct. Note that calling
+ getCurrent() when first starting already yields the first document
+ in the collection. Calling advance() without using getCurrent()
+ first will skip over the first item.
+ */
+ findNext();
+
+ return (pCurrent.get() != NULL);
+ }
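+
+ /*
+ Illustrative sketch (editor's addition) of the cursor protocol the
+ comment above describes: getCurrent() is already valid before the
+ first advance(), so a loop must read before it advances or it will
+ drop the first accepted document.
+ */
+#if 0
+ static void exampleIterateFilter(DocumentSourceFilterBase *pFilter) {
+ for (; !pFilter->eof(); pFilter->advance()) {
+ boost::intrusive_ptr<Document> pDoc(pFilter->getCurrent());
+ /* ... use pDoc ... */
+ }
+ }
+#endif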
+
+ boost::intrusive_ptr<Document> DocumentSourceFilterBase::getCurrent() {
+ if (unstarted)
+ findNext();
+
+ assert(pCurrent.get() != NULL);
+ return pCurrent;
+ }
+
+ DocumentSourceFilterBase::DocumentSourceFilterBase():
+ unstarted(true),
+ hasNext(false),
+ pCurrent() {
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_group.cpp b/src/mongo/db/pipeline/document_source_group.cpp new file mode 100755 index 00000000000..244561589da --- /dev/null +++ b/src/mongo/db/pipeline/document_source_group.cpp @@ -0,0 +1,391 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/accumulator.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceGroup::groupName[] = "$group";
+
+ DocumentSourceGroup::~DocumentSourceGroup() {
+ }
+
+ bool DocumentSourceGroup::eof() {
+ if (!populated)
+ populate();
+
+ return (groupsIterator == groups.end());
+ }
+
+ bool DocumentSourceGroup::advance() {
+ if (!populated)
+ populate();
+
+ assert(groupsIterator != groups.end());
+
+ ++groupsIterator;
+ if (groupsIterator == groups.end()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = makeDocument(groupsIterator);
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceGroup::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+
+ /* add the _id */
+ pIdExpression->addToBsonObj(&insides, Document::idName.c_str(), 0);
+
+ /* add the remaining fields */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pA((*vpAccumulatorFactory[i])(pCtx));
+ pA->addOperand(vpExpression[i]);
+ pA->addToBsonObj(&insides, vFieldName[i], 0);
+ }
+
+ pBuilder->append(groupName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceGroup> DocumentSourceGroup::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceGroup> pSource(
+ new DocumentSourceGroup(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceGroup::DocumentSourceGroup(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pIdExpression(),
+ groups(),
+ vFieldName(),
+ vpAccumulatorFactory(),
+ vpExpression(),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceGroup::addAccumulator(
+ string fieldName,
+ intrusive_ptr<Accumulator> (*pAccumulatorFactory)(
+ const intrusive_ptr<ExpressionContext> &),
+ const intrusive_ptr<Expression> &pExpression) {
+ vFieldName.push_back(fieldName);
+ vpAccumulatorFactory.push_back(pAccumulatorFactory);
+ vpExpression.push_back(pExpression);
+ }
+
+
+ struct GroupOpDesc {
+ const char *pName;
+ intrusive_ptr<Accumulator> (*pFactory)(
+ const intrusive_ptr<ExpressionContext> &);
+ };
+
+ static int GroupOpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const GroupOpDesc *)pL)->pName,
+ ((const GroupOpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ GroupOpDescCmp() above.
+ */
+ static const GroupOpDesc GroupOpTable[] = {
+ {"$addToSet", AccumulatorAddToSet::create},
+ {"$avg", AccumulatorAvg::create},
+ {"$first", AccumulatorFirst::create},
+ {"$last", AccumulatorLast::create},
+ {"$max", AccumulatorMinMax::createMax},
+ {"$min", AccumulatorMinMax::createMin},
+ {"$push", AccumulatorPush::create},
+ {"$sum", AccumulatorSum::create},
+ };
+
+ static const size_t NGroupOp = sizeof(GroupOpTable)/sizeof(GroupOpTable[0]);
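+
+ /*
+ Illustrative sketch (editor's addition): resolving an accumulator
+ name against the sorted table above, mirroring the bsearch() call
+ in createFromBson() below. A NULL result means the name is not a
+ known group operator.
+ */
+#if 0
+ static const GroupOpDesc *exampleLookupGroupOp(const char *pName) {
+ GroupOpDesc key;
+ key.pName = pName;
+ return (const GroupOpDesc *)bsearch(
+ &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc),
+ GroupOpDescCmp);
+ }
+#endif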
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15947, "a group's fields must be specified in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceGroup> pGroup(
+ DocumentSourceGroup::create(pCtx));
+ bool idSet = false;
+
+ BSONObj groupObj(pBsonElement->Obj());
+ BSONObjIterator groupIterator(groupObj);
+ while(groupIterator.more()) {
+ BSONElement groupField(groupIterator.next());
+ const char *pFieldName = groupField.fieldName();
+
+ if (strcmp(pFieldName, Document::idName.c_str()) == 0) {
+ uassert(15948, "a group's _id may only be specified once",
+ !idSet);
+
+ BSONType groupType = groupField.type();
+
+ if (groupType == Object) {
+ /*
+ Use the projection-like set of field paths to create the
+ group-by key.
+ */
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pId(
+ Expression::parseObject(&groupField, &oCtx));
+
+ pGroup->setIdExpression(pId);
+ idSet = true;
+ }
+ else if (groupType == String) {
+ string groupString(groupField.String());
+ const char *pGroupString = groupString.c_str();
+ if ((groupString.length() == 0) ||
+ (pGroupString[0] != '$'))
+ goto StringConstantId;
+
+ string pathString(
+ Expression::removeFieldPrefix(groupString));
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ ExpressionFieldPath::create(pathString));
+ pGroup->setIdExpression(pFieldPath);
+ idSet = true;
+ }
+ else {
+ /* pick out the constant types that are allowed */
+ switch(groupType) {
+ case NumberDouble:
+ case String:
+ case Object:
+ case Array:
+ case jstOID:
+ case Bool:
+ case Date:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ StringConstantId: // from string case above
+ {
+ intrusive_ptr<const Value> pValue(
+ Value::createFromBsonElement(&groupField));
+ intrusive_ptr<ExpressionConstant> pConstant(
+ ExpressionConstant::create(pValue));
+ pGroup->setIdExpression(pConstant);
+ idSet = true;
+ break;
+ }
+
+ default:
+ uassert(15949, str::stream() <<
+ "a group's _id may not include fields of BSON type " << groupType,
+ false);
+ }
+ }
+ }
+ else {
+ /*
+ Treat as a projection field with the additional ability to
+ add aggregation operators.
+ */
+ uassert(15950, str::stream() <<
+ "the group aggregate field name " <<
+ pFieldName << " cannot be an operator name",
+ *pFieldName != '$');
+
+ uassert(15951, str::stream() <<
+ "the group aggregate field " << pFieldName <<
+ " must be defined as an expression inside an object",
+ groupField.type() == Object);
+
+ BSONObj subField(groupField.Obj());
+ BSONObjIterator subIterator(subField);
+ size_t subCount = 0;
+ for(; subIterator.more(); ++subCount) {
+ BSONElement subElement(subIterator.next());
+
+ /* look for the specified operator */
+ GroupOpDesc key;
+ key.pName = subElement.fieldName();
+ const GroupOpDesc *pOp =
+ (const GroupOpDesc *)bsearch(
+ &key, GroupOpTable, NGroupOp, sizeof(GroupOpDesc),
+ GroupOpDescCmp);
+
+ uassert(15952, str::stream() <<
+ "unknown group operator \"" <<
+ key.pName << "\"",
+ pOp);
+
+ intrusive_ptr<Expression> pGroupExpr;
+
+ BSONType elementType = subElement.type();
+ if (elementType == Object) {
+ Expression::ObjectCtx oCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ pGroupExpr = Expression::parseObject(
+ &subElement, &oCtx);
+ }
+ else if (elementType == Array) {
+ uassert(15953, str::stream() <<
+ "aggregating group operators are unary (" <<
+ key.pName << ")", false);
+ }
+ else { /* assume it's an atomic single operand */
+ pGroupExpr = Expression::parseOperand(&subElement);
+ }
+
+ pGroup->addAccumulator(
+ pFieldName, pOp->pFactory, pGroupExpr);
+ }
+
+ uassert(15954, str::stream() <<
+ "the computed aggregate \"" <<
+ pFieldName << "\" must specify exactly one operator",
+ subCount == 1);
+ }
+ }
+
+ uassert(15955, "a group specification must include an _id", idSet);
+
+ return pGroup;
+ }
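+
+ /*
+ Illustrative example (editor's addition): a specification the parser
+ above accepts. _id supplies the group key expression; every other
+ field must be an object naming exactly one operator from GroupOpTable:
+
+ {$group: {
+ _id: "$state",
+ total: {$sum: "$amount"},
+ count: {$sum: 1}
+ }}
+ */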
+
+ void DocumentSourceGroup::populate() {
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+
+ /* get the _id document */
+ intrusive_ptr<const Value> pId(pIdExpression->evaluate(pDocument));
+ uassert(15956, "the _id field for a group must not be undefined",
+ pId->getType() != Undefined);
+
+ /*
+ Look for the _id value in the map; if it's not there, add a
+ new entry with a blank accumulator.
+ */
+ vector<intrusive_ptr<Accumulator> > *pGroup;
+ GroupsType::iterator it(groups.find(pId));
+ if (it != groups.end()) {
+ /* point at the existing accumulators */
+ pGroup = &it->second;
+ }
+ else {
+ /* insert a new group into the map */
+ groups.insert(it,
+ pair<intrusive_ptr<const Value>,
+ vector<intrusive_ptr<Accumulator> > >(
+ pId, vector<intrusive_ptr<Accumulator> >()));
+
+ /* find the accumulator vector (the map value) */
+ it = groups.find(pId);
+ pGroup = &it->second;
+
+ /* add the accumulators */
+ const size_t n = vpAccumulatorFactory.size();
+ pGroup->reserve(n);
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Accumulator> pAccumulator(
+ (*vpAccumulatorFactory[i])(pCtx));
+ pAccumulator->addOperand(vpExpression[i]);
+ pGroup->push_back(pAccumulator);
+ }
+ }
+
+ /* point at the existing key */
+ // unneeded atm // pId = it.first;
+
+ /* tickle all the accumulators for the group we found */
+ const size_t n = pGroup->size();
+ for(size_t i = 0; i < n; ++i)
+ (*pGroup)[i]->evaluate(pDocument);
+ }
+
+ /* start the group iterator */
+ groupsIterator = groups.begin();
+ if (groupsIterator != groups.end())
+ pCurrent = makeDocument(groupsIterator);
+ populated = true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceGroup::makeDocument(
+ const GroupsType::iterator &rIter) {
+ vector<intrusive_ptr<Accumulator> > *pGroup = &rIter->second;
+ const size_t n = vFieldName.size();
+ intrusive_ptr<Document> pResult(Document::create(1 + n));
+
+ /* add the _id field */
+ pResult->addField(Document::idName, rIter->first);
+
+ /* add the rest of the fields */
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue((*pGroup)[i]->getValue());
+ if (pValue->getType() != Undefined)
+ pResult->addField(vFieldName[i], pValue);
+ }
+
+ return pResult;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceGroup::createMerger() {
+ intrusive_ptr<DocumentSourceGroup> pMerger(
+ DocumentSourceGroup::create(pCtx));
+
+ /* the merger will use the same grouping key */
+ pMerger->setIdExpression(ExpressionFieldPath::create(
+ Document::idName.c_str()));
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ /*
+ The merger's output field names will be the same, as will the
+ accumulator factories. However, for some accumulators, the
+ expression to be accumulated will be different. The original
+ accumulator may be collecting an expression based on a field
+ expression or constant. Here, we accumulate the output of the
+ same name from the prior group.
+ */
+ pMerger->addAccumulator(
+ vFieldName[i], vpAccumulatorFactory[i],
+ ExpressionFieldPath::create(vFieldName[i]));
+ }
+
+ return pMerger;
+ }
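+
+ /*
+ Illustrative example (editor's addition): for a sharded
+ {$group: {_id: "$state", total: {$sum: "$amount"}}}, each shard runs
+ that stage over its own documents, and the merger built here runs
+ {$group: {_id: "$_id", total: {$sum: "$total"}}} over the shards'
+ partial results. Re-running the same accumulator over partial
+ results is what this code does; it is exact for operators like $sum.
+ */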
+}
+
+
diff --git a/src/mongo/db/pipeline/document_source_limit.cpp b/src/mongo/db/pipeline/document_source_limit.cpp new file mode 100644 index 00000000000..a73d4da2005 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_limit.cpp @@ -0,0 +1,83 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceLimit::limitName[] = "$limit";
+
+ DocumentSourceLimit::DocumentSourceLimit(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ limit(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceLimit::~DocumentSourceLimit() {
+ }
+
+ bool DocumentSourceLimit::eof() {
+ return pSource->eof() || count >= limit;
+ }
+
+ bool DocumentSourceLimit::advance() {
+ ++count;
+ if (count >= limit) {
+ pCurrent.reset();
+ return false;
+ }
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceLimit::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ void DocumentSourceLimit::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$limit", limit);
+ }
+
+ intrusive_ptr<DocumentSourceLimit> DocumentSourceLimit::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceLimit> pSource(
+ new DocumentSourceLimit(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceLimit::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15957, "the limit must be specified as a number",
+ pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceLimit> pLimit(
+ DocumentSourceLimit::create(pCtx));
+
+ pLimit->limit = (int)pBsonElement->numberLong();
+ uassert(15958, "the limit must be positive",
+ pLimit->limit > 0);
+
+ return pLimit;
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_match.cpp b/src/mongo/db/pipeline/document_source_match.cpp new file mode 100755 index 00000000000..bedac3ef717 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_match.cpp @@ -0,0 +1,80 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/matcher.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+
+namespace mongo {
+
+ const char DocumentSourceMatch::matchName[] = "$match";
+
+ DocumentSourceMatch::~DocumentSourceMatch() {
+ }
+
+ void DocumentSourceMatch::sourceToBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->append(matchName, *pQuery);
+ }
+
+ bool DocumentSourceMatch::accept(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ /*
+ The matcher only takes BSON documents, so we have to make one.
+
+ LATER
+ We could optimize this by making a document with only the
+ fields referenced by the Matcher. We could do this by looking inside
+ the Matcher's BSON before it is created, and recording those. The
+ easiest implementation might be to hold onto an ExpressionDocument
+ in here, and give that pDocument to create the created subset of
+ fields, and then convert that instead.
+ */
+ BSONObjBuilder objBuilder;
+ pDocument->toBson(&objBuilder);
+ BSONObj obj(objBuilder.done());
+
+ return matcher.matches(obj);
+ }
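+
+ /*
+ Illustrative example (editor's addition): a $match stage wraps an
+ ordinary query document, e.g.
+
+ {$match: {status: "A", amount: {$gt: 100}}}
+
+ accept() re-serializes each pipeline Document to BSON so the
+ existing Matcher can evaluate that query against it.
+ */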
+
+ intrusive_ptr<DocumentSource> DocumentSourceMatch::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15959, "the match filter must be an expression in an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceMatch> pMatcher(
+ new DocumentSourceMatch(pBsonElement->Obj()));
+
+ return pMatcher;
+ }
+
+ void DocumentSourceMatch::toMatcherBson(BSONObjBuilder *pBuilder) const {
+ const BSONObj *pQuery = matcher.getQuery();
+ pBuilder->appendElements(*pQuery);
+ }
+
+ DocumentSourceMatch::DocumentSourceMatch(const BSONObj &query):
+ DocumentSourceFilterBase(),
+ matcher(query) {
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_out.cpp b/src/mongo/db/pipeline/document_source_out.cpp new file mode 100755 index 00000000000..5a30342d25c --- /dev/null +++ b/src/mongo/db/pipeline/document_source_out.cpp @@ -0,0 +1,56 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+
+namespace mongo {
+
+ const char DocumentSourceOut::outName[] = "$out";
+
+ DocumentSourceOut::~DocumentSourceOut() {
+ }
+
+ bool DocumentSourceOut::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceOut::advance() {
+ return pSource->advance();
+ }
+
+ boost::intrusive_ptr<Document> DocumentSourceOut::getCurrent() {
+ return pSource->getCurrent();
+ }
+
+ DocumentSourceOut::DocumentSourceOut(BSONElement *pBsonElement) {
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<DocumentSourceOut> DocumentSourceOut::createFromBson(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<DocumentSourceOut> pSource(
+ new DocumentSourceOut(pBsonElement));
+
+ return pSource;
+ }
+
+ void DocumentSourceOut::sourceToBson(BSONObjBuilder *pBuilder) const {
+ assert(false); // CW TODO
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_project.cpp b/src/mongo/db/pipeline/document_source_project.cpp new file mode 100755 index 00000000000..bb7a0b5a6d9 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_project.cpp @@ -0,0 +1,201 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceProject::projectName[] = "$project";
+
+ DocumentSourceProject::~DocumentSourceProject() {
+ }
+
+ DocumentSourceProject::DocumentSourceProject():
+ excludeId(false),
+ pEO(ExpressionObject::create()) {
+ }
+
+ bool DocumentSourceProject::eof() {
+ return pSource->eof();
+ }
+
+ bool DocumentSourceProject::advance() {
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceProject::getCurrent() {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ const size_t sizeHint =
+ pEO->getSizeHint(pInDocument) + (excludeId ? 0 : 1);
+ intrusive_ptr<Document> pResultDocument(Document::create(sizeHint));
+
+ if (!excludeId) {
+ intrusive_ptr<const Value> pId(
+ pInDocument->getField(Document::idName));
+ pResultDocument->addField(Document::idName, pId);
+ }
+
+ /* use the ExpressionObject to create the base result */
+ pEO->addToDocument(pResultDocument, pInDocument);
+
+ return pResultDocument;
+ }
+
+ void DocumentSourceProject::optimize() {
+ intrusive_ptr<Expression> pE(pEO->optimize());
+ pEO = dynamic_pointer_cast<ExpressionObject>(pE);
+ }
+
+ void DocumentSourceProject::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ if (excludeId)
+ insides.append(Document::idName, false);
+ pEO->documentToBson(&insides, 0);
+ pBuilder->append(projectName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceProject> DocumentSourceProject::create() {
+ intrusive_ptr<DocumentSourceProject> pSource(
+ new DocumentSourceProject());
+ return pSource;
+ }
+
+ void DocumentSourceProject::addField(
+ const string &fieldName, const intrusive_ptr<Expression> &pExpression) {
+ uassert(15960,
+ "projection fields must be defined by non-empty expressions",
+ pExpression);
+
+ pEO->addField(fieldName, pExpression);
+ }
+
+ void DocumentSourceProject::includePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ uassert(15961, str::stream() << projectName <<
+ ": _id cannot be included once it has been excluded",
+ !excludeId);
+
+ return;
+ }
+
+ pEO->includePath(fieldPath);
+ }
+
+ void DocumentSourceProject::excludePath(const string &fieldPath) {
+ if (Document::idName.compare(fieldPath) == 0) {
+ excludeId = true;
+ return;
+ }
+
+ pEO->excludePath(fieldPath);
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceProject::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /* validate */
+ uassert(15969, str::stream() << projectName <<
+ " specification must be an object",
+ pBsonElement->type() == Object);
+
+ /* chain the projection onto the original source */
+ intrusive_ptr<DocumentSourceProject> pProject(
+ DocumentSourceProject::create());
+
+ /*
+ Pull out the $project object. This should just be a list of
+ field inclusion or exclusion specifications. Note you can't do
+ both, except for the case of _id.
+ */
+ BSONObj projectObj(pBsonElement->Obj());
+ BSONObjIterator fieldIterator(projectObj);
+ Expression::ObjectCtx objectCtx(
+ Expression::ObjectCtx::DOCUMENT_OK);
+ while(fieldIterator.more()) {
+ BSONElement outFieldElement(fieldIterator.next());
+ string outFieldPath(outFieldElement.fieldName());
+ string inFieldName(outFieldPath);
+ BSONType specType = outFieldElement.type();
+ int fieldInclusion = -1;
+
+ switch(specType) {
+ case NumberDouble: {
+ double inclusion = outFieldElement.numberDouble();
+ fieldInclusion = static_cast<int>(inclusion);
+ goto IncludeExclude;
+ }
+
+ case NumberInt:
+ /* just a plain integer include/exclude specification */
+ fieldInclusion = outFieldElement.numberInt();
+
+IncludeExclude:
+ uassert(15970, str::stream() <<
+ "field inclusion or exclusion specification for \"" <<
+ outFieldPath <<
+ "\" must be true, 1, false, or zero",
+ ((fieldInclusion == 0) || (fieldInclusion == 1)));
+
+ if (fieldInclusion == 0)
+ pProject->excludePath(outFieldPath);
+ else
+ pProject->includePath(outFieldPath);
+ break;
+
+ case Bool:
+ /* just a plain boolean include/exclude specification */
+ fieldInclusion = (outFieldElement.Bool() ? 1 : 0);
+ goto IncludeExclude;
+
+ case String:
+ /* include a field, with rename */
+ fieldInclusion = 1;
+ inFieldName = outFieldElement.String();
+ pProject->addField(
+ outFieldPath,
+ ExpressionFieldPath::create(
+ Expression::removeFieldPrefix(inFieldName)));
+ break;
+
+ case Object: {
+ intrusive_ptr<Expression> pDocument(
+ Expression::parseObject(&outFieldElement, &objectCtx));
+
+ /* add The document expression to the projection */
+ pProject->addField(outFieldPath, pDocument);
+ break;
+ }
+
+ default:
+ uassert(15971, str::stream() <<
+ "invalid BSON type (" << specType <<
+ ") for " << projectName <<
+ " field " << outFieldPath, false);
+ }
+
+ }
+
+ return pProject;
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_skip.cpp b/src/mongo/db/pipeline/document_source_skip.cpp new file mode 100644 index 00000000000..74bf2360ce9 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_skip.cpp @@ -0,0 +1,99 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+ const char DocumentSourceSkip::skipName[] = "$skip";
+
+ DocumentSourceSkip::DocumentSourceSkip(const intrusive_ptr<ExpressionContext> &pTheCtx):
+ skip(0),
+ count(0),
+ pCtx(pTheCtx) {
+ }
+
+ DocumentSourceSkip::~DocumentSourceSkip() {
+ }
+
+ void DocumentSourceSkip::skipper() {
+ if (count == 0) {
+ while (!pSource->eof() && count++ < skip) {
+ pSource->advance();
+ }
+ }
+
+ if (pSource->eof()) {
+ pCurrent.reset();
+ return;
+ }
+
+ pCurrent = pSource->getCurrent();
+ }
+
+ bool DocumentSourceSkip::eof() {
+ skipper();
+ return pSource->eof();
+ }
+
+ bool DocumentSourceSkip::advance() {
+ if (eof()) {
+ pCurrent.reset();
+ return false;
+ }
+
+ pCurrent = pSource->getCurrent();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceSkip::getCurrent() {
+ skipper();
+ return pCurrent;
+ }
+
+ void DocumentSourceSkip::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append("$skip", skip);
+ }
+
+ intrusive_ptr<DocumentSourceSkip> DocumentSourceSkip::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSkip> pSource(
+ new DocumentSourceSkip(pCtx));
+ return pSource;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSkip::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15972, str::stream() << "the value to " <<
+ skipName << " must be a number", pBsonElement->isNumber());
+
+ intrusive_ptr<DocumentSourceSkip> pSkip(
+ DocumentSourceSkip::create(pCtx));
+
+ pSkip->skip = (int)pBsonElement->numberLong();
+ assert(pSkip->skip > 0); // CW TODO error code
+
+ return pSkip;
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_sort.cpp b/src/mongo/db/pipeline/document_source_sort.cpp new file mode 100755 index 00000000000..bf4739af7d1 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_sort.cpp @@ -0,0 +1,216 @@ +/**
+* Copyright (C) 2011 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/doc_mem_monitor.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+
+
+namespace mongo {
+ const char DocumentSourceSort::sortName[] = "$sort";
+
+ DocumentSourceSort::~DocumentSourceSort() {
+ }
+
+ bool DocumentSourceSort::eof() {
+ if (!populated)
+ populate();
+
+ return (listIterator == documents.end());
+ }
+
+ bool DocumentSourceSort::advance() {
+ if (!populated)
+ populate();
+
+ assert(listIterator != documents.end());
+
+ ++listIterator;
+ if (listIterator == documents.end()) {
+ pCurrent.reset();
+ count = 0;
+ return false;
+ }
+ pCurrent = listIterator->pDocument;
+
+ return true;
+ }
+
+ intrusive_ptr<Document> DocumentSourceSort::getCurrent() {
+ if (!populated)
+ populate();
+
+ return pCurrent;
+ }
+
+ void DocumentSourceSort::sourceToBson(BSONObjBuilder *pBuilder) const {
+ BSONObjBuilder insides;
+ sortKeyToBson(&insides, false);
+ pBuilder->append(sortName, insides.done());
+ }
+
+ intrusive_ptr<DocumentSourceSort> DocumentSourceSort::create(
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ intrusive_ptr<DocumentSourceSort> pSource(
+ new DocumentSourceSort(pCtx));
+ return pSource;
+ }
+
+ DocumentSourceSort::DocumentSourceSort(
+ const intrusive_ptr<ExpressionContext> &pTheCtx):
+ populated(false),
+ pCtx(pTheCtx) {
+ }
+
+ void DocumentSourceSort::addKey(const string &fieldPath, bool ascending) {
+ intrusive_ptr<ExpressionFieldPath> pE(
+ ExpressionFieldPath::create(fieldPath));
+ vSortKey.push_back(pE);
+ vAscending.push_back(ascending);
+ }
+
+ void DocumentSourceSort::sortKeyToBson(
+ BSONObjBuilder *pBuilder, bool usePrefix) const {
+ /* add the key fields */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* create the "field name" */
+ stringstream ss;
+ vSortKey[i]->writeFieldPath(ss, usePrefix);
+
+ /* append a named integer based on the sort order */
+ pBuilder->append(ss.str(), (vAscending[i] ? 1 : -1));
+ }
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceSort::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ uassert(15973, str::stream() << "the " <<
+ sortName << " key specification must be an object",
+ pBsonElement->type() == Object);
+
+ intrusive_ptr<DocumentSourceSort> pSort(
+ DocumentSourceSort::create(pCtx));
+
+ /* check for then iterate over the sort object */
+ size_t sortKeys = 0;
+ for(BSONObjIterator keyIterator(pBsonElement->Obj().begin());
+ keyIterator.more();) {
+ BSONElement keyField(keyIterator.next());
+ const char *pKeyFieldName = keyField.fieldName();
+ int sortOrder = 0;
+
+ uassert(15974, str::stream() << sortName <<
+ " key ordering must be specified using a number",
+ keyField.isNumber());
+ sortOrder = (int)keyField.numberInt();
+
+ uassert(15975, str::stream() << sortName <<
+ " key ordering must be 1 (for ascending) or -1 (for descending",
+ ((sortOrder == 1) || (sortOrder == -1)));
+
+ pSort->addKey(pKeyFieldName, (sortOrder > 0));
+ ++sortKeys;
+ }
+
+ uassert(15976, str::stream() << sortName <<
+ " must have at least one sort key", (sortKeys > 0));
+
+ return pSort;
+ }
+
+ void DocumentSourceSort::populate() {
+ /* make sure we've got a sort key */
+ assert(vSortKey.size());
+
+ /* track and warn about how much physical memory has been used */
+ DocMemMonitor dmm(this);
+
+ /* pull everything from the underlying source */
+ for(bool hasNext = !pSource->eof(); hasNext;
+ hasNext = pSource->advance()) {
+ intrusive_ptr<Document> pDocument(pSource->getCurrent());
+ documents.push_back(Carrier(this, pDocument));
+
+ dmm.addToTotal(pDocument->getApproximateSize());
+ }
+
+ /* sort the list */
+ documents.sort(Carrier::lessThan);
+
+ /* start the sort iterator */
+ listIterator = documents.begin();
+
+ if (listIterator != documents.end())
+ pCurrent = listIterator->pDocument;
+ populated = true;
+ }
+
+ int DocumentSourceSort::compare(
+ const intrusive_ptr<Document> &pL, const intrusive_ptr<Document> &pR) {
+
+ /*
+ populate() already checked that there is a non-empty sort key,
+ so we shouldn't have to worry about that here.
+
+ However, the tricky part is what to do if none of the sort keys are
+ present. In that case, consider the document the lesser of the two.
+ */
+ const size_t n = vSortKey.size();
+ for(size_t i = 0; i < n; ++i) {
+ /* evaluate the sort keys */
+ ExpressionFieldPath *pE = vSortKey[i].get();
+ intrusive_ptr<const Value> pLeft(pE->evaluate(pL));
+ intrusive_ptr<const Value> pRight(pE->evaluate(pR));
+
+ /*
+ Compare the two values; if they differ, return. If they are
+ the same, move on to the next key.
+ */
+ int cmp = Value::compare(pLeft, pRight);
+ if (cmp) {
+ /* if necessary, adjust the return value by the key ordering */
+ if (!vAscending[i])
+ cmp = -cmp;
+
+ return cmp;
+ }
+ }
+
+ /*
+ If we got here, everything matched (or didn't exist), so we'll
+ consider the documents equal for purposes of this sort.
+ */
+ return 0;
+ }
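+
+ /*
+ Illustrative example (editor's addition): with the sort key
+ {age: -1, name: 1}, comparing {age: 30, name: "b"} against
+ {age: 30, name: "a"} finds the "age" values equal, moves on to
+ "name", and returns a positive result, so {age: 30, name: "a"}
+ sorts first.
+ */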
+
+ bool DocumentSourceSort::Carrier::lessThan(
+ const Carrier &rL, const Carrier &rR) {
+ /* make sure these aren't from different lists */
+ assert(rL.pSort == rR.pSort);
+
+ /* compare the documents according to the sort key */
+ return (rL.pSort->compare(rL.pDocument, rR.pDocument) < 0);
+ }
+}
diff --git a/src/mongo/db/pipeline/document_source_unwind.cpp b/src/mongo/db/pipeline/document_source_unwind.cpp new file mode 100755 index 00000000000..bb231451113 --- /dev/null +++ b/src/mongo/db/pipeline/document_source_unwind.cpp @@ -0,0 +1,234 @@ +/**
+ * Copyright 2011 (c) 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/document_source.h"
+
+#include "db/jsobj.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression.h"
+#include "db/pipeline/value.h"
+
+namespace mongo {
+
+ const char DocumentSourceUnwind::unwindName[] = "$unwind";
+
+ DocumentSourceUnwind::~DocumentSourceUnwind() {
+ }
+
+ DocumentSourceUnwind::DocumentSourceUnwind():
+ unwindPath(),
+ pNoUnwindDocument(),
+ pUnwindArray(),
+ pUnwinder(),
+ pUnwindValue() {
+ }
+
+ bool DocumentSourceUnwind::eof() {
+ /*
+ If we're unwinding an array, and there are more elements, then we
+ can return more documents.
+ */
+ if (pUnwinder.get() && pUnwinder->more())
+ return false;
+
+ return pSource->eof();
+ }
+
+ bool DocumentSourceUnwind::advance() {
+ if (pUnwinder.get() && pUnwinder->more()) {
+ pUnwindValue = pUnwinder->next();
+ return true;
+ }
+
+ /* release the last document and advance */
+ resetArray();
+ return pSource->advance();
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::getCurrent() {
+ if (!pNoUnwindDocument.get()) {
+ intrusive_ptr<Document> pInDocument(pSource->getCurrent());
+
+ /* create the result document */
+ pNoUnwindDocument = pInDocument;
+ fieldIndex.clear();
+
+ /*
+ First we'll look to see if the path is there. If it isn't,
+ we'll pass this document through. If it is, we record the
+ indexes of the fields down the field path so that we can
+ quickly replace them as we clone the documents along the
+ field path.
+
+ We have to clone all the documents along the field path so
+ that we don't share the end value across documents that have
+ come out of this pipeline operator.
+ */
+ intrusive_ptr<Document> pCurrent(pInDocument);
+ const size_t pathLength = unwindPath.getPathLength();
+ for(size_t i = 0; i < pathLength; ++i) {
+ size_t idx = pCurrent->getFieldIndex(
+ unwindPath.getFieldName(i));
+ if (idx == pCurrent->getFieldCount()) {
+ /* this document doesn't contain the target field */
+ resetArray();
+ return pInDocument;
+ }
+
+ fieldIndex.push_back(idx);
+ Document::FieldPair fp(pCurrent->getField(idx));
+ intrusive_ptr<const Value> pPathValue(fp.second);
+ if (i < pathLength - 1) {
+ if (pPathValue->getType() != Object) {
+ /* can't walk down the field path */
+ resetArray();
+ uassert(15977, str::stream() << unwindName <<
+ ": cannot traverse field path past scalar value for \"" <<
+ fp.first << "\"", false);
+ break;
+ }
+
+ /* move down the object tree */
+ pCurrent = pPathValue->getDocument();
+ }
+ else /* (i == pathLength - 1) */ {
+ if (pPathValue->getType() != Array) {
+ /* last item on path must be an array to unwind */
+ resetArray();
+ uassert(15978, str::stream() << unwindName <<
+ ": value at end of field path must be an array",
+ false);
+ break;
+ }
+
+ /* keep track of the array we're unwinding */
+ pUnwindArray = pPathValue;
+ if (pUnwindArray->getArrayLength() == 0) {
+ /*
+ The $unwind of an empty array is a NULL value. If we
+ encounter this, use the non-unwind path, but replace
+ pOutField with a null.
+
+ Make sure unwind value is clear so the array is
+ removed.
+ */
+ pUnwindValue.reset();
+ intrusive_ptr<Document> pClone(clonePath());
+ resetArray();
+ return pClone;
+ }
+
+ /* get the iterator we'll use to unwind the array */
+ pUnwinder = pUnwindArray->getArray();
+ assert(pUnwinder->more()); // we just checked above...
+ pUnwindValue = pUnwinder->next();
+ }
+ }
+ }
+
+ /*
+ If we're unwinding a field, create an alternate document. In the
+ alternate (clone), replace the unwound array field with the element
+ at the appropriate index.
+ */
+ if (pUnwindArray.get()) {
+ /* clone the document with an array we're unwinding */
+ intrusive_ptr<Document> pUnwindDocument(clonePath());
+
+ return pUnwindDocument;
+ }
+
+ return pNoUnwindDocument;
+ }
+
+ intrusive_ptr<Document> DocumentSourceUnwind::clonePath() const {
+ /*
+ For this to be valid, we must already have pNoUnwindDocument set,
+ and have set up the vector of indices for that document in fieldIndex.
+ */
+ assert(pNoUnwindDocument.get());
+ assert(pUnwinder.get());
+
+ intrusive_ptr<Document> pClone(pNoUnwindDocument->clone());
+ intrusive_ptr<Document> pCurrent(pClone);
+ const size_t n = fieldIndex.size();
+ assert(n);
+ for(size_t i = 0; i < n; ++i) {
+ const size_t fi = fieldIndex[i];
+ Document::FieldPair fp(pCurrent->getField(fi));
+ if (i + 1 < n) {
+ /*
+ For every object in the path but the last, clone it and
+ continue on down.
+ */
+ intrusive_ptr<Document> pNext(
+ fp.second->getDocument()->clone());
+ pCurrent->setField(fi, fp.first, Value::createDocument(pNext));
+ pCurrent = pNext;
+ }
+ else {
+ /* for the last, substitute the next unwound value */
+ pCurrent->setField(fi, fp.first, pUnwindValue);
+ }
+ }
+
+ return pClone;
+ }
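+
+ /*
+ Illustrative example (editor's addition): unwinding "a.b" in
+ {a: {b: [1, 2]}} clones the outer document and the "a" subdocument
+ before substituting the current element, yielding {a: {b: 1}} and
+ then {a: {b: 2}} with no state shared between the two results.
+ */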
+
+ void DocumentSourceUnwind::sourceToBson(BSONObjBuilder *pBuilder) const {
+ pBuilder->append(unwindName, unwindPath.getPath(true));
+ }
+
+ intrusive_ptr<DocumentSourceUnwind> DocumentSourceUnwind::create() {
+ intrusive_ptr<DocumentSourceUnwind> pSource(
+ new DocumentSourceUnwind());
+ return pSource;
+ }
+
+ void DocumentSourceUnwind::unwindField(const FieldPath &rFieldPath) {
+ /* can't set more than one unwind field */
+ uassert(15979, str::stream() << unwindName <<
+ ": can't unwind more than one path at once",
+ !unwindPath.getPathLength());
+
+ uassert(15980, "the path of the field to unwind cannot be empty",
+ rFieldPath.getPathLength());
+
+ /* record the field path */
+ unwindPath = rFieldPath;
+ }
+
+ intrusive_ptr<DocumentSource> DocumentSourceUnwind::createFromBson(
+ BSONElement *pBsonElement,
+ const intrusive_ptr<ExpressionContext> &pCtx) {
+ /*
+ The value of $unwind should just be a field path.
+ */
+ uassert(15981, str::stream() << "the " << unwindName <<
+ " field path must be specified as a string",
+ pBsonElement->type() == String);
+
+ string prefixedPathString(pBsonElement->String());
+ string pathString(Expression::removeFieldPrefix(prefixedPathString));
+ intrusive_ptr<DocumentSourceUnwind> pUnwind(
+ DocumentSourceUnwind::create());
+ pUnwind->unwindPath = FieldPath(pathString);
+
+ return pUnwind;
+ }
+}
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp new file mode 100755 index 00000000000..b3caefcf899 --- /dev/null +++ b/src/mongo/db/pipeline/expression.cpp @@ -0,0 +1,2815 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/expression.h"
+
+#include <cstdio>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "db/pipeline/expression_context.h"
+#include "db/pipeline/value.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ /* --------------------------- Expression ------------------------------ */
+
+ void Expression::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(false && "Expression::toMatcherBson()");
+ }
+
+ Expression::ObjectCtx::ObjectCtx(int theOptions):
+ options(theOptions),
+ unwindField() {
+ }
+
+ void Expression::ObjectCtx::unwind(string fieldName) {
+ assert(unwindOk());
+ assert(!unwindUsed());
+ assert(fieldName.size());
+ unwindField = fieldName;
+ }
+
+ bool Expression::ObjectCtx::documentOk() const {
+ return ((options & DOCUMENT_OK) != 0);
+ }
+
+ const char Expression::unwindName[] = "$unwind";
+
+ string Expression::removeFieldPrefix(const string &prefixedField) {
+ const char *pPrefixedField = prefixedField.c_str();
+ uassert(15982, str::stream() <<
+ "field path references must be prefixed with a '$' (\"" <<
+ prefixedField << "\")", pPrefixedField[0] == '$');
+
+ return string(pPrefixedField + 1);
+ }
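+
+ /*
+ Illustrative example (editor's addition):
+ removeFieldPrefix("$a.b.c") yields "a.b.c", while a bare "a.b.c"
+ fails the uassert above because it lacks the '$' prefix.
+ */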
+
+ intrusive_ptr<Expression> Expression::parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx) {
+ /*
+ An object expression can take any of the following forms:
+
+ f0: {f1: ..., f2: ..., f3: ...}
+ f0: {$operator:[operand1, operand2, ...]}
+ f0: {$unwind:"fieldpath"}
+
+ We handle $unwind as a special case, because this is done by the
+ projection source. For any other expression, we hand over control to
+ code that parses the expression and returns an expression.
+ */
+
+ intrusive_ptr<Expression> pExpression; // the result
+ intrusive_ptr<ExpressionObject> pExpressionObject; // alt result
+ int isOp = -1; /* -1 -> unknown, 0 -> not an operator, 1 -> operator */
+ enum { UNKNOWN, NOTOPERATOR, OPERATOR } kind = UNKNOWN;
+
+ BSONObj obj(pBsonElement->Obj());
+ BSONObjIterator iter(obj);
+ for(size_t fieldCount = 0; iter.more(); ++fieldCount) {
+ BSONElement fieldElement(iter.next());
+ const char *pFieldName = fieldElement.fieldName();
+
+ if (pFieldName[0] == '$') {
+ uassert(15983, str::stream() <<
+ "the operator must be the only field in a pipeline object (at \""
+ << pFieldName << "\")",
+ fieldCount == 0);
+
+ /* we've determined this "object" is an operator expression */
+ isOp = 1;
+ kind = OPERATOR;
+
+ if (strcmp(pFieldName, unwindName) != 0) {
+ pExpression = parseExpression(pFieldName, &fieldElement);
+ }
+ else {
+ assert(pCtx->unwindOk());
+ // CW TODO error: it's not OK to unwind in this context
+
+ assert(!pCtx->unwindUsed());
+ // CW TODO error: this projection already has an unwind
+
+ assert(fieldElement.type() == String);
+ // CW TODO $unwind operand must be single field name
+
+ string fieldPath(removeFieldPrefix(fieldElement.String()));
+ pExpression = ExpressionFieldPath::create(fieldPath);
+ pCtx->unwind(fieldPath);
+ }
+ }
+ else {
+ uassert(15984, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ isOp != 1);
+ uassert(15990, str::stream() << "this object is already an operator expression, and can't be used as a document expression (at \"" <<
+ pFieldName << "\")",
+ kind != OPERATOR);
+
+ /* if it's our first time, create the document expression */
+ if (!pExpression.get()) {
+ assert(pCtx->documentOk());
+ // CW TODO error: document not allowed in this context
+
+ pExpressionObject = ExpressionObject::create();
+ pExpression = pExpressionObject;
+
+ /* this "object" is not an operator expression */
+ isOp = 0;
+ kind = NOTOPERATOR;
+ }
+
+ BSONType fieldType = fieldElement.type();
+ string fieldName(pFieldName);
+ if (fieldType == Object) {
+ /* it's a nested document */
+ ObjectCtx oCtx(
+ (pCtx->documentOk() ? ObjectCtx::DOCUMENT_OK : 0));
+ intrusive_ptr<Expression> pNested(
+ parseObject(&fieldElement, &oCtx));
+ pExpressionObject->addField(fieldName, pNested);
+ }
+ else if (fieldType == String) {
+ /* it's a renamed field */
+ // CW TODO could also be a constant
+ intrusive_ptr<Expression> pPath(
+ ExpressionFieldPath::create(
+ removeFieldPrefix(fieldElement.String())));
+ pExpressionObject->addField(fieldName, pPath);
+ }
+ else if (fieldType == NumberDouble) {
+ /* it's an inclusion specification */
+ int inclusion = static_cast<int>(fieldElement.Double());
+ if (inclusion == 0)
+ pExpressionObject->excludePath(fieldName);
+ else if (inclusion == 1)
+ pExpressionObject->includePath(fieldName);
+ else
+ uassert(15991, str::stream() <<
+ "\"" << fieldName <<
+ "\" numeric inclusion or exclusion must be 1 or 0 (or boolean)",
+ false);
+ }
+ else if (fieldType == Bool) {
+ bool inclusion = fieldElement.Bool();
+ if (!inclusion)
+ pExpressionObject->excludePath(fieldName);
+ else
+ pExpressionObject->includePath(fieldName);
+ }
+ else { /* nothing else is allowed */
+ uassert(15992, str::stream() <<
+ "disallowed field type " << fieldType <<
+ " in object expression (at \"" <<
+ fieldName << "\")", false);
+ }
+ }
+ }
+
+ return pExpression;
+ }
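+
+ /*
+ Illustrative examples (editor's addition) of the three forms the
+ parser above accepts:
+
+ {a: 1, b: "$x", c: {d: "$y"}} document expression
+ {$add: ["$a", 1]} operator expression
+ {$unwind: "$tags"} special-cased projection unwind
+ */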
+
+
+ struct OpDesc {
+ const char *pName;
+ intrusive_ptr<ExpressionNary> (*pFactory)(void);
+ };
+
+ static int OpDescCmp(const void *pL, const void *pR) {
+ return strcmp(((const OpDesc *)pL)->pName, ((const OpDesc *)pR)->pName);
+ }
+
+ /*
+ Keep these sorted alphabetically so we can bsearch() them using
+ OpDescCmp() above.
+ */
+ static const OpDesc OpTable[] = {
+ {"$add", ExpressionAdd::create},
+ {"$and", ExpressionAnd::create},
+ {"$cmp", ExpressionCompare::createCmp},
+ {"$cond", ExpressionCond::create},
+ {"$const", ExpressionNoOp::create},
+ {"$dayOfMonth", ExpressionDayOfMonth::create},
+ {"$dayOfWeek", ExpressionDayOfWeek::create},
+ {"$dayOfYear", ExpressionDayOfYear::create},
+ {"$divide", ExpressionDivide::create},
+ {"$eq", ExpressionCompare::createEq},
+ {"$gt", ExpressionCompare::createGt},
+ {"$gte", ExpressionCompare::createGte},
+ {"$hour", ExpressionHour::create},
+ {"$ifNull", ExpressionIfNull::create},
+ {"$lt", ExpressionCompare::createLt},
+ {"$lte", ExpressionCompare::createLte},
+ {"$minute", ExpressionMinute::create},
+ {"$mod", ExpressionMod::create},
+ {"$month", ExpressionMonth::create},
+ {"$multiply", ExpressionMultiply::create},
+ {"$ne", ExpressionCompare::createNe},
+ {"$not", ExpressionNot::create},
+ {"$or", ExpressionOr::create},
+ {"$second", ExpressionSecond::create},
+ {"$strcasecmp", ExpressionStrcasecmp::create},
+ {"$substr", ExpressionSubstr::create},
+ {"$subtract", ExpressionSubtract::create},
+ {"$toLower", ExpressionToLower::create},
+ {"$toUpper", ExpressionToUpper::create},
+ {"$week", ExpressionWeek::create},
+ {"$year", ExpressionYear::create},
+ };
+
+ static const size_t NOp = sizeof(OpTable)/sizeof(OpTable[0]);
+
+ intrusive_ptr<Expression> Expression::parseExpression(
+ const char *pOpName, BSONElement *pBsonElement) {
+ /* look for the specified operator */
+ OpDesc key;
+ key.pName = pOpName;
+ const OpDesc *pOp = (const OpDesc *)bsearch(
+ &key, OpTable, NOp, sizeof(OpDesc), OpDescCmp);
+
+ uassert(15999, str::stream() << "invalid operator \"" <<
+ pOpName << "\"", pOp);
+
+ /* make the expression node */
+ intrusive_ptr<ExpressionNary> pExpression((*pOp->pFactory)());
+
+ /* add the operands to the expression node */
+ BSONType elementType = pBsonElement->type();
+ if (elementType == Object) {
+ /* the operator must be unary and accept an object argument */
+ BSONObj objOperand(pBsonElement->Obj());
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseObject(pBsonElement, &oCtx));
+ pExpression->addOperand(pOperand);
+ }
+ else if (elementType == Array) {
+ /* multiple operands - an n-ary operator */
+ vector<BSONElement> bsonArray(pBsonElement->Array());
+ const size_t n = bsonArray.size();
+ for(size_t i = 0; i < n; ++i) {
+ BSONElement *pBsonOperand = &bsonArray[i];
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonOperand));
+ pExpression->addOperand(pOperand);
+ }
+ }
+ else { /* assume it's an atomic operand */
+ intrusive_ptr<Expression> pOperand(
+ Expression::parseOperand(pBsonElement));
+ pExpression->addOperand(pOperand);
+ }
+
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> Expression::parseOperand(BSONElement *pBsonElement) {
+ BSONType type = pBsonElement->type();
+
+ switch(type) {
+ case String: {
+ /*
+ This could be a field path, or it could be a constant
+ string.
+
+ We make a copy of the BSONElement reader so we can read its
+ value without advancing its state, in case we need to read it
+ again in the constant code path.
+ */
+ BSONElement opCopy(*pBsonElement);
+ string value(opCopy.String());
+
+ /* check for a field path */
+ if (value[0] != '$')
+ goto ExpectConstant; // assume plain string constant
+
+ /* if we got here, this is a field path expression */
+ string fieldPath(removeFieldPrefix(value));
+ intrusive_ptr<Expression> pFieldExpr(
+ ExpressionFieldPath::create(fieldPath));
+ return pFieldExpr;
+ }
+
+ case Object: {
+ ObjectCtx oCtx(ObjectCtx::DOCUMENT_OK);
+ intrusive_ptr<Expression> pSubExpression(
+ Expression::parseObject(pBsonElement, &oCtx));
+ return pSubExpression;
+ }
+
+ default:
+ ExpectConstant: {
+ intrusive_ptr<Expression> pOperand(
+ ExpressionConstant::createFromBsonElement(pBsonElement));
+ return pOperand;
+ }
+
+ } // switch(type)
+
+ /* NOTREACHED */
+ assert(false);
+ return intrusive_ptr<Expression>();
+ }
+
+ /* ------------------------- ExpressionAdd ----------------------------- */
+
+ ExpressionAdd::~ExpressionAdd() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAdd::optimize() {
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+ ExpressionAdd *pA = dynamic_cast<ExpressionAdd *>(pE.get());
+ if (pA) {
+ /* don't create a circular reference */
+ if (pA != this)
+ pA->pAdd = this;
+ }
+
+ return pE;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAdd::create() {
+ intrusive_ptr<ExpressionAdd> pExpression(new ExpressionAdd());
+ return pExpression;
+ }
+
+ ExpressionAdd::ExpressionAdd():
+ ExpressionNary(),
+ useOriginal(false) {
+ }
+
+ intrusive_ptr<const Value> ExpressionAdd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ unsigned stringCount = 0;
+ unsigned nonConstStringCount = 0;
+ unsigned dateCount = 0;
+ const size_t n = vpOperand.size();
+ vector<intrusive_ptr<const Value> > vpValue; /* evaluated operands */
+
+ /* use the original, if we've been told to do so */
+ if (useOriginal) {
+ return pAdd->evaluate(pDocument);
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(
+ vpOperand[i]->evaluate(pDocument));
+ vpValue.push_back(pValue);
+
+ BSONType valueType = pValue->getType();
+ if (valueType == String) {
+ ++stringCount;
+ if (!dynamic_cast<ExpressionConstant *>(vpOperand[i].get()))
+ ++nonConstStringCount;
+ }
+ else if (valueType == Date)
+ ++dateCount;
+ }
+
+ /*
+ We don't allow adding two dates together, because it doesn't make
+ sense, especially since they are in epoch time. However, if a
+ string is present, the dates end up appended to the string, so
+ having several dates is not a problem in that case.
+ */
+ if ((dateCount > 1) && !stringCount) {
+ uassert(16000, "can't add two dates together", false);
+ return Value::getNull();
+ }
+
+ /*
+ If there are non-constant strings, and we've got a copy of the
+ original, then use that from this point forward. This is necessary
+ to keep the order of strings the same for string concatenation;
+ constant-folding would violate the order preservation.
+
+ This is a one-way conversion we do if we see one of these. It is
+ possible that these could vary from document to document, but any
+ sane schema probably isn't going to do that, so once we see a string,
+ we can probably assume they're going to be strings all the way down.
+ */
+ if (nonConstStringCount && pAdd.get()) {
+ useOriginal = true;
+ return pAdd->evaluate(pDocument);
+ }
+
+ if (stringCount) {
+ stringstream stringTotal;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ stringTotal << pValue->coerceToString();
+ }
+
+ return Value::createString(stringTotal.str());
+ }
+
+ if (dateCount) {
+ long long dateTotal = 0;
+ for (size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+ if (pValue->getType() == Date)
+ dateTotal += pValue->coerceToDate();
+ else
+ dateTotal += static_cast<long long>(pValue->coerceToDouble()*24*60*60*1000);
+ }
+
+ return Value::createDate(Date_t(dateTotal));
+ }
+
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the current narrowest
+ type.
+ */
+ double doubleTotal = 0;
+ long long longTotal = 0;
+ BSONType totalType = NumberInt;
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpValue[i]);
+
+ totalType = Value::getWidestNumeric(totalType, pValue->getType());
+ doubleTotal += pValue->coerceToDouble();
+ longTotal += pValue->coerceToLong();
+ }
+
+ if (totalType == NumberDouble)
+ return Value::createDouble(doubleTotal);
+ if (totalType == NumberLong)
+ return Value::createLong(longTotal);
+ return Value::createInt((int)longTotal);
+ }
+
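+ /*
+ Illustrative examples of the rules above (not from the original
+ source): {$add: [1, 2.5]} yields the double 3.5, the widest type
+ seen; {$add: ["a", 1]} concatenates the coerced strings; and
+ {$add: [<date>, 2]} treats the 2 as a count of days (see the
+ milliseconds conversion above) and yields a shifted date.
+ */
+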
+ const char *ExpressionAdd::getOpName() const {
+ return "$add";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAdd::getFactory() const)() {
+ return ExpressionAdd::create;
+ }
+
+ void ExpressionAdd::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+
+ if (pAdd)
+ pAdd->toBson(pBuilder, pOpName, depth);
+ else
+ ExpressionNary::toBson(pBuilder, pOpName, depth);
+ }
+
+
+ /* ------------------------- ExpressionAnd ----------------------------- */
+
+ ExpressionAnd::~ExpressionAnd() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionAnd::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionAnd());
+ return pExpression;
+ }
+
+ ExpressionAnd::ExpressionAnd():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<Expression> ExpressionAnd::optimize() {
+ /* optimize the conjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a conjunction, we can't do anything */
+ ExpressionAnd *pAnd = dynamic_cast<ExpressionAnd *>(pE.get());
+ if (!pAnd)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+ promised by ExpressionNary::optimize(),) then there's nothing
+ we can do.
+ */
+ const size_t n = pAnd->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pAnd->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's false,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (!last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getFalse()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was true, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ conjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pAnd->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "true" value, and return the new expression.
+
+ CW TODO:
+ Note that because of any implicit conversions, we may need to
+ apply an implicit boolean conversion.
+ */
+ pAnd->vpOperand.resize(n - 1);
+ return pE;
+ }
+
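+ /*
+ A sketch of the effect (illustrative, not from the original source):
+ {$and: ["$a", false]} collapses to the constant false;
+ {$and: ["$a", true]} becomes a coerce-to-bool of "$a"; and
+ {$and: ["$a", "$b", true]} simply drops the trailing true.
+ */
+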
+ intrusive_ptr<const Value> ExpressionAnd::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (!pValue->coerceToBool())
+ return Value::getFalse();
+ }
+
+ return Value::getTrue();
+ }
+
+ const char *ExpressionAnd::getOpName() const {
+ return "$and";
+ }
+
+ void ExpressionAnd::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ /*
+ There are two patterns we can handle:
+ (1) one or two comparisons on the same field: { a:{$gte:3, $lt:7} }
+ (2) multiple field comparisons: {a:7, b:{$lte:6}, c:2}
+ This can be recognized as a conjunction of a set of range
+ expressions. Direct equality is a degenerate range expression;
+ range expressions can be open-ended.
+ */
+ assert(false && "unimplemented");
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionAnd::getFactory() const)() {
+ return ExpressionAnd::create;
+ }
+
+ /* -------------------- ExpressionCoerceToBool ------------------------- */
+
+ ExpressionCoerceToBool::~ExpressionCoerceToBool() {
+ }
+
+ intrusive_ptr<ExpressionCoerceToBool> ExpressionCoerceToBool::create(
+ const intrusive_ptr<Expression> &pExpression) {
+ intrusive_ptr<ExpressionCoerceToBool> pNew(
+ new ExpressionCoerceToBool(pExpression));
+ return pNew;
+ }
+
+ ExpressionCoerceToBool::ExpressionCoerceToBool(
+ const intrusive_ptr<Expression> &pTheExpression):
+ Expression(),
+ pExpression(pTheExpression) {
+ }
+
+ intrusive_ptr<Expression> ExpressionCoerceToBool::optimize() {
+ /* optimize the operand */
+ pExpression = pExpression->optimize();
+
+ /* if the operand already produces a boolean, then we don't need this */
+ /* LATER - Expression to support a "typeof" query? */
+ Expression *pE = pExpression.get();
+ if (dynamic_cast<ExpressionAnd *>(pE) ||
+ dynamic_cast<ExpressionOr *>(pE) ||
+ dynamic_cast<ExpressionNot *>(pE) ||
+ dynamic_cast<ExpressionCoerceToBool *>(pE))
+ return pExpression;
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionCoerceToBool::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+
+ intrusive_ptr<const Value> pResult(pExpression->evaluate(pDocument));
+ bool b = pResult->coerceToBool();
+ if (b)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ void ExpressionCoerceToBool::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ void ExpressionCoerceToBool::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ assert(false && "not possible"); // no equivalent of this
+ }
+
+ /* ----------------------- ExpressionCompare --------------------------- */
+
+ ExpressionCompare::~ExpressionCompare() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createEq() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(EQ));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createNe() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(NE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createGte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(GTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLt() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LT));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createLte() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(LTE));
+ return pExpression;
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCompare::createCmp() {
+ intrusive_ptr<ExpressionCompare> pExpression(
+ new ExpressionCompare(CMP));
+ return pExpression;
+ }
+
+ ExpressionCompare::ExpressionCompare(CmpOp theCmpOp):
+ ExpressionNary(),
+ cmpOp(theCmpOp) {
+ }
+
+ void ExpressionCompare::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ /*
+ Lookup table for truth value returns
+ */
+ struct CmpLookup {
+ bool truthValue[3]; /* truth value for -1, 0, 1 */
+ Expression::CmpOp reverse; /* operator when operands are swapped */
+ char name[5]; /* string name (w/trailing '\0') */
+ };
+ static const CmpLookup cmpLookup[7] = {
+ /* -1 0 1 reverse name */
+ /* EQ */ { { false, true, false }, Expression::EQ, "$eq" },
+ /* NE */ { { true, false, true }, Expression::NE, "$ne" },
+ /* GT */ { { false, false, true }, Expression::LT, "$gt" },
+ /* GTE */ { { false, true, true }, Expression::LTE, "$gte" },
+ /* LT */ { { true, false, false }, Expression::GT, "$lt" },
+ /* LTE */ { { true, true, false }, Expression::GTE, "$lte" },
+ /* CMP */ { { false, false, false }, Expression::CMP, "$cmp" },
+ };
+
+ intrusive_ptr<Expression> ExpressionCompare::optimize() {
+ /* first optimize the comparison operands */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /*
+ If the result of optimization is no longer a comparison, there's
+ nothing more we can do.
+ */
+ ExpressionCompare *pCmp = dynamic_cast<ExpressionCompare *>(pE.get());
+ if (!pCmp)
+ return pE;
+
+ /* check to see if optimizing comparison operator is supported */
+ CmpOp newOp = pCmp->cmpOp;
+ if (newOp == CMP)
+ return pE; // not reversible: there's nothing more we can do
+
+ /*
+ There's one localized optimization we recognize: a comparison
+ between a field and a constant. If we recognize that pattern,
+ replace it with an ExpressionFieldRange.
+
+ When looking for this pattern, note that the operands could appear
+ in any order. If we need to reverse the sense of the comparison to
+ put it into the required canonical form, do so.
+ */
+ intrusive_ptr<Expression> pLeft(pCmp->vpOperand[0]);
+ intrusive_ptr<Expression> pRight(pCmp->vpOperand[1]);
+ intrusive_ptr<ExpressionFieldPath> pFieldPath(
+ dynamic_pointer_cast<ExpressionFieldPath>(pLeft));
+ intrusive_ptr<ExpressionConstant> pConstant;
+ if (pFieldPath.get()) {
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pRight);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+ }
+ else {
+ /* if the first operand wasn't a path, see if it's a constant */
+ pConstant = dynamic_pointer_cast<ExpressionConstant>(pLeft);
+ if (!pConstant.get())
+ return pE; // there's nothing more we can do
+
+ /* the left operand was a constant; see if the right is a path */
+ pFieldPath = dynamic_pointer_cast<ExpressionFieldPath>(pRight);
+ if (!pFieldPath.get())
+ return pE; // there's nothing more we can do
+
+ /* these were not in canonical order, so reverse the sense */
+ newOp = cmpLookup[newOp].reverse;
+ }
+
+ return ExpressionFieldRange::create(
+ pFieldPath, newOp, pConstant->getValue());
+ }
+
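+ /*
+ For example (illustrative): {$eq: ["$a", 3]} optimizes to an
+ ExpressionFieldRange on "a" with the degenerate range [3, 3], while
+ {$lt: [3, "$a"]} is reversed into the range for {$gt: ["$a", 3]}.
+ */
+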
+ intrusive_ptr<const Value> ExpressionCompare::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ BSONType leftType = pLeft->getType();
+ BSONType rightType = pRight->getType();
+ uassert(15994, str::stream() << getOpName() <<
+ ": no automatic conversion for types " <<
+ leftType << " and " << rightType,
+ leftType == rightType);
+ // CW TODO at least for now. later, handle automatic conversions
+
+ int cmp = 0;
+ switch(leftType) {
+ case NumberDouble: {
+ double left = pLeft->getDouble();
+ double right = pRight->getDouble();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case NumberInt: {
+ int left = pLeft->getInt();
+ int right = pRight->getInt();
+
+ if (left < right)
+ cmp = -1;
+ else if (left > right)
+ cmp = 1;
+ break;
+ }
+
+ case String: {
+ string left(pLeft->getString());
+ string right(pRight->getString());
+ cmp = signum(left.compare(right));
+ break;
+ }
+
+ default:
+ uassert(15995, str::stream() <<
+ "can't compare values of type " << leftType, false);
+ break;
+ }
+
+ if (cmpOp == CMP) {
+ switch(cmp) {
+ case -1:
+ return Value::getMinusOne();
+ case 0:
+ return Value::getZero();
+ case 1:
+ return Value::getOne();
+
+ default:
+ assert(false); // CW TODO internal error
+ return Value::getNull();
+ }
+ }
+
+ bool returnValue = cmpLookup[cmpOp].truthValue[cmp + 1];
+ if (returnValue)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ const char *ExpressionCompare::getOpName() const {
+ return cmpLookup[cmpOp].name;
+ }
+
+ /* ----------------------- ExpressionCond ------------------------------ */
+
+ ExpressionCond::~ExpressionCond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionCond::create() {
+ intrusive_ptr<ExpressionCond> pExpression(new ExpressionCond());
+ return pExpression;
+ }
+
+ ExpressionCond::ExpressionCond():
+ ExpressionNary() {
+ }
+
+ void ExpressionCond::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionCond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pCond(vpOperand[0]->evaluate(pDocument));
+ int idx = pCond->coerceToBool() ? 1 : 2;
+ return vpOperand[idx]->evaluate(pDocument);
+ }
+
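+ /*
+ For example (illustrative): {$cond: ["$qty", "some", "none"]}
+ coerces the first operand to a boolean and returns "some" if true,
+ "none" otherwise; note only the selected branch is evaluated.
+ */
+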
+ const char *ExpressionCond::getOpName() const {
+ return "$cond";
+ }
+
+ /* ---------------------- ExpressionConstant --------------------------- */
+
+ ExpressionConstant::~ExpressionConstant() {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<ExpressionConstant> pEC(
+ new ExpressionConstant(pBsonElement));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(BSONElement *pBsonElement):
+ pValue(Value::createFromBsonElement(pBsonElement)) {
+ }
+
+ intrusive_ptr<ExpressionConstant> ExpressionConstant::create(
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionConstant> pEC(new ExpressionConstant(pValue));
+ return pEC;
+ }
+
+ ExpressionConstant::ExpressionConstant(
+ const intrusive_ptr<const Value> &pTheValue):
+ pValue(pTheValue) {
+ }
+
+
+ intrusive_ptr<Expression> ExpressionConstant::optimize() {
+ /* nothing to do */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionConstant::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return pValue;
+ }
+
+ void ExpressionConstant::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ /*
+ For depth greater than one, do the regular thing.
+
+ This will be the usual case: any top level expression will actually
+ be an operator node, so by the time we get to an expression
+ constant, we're past level 1 (counting up as we go down the
+ expression tree).
+
+ See the comment below for more on why this happens.
+ */
+ if (depth > 1) {
+ pValue->addToBsonObj(pBuilder, fieldName);
+ return;
+ }
+
+ /*
+ If this happens at the top level, we don't have any direct way
+ to express it. However, we may need to if constant folding
+ reduced expressions to constants, and we need to re-materialize
+ the pipeline in order to ship it to a shard server. This has
+ forced the introduction of {$const: ...}.
+ */
+ BSONObjBuilder constBuilder;
+ pValue->addToBsonObj(&constBuilder, "$const");
+ pBuilder->append(fieldName, constBuilder.done());
+ }
+
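+ /*
+ For example (illustrative): if constant folding reduces a projected
+ field to the number 7, the top level is re-materialized as
+ {field: {$const: 7}} rather than a bare {field: 7}, which would
+ presumably be misread as an inclusion specification.
+ */
+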
+ void ExpressionConstant::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pValue->addToBsonArray(pBuilder);
+ }
+
+ const char *ExpressionConstant::getOpName() const {
+ assert(false); // this has no name
+ return NULL;
+ }
+
+ /* ---------------------- ExpressionDayOfMonth ------------------------- */
+
+ ExpressionDayOfMonth::~ExpressionDayOfMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfMonth::create() {
+ intrusive_ptr<ExpressionDayOfMonth> pExpression(new ExpressionDayOfMonth());
+ return pExpression;
+ }
+
+ ExpressionDayOfMonth::ExpressionDayOfMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mday);
+ }
+
+ const char *ExpressionDayOfMonth::getOpName() const {
+ return "$dayOfMonth";
+ }
+
+ /* ------------------------- ExpressionDayOfWeek ----------------------------- */
+
+ ExpressionDayOfWeek::~ExpressionDayOfWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfWeek::create() {
+ intrusive_ptr<ExpressionDayOfWeek> pExpression(new ExpressionDayOfWeek());
+ return pExpression;
+ }
+
+ ExpressionDayOfWeek::ExpressionDayOfWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_wday+1); // MySQL uses 1-7 tm uses 0-6
+ }
+
+ const char *ExpressionDayOfWeek::getOpName() const {
+ return "$dayOfWeek";
+ }
+
+ /* ------------------------- ExpressionDayOfYear ----------------------------- */
+
+ ExpressionDayOfYear::~ExpressionDayOfYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDayOfYear::create() {
+ intrusive_ptr<ExpressionDayOfYear> pExpression(new ExpressionDayOfYear());
+ return pExpression;
+ }
+
+ ExpressionDayOfYear::ExpressionDayOfYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionDayOfYear::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDayOfYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_yday+1); // MySQL uses 1-366 tm uses 0-365
+ }
+
+ const char *ExpressionDayOfYear::getOpName() const {
+ return "$dayOfYear";
+ }
+
+ /* ----------------------- ExpressionDivide ---------------------------- */
+
+ ExpressionDivide::~ExpressionDivide() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionDivide::create() {
+ intrusive_ptr<ExpressionDivide> pExpression(new ExpressionDivide());
+ return pExpression;
+ }
+
+ ExpressionDivide::ExpressionDivide():
+ ExpressionNary() {
+ }
+
+ void ExpressionDivide::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionDivide::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ double right = pRight->coerceToDouble();
+ if (right == 0)
+ return Value::getUndefined();
+
+ double left = pLeft->coerceToDouble();
+
+ return Value::createDouble(left / right);
+ }
+
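+ /*
+ For example (illustrative): {$divide: [7, 2]} yields the double 3.5;
+ division always produces a double, and a zero divisor yields the
+ undefined value rather than raising an error.
+ */
+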
+ const char *ExpressionDivide::getOpName() const {
+ return "$divide";
+ }
+
+ /* ---------------------- ExpressionObject --------------------------- */
+
+ ExpressionObject::~ExpressionObject() {
+ }
+
+ intrusive_ptr<ExpressionObject> ExpressionObject::create() {
+ intrusive_ptr<ExpressionObject> pExpression(new ExpressionObject());
+ return pExpression;
+ }
+
+ ExpressionObject::ExpressionObject():
+ excludePaths(false),
+ path(),
+ vFieldName(),
+ vpExpression() {
+ }
+
+ intrusive_ptr<Expression> ExpressionObject::optimize() {
+ const size_t n = vpExpression.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpExpression[i]->optimize());
+ vpExpression[i] = pE;
+ }
+
+ return intrusive_ptr<Expression>(this);
+ }
+
+ void ExpressionObject::addToDocument(
+ const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t pathSize = path.size();
+ set<string>::const_iterator end(path.end());
+
+ /*
+ Take care of inclusions or exclusions. Note that _id is special,
+ in that it is always included, unless it is specifically excluded.
+ We use excludeId for that in case excludePaths is false, which means
+ to include paths.
+ */
+ if (pathSize) {
+ auto_ptr<FieldIterator> pIter(pDocument->createFieldIterator());
+ if (excludePaths) {
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(pIter->next());
+
+ /*
+ If the field in the document is not in the exclusion set,
+ add it to the result document.
+
+ Note that exclusions are only allowed on leaves, so we
+ can assume we don't have to descend recursively here.
+ */
+ if (path.find(field.first) != end)
+ continue; // we found it, so don't add it
+
+ pResult->addField(field.first, field.second);
+ }
+ }
+ else { /* !excludePaths */
+ while(pIter->more()) {
+ pair<string, intrusive_ptr<const Value> > field(
+ pIter->next());
+ /*
+ If the field in the document is in the inclusion set,
+ add it to the result document. Or, if we're not
+ excluding _id, and it is _id, include it.
+
+ Note that this could be an inclusion along a pathway,
+ so we look for an ExpressionObject in vpExpression; when
+ we find one, we populate the result with the evaluation
+ of that on the nested object, yielding relative paths.
+ This also allows us to handle intermediate arrays; if we
+ encounter one, we repeat this for each array element.
+ */
+ if (path.find(field.first) != end) {
+ /* find the Expression */
+ const size_t n = vFieldName.size();
+ size_t i;
+ Expression *pE = NULL;
+ for(i = 0; i < n; ++i) {
+ if (field.first.compare(vFieldName[i]) == 0) {
+ pE = vpExpression[i].get();
+ break;
+ }
+ }
+
+ /*
+ If we didn't find an expression, it's the last path
+ element to include.
+ */
+ if (!pE) {
+ pResult->addField(field.first, field.second);
+ continue;
+ }
+
+ ExpressionObject *pChild =
+ dynamic_cast<ExpressionObject *>(pE);
+ assert(pChild);
+
+ /*
+ Check on the type of the result object. If it's an
+ object, just walk down into that recursively, and
+ add it to the result.
+ */
+ BSONType valueType = field.second->getType();
+ if (valueType == Object) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ field.second->getDocument()));
+ pResult->addField(vFieldName[i],
+ Value::createDocument(pD));
+ }
+ else if (valueType == Array) {
+ /*
+ If it's an array, we have to do the same thing,
+ but to each array element. Then, add the array
+ of results to the current document.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pVI(
+ field.second->getArray());
+ while(pVI->more()) {
+ intrusive_ptr<Document> pD(
+ pChild->evaluateDocument(
+ pVI->next()->getDocument()));
+ result.push_back(Value::createDocument(pD));
+ }
+
+ pResult->addField(vFieldName[i],
+ Value::createArray(result));
+ }
+ }
+ }
+ }
+ }
+
+ /* add any remaining fields we haven't already taken care of */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ string fieldName(vFieldName[i]);
+
+ /* if we've already dealt with this field, above, do nothing */
+ if (path.find(fieldName) != end)
+ continue;
+
+ intrusive_ptr<const Value> pValue(
+ vpExpression[i]->evaluate(pDocument));
+
+ /*
+ Don't add non-existent values (note: different from NULL);
+ this is consistent with existing selection syntax which doesn't
+ force the appearance of non-existent fields.
+ */
+ if (pValue->getType() == Undefined)
+ continue;
+
+ pResult->addField(fieldName, pValue);
+ }
+ }
+
+ size_t ExpressionObject::getSizeHint(
+ const intrusive_ptr<Document> &pDocument) const {
+ size_t sizeHint = pDocument->getFieldCount();
+ const size_t pathSize = path.size();
+ if (!excludePaths)
+ sizeHint += pathSize;
+ else {
+ size_t excludeCount = pathSize;
+ if (sizeHint > excludeCount)
+ sizeHint -= excludeCount;
+ else
+ sizeHint = 0;
+ }
+
+ /* account for the additional computed fields */
+ sizeHint += vFieldName.size();
+
+ return sizeHint;
+ }
+
+ intrusive_ptr<Document> ExpressionObject::evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* create and populate the result */
+ intrusive_ptr<Document> pResult(
+ Document::create(getSizeHint(pDocument)));
+ addToDocument(pResult, pDocument);
+ return pResult;
+ }
+
+ intrusive_ptr<const Value> ExpressionObject::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return Value::createDocument(evaluateDocument(pDocument));
+ }
+
+ void ExpressionObject::addField(const string &fieldName,
+ const intrusive_ptr<Expression> &pExpression) {
+ /* must have an expression */
+ assert(pExpression.get());
+
+ /* parse the field path */
+ FieldPath fieldPath(fieldName);
+ uassert(16008, str::stream() <<
+ "an expression object's field names cannot be field paths (at \"" <<
+ fieldName << "\")", fieldPath.getPathLength() == 1);
+
+ /* make sure it isn't a name we've included or excluded */
+ set<string>::iterator ex(path.find(fieldName));
+ uassert(16009, str::stream() <<
+ "can't add a field to an object expression that has already been excluded (at \"" <<
+ fieldName << "\")", ex == path.end());
+
+ /* make sure it isn't a name we've already got */
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ uassert(16010, str::stream() <<
+ "can't add the same field to an object expression more than once (at \"" <<
+ fieldName << "\")",
+ fieldName.compare(vFieldName[i]) != 0);
+ }
+
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pExpression);
+ }
+
+ void ExpressionObject::includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn, bool excludeLast) {
+
+ /* get the current path field name */
+ string fieldName(pPath->getFieldName(pathi));
+ uassert(16011,
+ "an object expression can't include an empty field-name",
+ fieldName.length());
+
+ const size_t pathCount = path.size();
+
+ /* if this is the leaf-most object, stop */
+ if (pathi == pathn - 1) {
+ /*
+ Make sure the exclusion configuration of this node matches
+ the requested result. Or, that this is the first (determining)
+ specification.
+ */
+ uassert(16012, str::stream() <<
+ "incompatible exclusion for \"" <<
+ pPath->getPath(false) <<
+ "\" because of a prior inclusion that includes a common sub-path",
+ ((excludePaths == excludeLast) || !pathCount));
+
+ excludePaths = excludeLast; // if (!pathCount), set this
+ path.insert(fieldName);
+ return;
+ }
+
+ /* this level had better be about inclusions */
+ uassert(16013, str::stream() <<
+ "incompatible inclusion for \"" << pPath->getPath(false) <<
+ "\" because of a prior exclusion that includes a common sub-path",
+ !excludePaths);
+
+ /* see if we already know about this field */
+ const size_t n = vFieldName.size();
+ size_t i;
+ for(i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ break;
+ }
+
+ /* find the right object, and continue */
+ ExpressionObject *pChild;
+ if (i < n) {
+ /* the intermediate child already exists */
+ pChild = dynamic_cast<ExpressionObject *>(vpExpression[i].get());
+ assert(pChild);
+ }
+ else {
+ /*
+ If we get here, the intervening child isn't already there,
+ so create it.
+ */
+ intrusive_ptr<ExpressionObject> pSharedChild(
+ ExpressionObject::create());
+ path.insert(fieldName);
+ vFieldName.push_back(fieldName);
+ vpExpression.push_back(pSharedChild);
+ pChild = pSharedChild.get();
+ }
+
+ // LATER CW TODO turn this into a loop
+ pChild->includePath(pPath, pathi + 1, pathn, excludeLast);
+ }
+
+ void ExpressionObject::includePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), false);
+ }
+
+ void ExpressionObject::excludePath(const string &theFieldPath) {
+ /* parse the field path */
+ FieldPath fieldPath(theFieldPath);
+ includePath(&fieldPath, 0, fieldPath.getPathLength(), true);
+ }
+
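+ /*
+ Illustrative example: calling includePath("a.b") and then
+ includePath("a.c") builds a single intermediate ExpressionObject for
+ "a" whose path set holds "b" and "c"; a subsequent excludePath("a")
+ would then trip the incompatible-exclusion uassert above.
+ */
+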
+ intrusive_ptr<Expression> ExpressionObject::getField(
+ const string &fieldName) const {
+ const size_t n = vFieldName.size();
+ for(size_t i = 0; i < n; ++i) {
+ if (fieldName.compare(vFieldName[i]) == 0)
+ return vpExpression[i];
+ }
+
+ /* if we got here, we didn't find it */
+ return intrusive_ptr<Expression>();
+ }
+
+ void ExpressionObject::emitPaths(
+ BSONObjBuilder *pBuilder, vector<string> *pvPath) const {
+ if (!path.size())
+ return;
+
+ /* we use these for loops */
+ const size_t nField = vFieldName.size();
+ const size_t nPath = pvPath->size();
+
+ /*
+ We can iterate over the inclusion/exclusion paths in their
+ (random) set order because they don't affect the order that
+ fields are listed in the result. That comes from the underlying
+ Document they are fetched from.
+ */
+ for(set<string>::const_iterator end(path.end()),
+ iter(path.begin()); iter != end; ++iter) {
+
+ /* find the matching field description */
+ size_t iField = 0;
+ for(; iField < nField; ++iField) {
+ if (iter->compare(vFieldName[iField]) == 0)
+ break;
+ }
+
+ if (iField == nField) {
+ /*
+ If we didn't find a matching field description, this is the
+ leaf, so add the path.
+ */
+ stringstream ss;
+
+ for(size_t iPath = 0; iPath < nPath; ++iPath)
+ ss << (*pvPath)[iPath] << ".";
+ ss << *iter;
+
+ pBuilder->append(ss.str(), !excludePaths);
+ }
+ else {
+ /*
+ If we found a matching field description, then we need to
+ descend into the next level.
+ */
+ Expression *pE = vpExpression[iField].get();
+ ExpressionObject *pEO = dynamic_cast<ExpressionObject *>(pE);
+ assert(pEO);
+
+ /*
+ Add the current field name to the path being built up,
+ then go down into the next level.
+ */
+ PathPusher pathPusher(pvPath, vFieldName[iField]);
+ pEO->emitPaths(pBuilder, pvPath);
+ }
+ }
+ }
+
+ void ExpressionObject::documentToBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+
+ /* emit any inclusion/exclusion paths */
+ vector<string> vPath;
+ emitPaths(pBuilder, &vPath);
+
+ /* then add any expressions */
+ const size_t nField = vFieldName.size();
+ const set<string>::const_iterator pathEnd(path.end());
+ for(size_t iField = 0; iField < nField; ++iField) {
+ string fieldName(vFieldName[iField]);
+
+ /* if we already took care of this, don't repeat it */
+ if (path.find(fieldName) != pathEnd)
+ continue;
+
+ vpExpression[iField]->addToBsonObj(pBuilder, fieldName, depth + 1);
+ }
+ }
+
+ void ExpressionObject::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(fieldName, objBuilder.done());
+ }
+
+ void ExpressionObject::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+
+ BSONObjBuilder objBuilder;
+ documentToBson(&objBuilder, depth);
+ pBuilder->append(objBuilder.done());
+ }
+
+ /* --------------------- ExpressionFieldPath --------------------------- */
+
+ ExpressionFieldPath::~ExpressionFieldPath() {
+ }
+
+ intrusive_ptr<ExpressionFieldPath> ExpressionFieldPath::create(
+ const string &fieldPath) {
+ intrusive_ptr<ExpressionFieldPath> pExpression(
+ new ExpressionFieldPath(fieldPath));
+ return pExpression;
+ }
+
+ ExpressionFieldPath::ExpressionFieldPath(
+ const string &theFieldPath):
+ fieldPath(theFieldPath) {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldPath::optimize() {
+ /* nothing can be done for these */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const {
+ intrusive_ptr<const Value> pValue; /* the return value */
+
+ pValue = pDocument->getValue(fieldPath.getFieldName(index));
+
+ /* if the field doesn't exist, quit with an undefined value */
+ if (!pValue.get())
+ return Value::getUndefined();
+
+ /* if we've hit the end of the path, stop */
+ ++index;
+ if (index >= pathLength)
+ return pValue;
+
+ /*
+ We're diving deeper. If the value was null, return null.
+ */
+ BSONType type = pValue->getType();
+ if ((type == Undefined) || (type == jstNULL))
+ return Value::getUndefined();
+
+ if (type == Object) {
+ /* extract from the next level down */
+ return evaluatePath(index, pathLength, pValue->getDocument());
+ }
+
+ if (type == Array) {
+ /*
+ We're going to repeat this for each member of the array,
+ building up a new array as we go.
+ */
+ vector<intrusive_ptr<const Value> > result;
+ intrusive_ptr<ValueIterator> pIter(pValue->getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pItem(pIter->next());
+ BSONType iType = pItem->getType();
+ if ((iType == Undefined) || (iType == jstNULL)) {
+ result.push_back(pItem);
+ continue;
+ }
+
+ uassert(16014, str::stream() <<
+ "the element \"" << fieldPath.getFieldName(index) <<
+ "\" along the dotted path \"" <<
+ fieldPath.getPath(false) <<
+ "\" is not an object, and cannot be navigated",
+ iType == Object);
+ intrusive_ptr<const Value> itemResult(
+ evaluatePath(index, pathLength, pItem->getDocument()));
+ result.push_back(itemResult);
+ }
+
+ return Value::createArray(result);
+ }
+
+ uassert(16015, str::stream() <<
+ "can't navigate into value of type " << type <<
+ "at \"" << fieldPath.getFieldName(index) <<
+ "\" in dotted path \"" << fieldPath.getPath(false),
+ false);
+ return intrusive_ptr<const Value>();
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldPath::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ return evaluatePath(0, fieldPath.getPathLength(), pDocument);
+ }
+
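+ /*
+ Illustrative example: evaluating "$a.b" against {a: [{b: 1}, {b: 2}]}
+ descends into each array element and yields the array [1, 2]; a
+ missing field anywhere along the path yields the undefined value.
+ */
+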
+ void ExpressionFieldPath::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ pBuilder->append(fieldName, fieldPath.getPath(true));
+ }
+
+ void ExpressionFieldPath::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ pBuilder->append(getFieldPath(true));
+ }
+
+ /* --------------------- ExpressionFieldRange -------------------------- */
+
+ ExpressionFieldRange::~ExpressionFieldRange() {
+ }
+
+ intrusive_ptr<Expression> ExpressionFieldRange::optimize() {
+ /* if there is no range to match, this will never evaluate true */
+ if (!pRange.get())
+ return ExpressionConstant::create(Value::getFalse());
+
+ /*
+ If we ended up with a double un-ended range, anything matches. I
+ don't know how that can happen, given intersect()'s interface, but
+ here it is, just in case.
+ */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return ExpressionConstant::create(Value::getTrue());
+
+ /*
+ In all other cases, we have to test candidate values. The
+ intersect() method has already optimized those tests, so there
+ aren't any more optimizations to look for here.
+ */
+ return intrusive_ptr<Expression>(this);
+ }
+
+ intrusive_ptr<const Value> ExpressionFieldRange::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /* if there's no range, there can't be a match */
+ if (!pRange.get())
+ return Value::getFalse();
+
+ /* get the value of the specified field */
+ intrusive_ptr<const Value> pValue(pFieldPath->evaluate(pDocument));
+
+ /* see if it fits within any of the ranges */
+ if (pRange->contains(pValue))
+ return Value::getTrue();
+
+ return Value::getFalse();
+ }
+
+ void ExpressionFieldRange::addToBson(
+ Builder *pBuilder, unsigned depth) const {
+ if (!pRange.get()) {
+ /* nothing will satisfy this predicate */
+ pBuilder->append(false);
+ return;
+ }
+
+ if (!pRange->pTop.get() && !pRange->pBottom.get()) {
+ /* any value will satisfy this predicate */
+ pBuilder->append(true);
+ return;
+ }
+
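+ /* pTop and pBottom are the same pointer only for an equality
+ range (see Range's constructor), so emit a single $eq */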
+ if (pRange->pTop.get() == pRange->pBottom.get()) {
+ BSONArrayBuilder operands;
+ pFieldPath->addToBsonArray(&operands, depth);
+ pRange->pTop->addToBsonArray(&operands);
+
+ BSONObjBuilder equals;
+ equals.append("$eq", operands.arr());
+ pBuilder->append(&equals);
+ return;
+ }
+
+ BSONObjBuilder leftOperator;
+ if (pRange->pBottom.get()) {
+ BSONArrayBuilder leftOperands;
+ pFieldPath->addToBsonArray(&leftOperands, depth);
+ pRange->pBottom->addToBsonArray(&leftOperands);
+ leftOperator.append(
+ (pRange->bottomOpen ? "$gt" : "$gte"),
+ leftOperands.arr());
+
+ if (!pRange->pTop.get()) {
+ pBuilder->append(&leftOperator);
+ return;
+ }
+ }
+
+ BSONObjBuilder rightOperator;
+ if (pRange->pTop.get()) {
+ BSONArrayBuilder rightOperands;
+ pFieldPath->addToBsonArray(&rightOperands, depth);
+ pRange->pTop->addToBsonArray(&rightOperands);
+ rightOperator.append(
+ (pRange->topOpen ? "$lt" : "$lte"),
+ rightOperands.arr());
+
+ if (!pRange->pBottom.get()) {
+ pBuilder->append(&rightOperator);
+ return;
+ }
+ }
+
+ BSONArrayBuilder andOperands;
+ andOperands.append(leftOperator.done());
+ andOperands.append(rightOperator.done());
+ BSONObjBuilder andOperator;
+ andOperator.append("$and", andOperands.arr());
+ pBuilder->append(&andOperator);
+ }
+
+ void ExpressionFieldRange::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BuilderObj builder(pBuilder, fieldName);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BuilderArray builder(pBuilder);
+ addToBson(&builder, depth);
+ }
+
+ void ExpressionFieldRange::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ assert(pRange.get()); // otherwise, we can't do anything
+
+ /* if there are no endpoints, then every value is accepted */
+ if (!pRange->pBottom.get() && !pRange->pTop.get())
+ return; // nothing to add to the predicate
+
+ /* we're going to need the field path */
+ string fieldPath(pFieldPath->getFieldPath(false));
+
+ BSONObjBuilder range;
+ if (pRange->pBottom.get()) {
+ /* the test for equality doesn't generate a subobject */
+ if (pRange->pBottom.get() == pRange->pTop.get()) {
+ pRange->pBottom->addToBsonObj(pBuilder, fieldPath);
+ return;
+ }
+
+ pRange->pBottom->addToBsonObj(
+ &range, (pRange->bottomOpen ? "$gt" : "$gte"));
+ }
+
+ if (pRange->pTop.get()) {
+ pRange->pTop->addToBsonObj(
+ &range, (pRange->topOpen ? "$lt" : "$lte"));
+ }
+
+ pBuilder->append(fieldPath, range.done());
+ }
+
+ intrusive_ptr<ExpressionFieldRange> ExpressionFieldRange::create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue) {
+ intrusive_ptr<ExpressionFieldRange> pE(
+ new ExpressionFieldRange(pFieldPath, cmpOp, pValue));
+ return pE;
+ }
+
+ ExpressionFieldRange::ExpressionFieldRange(
+ const intrusive_ptr<ExpressionFieldPath> &pTheFieldPath, CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue):
+ pFieldPath(pTheFieldPath),
+ pRange(new Range(cmpOp, pValue)) {
+ }
+
+ void ExpressionFieldRange::intersect(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue) {
+
+ /* create the new range */
+ scoped_ptr<Range> pNew(new Range(cmpOp, pValue));
+
+ /*
+ Go through the range list. For every range, either add the
+ intersection of that to the range list, or if there is none, the
+ original range. This has the effect of restricting overlapping
+ ranges, but leaving non-overlapping ones as-is.
+ */
+ pRange.reset(pRange->intersect(pNew.get()));
+ }
+
+ ExpressionFieldRange::Range::Range(
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue):
+ bottomOpen(false),
+ topOpen(false),
+ pBottom(),
+ pTop() {
+ switch(cmpOp) {
+ case NE:
+ bottomOpen = topOpen = true;
+ /* FALLTHROUGH */
+ case EQ:
+ pBottom = pTop = pValue;
+ break;
+
+ case GT:
+ bottomOpen = true;
+ /* FALLTHROUGH */
+ case GTE:
+ topOpen = true;
+ pBottom = pValue;
+ break;
+
+ case LT:
+ topOpen = true;
+ /* FALLTHROUGH */
+ case LTE:
+ bottomOpen = true;
+ pTop = pValue;
+ break;
+
+ case CMP:
+ assert(false); // not allowed
+ break;
+ }
+ }
+
+ ExpressionFieldRange::Range::Range(const Range &rRange):
+ bottomOpen(rRange.bottomOpen),
+ topOpen(rRange.topOpen),
+ pBottom(rRange.pBottom),
+ pTop(rRange.pTop) {
+ }
+
+ ExpressionFieldRange::Range::Range(
+ const intrusive_ptr<const Value> &pTheBottom, bool theBottomOpen,
+ const intrusive_ptr<const Value> &pTheTop, bool theTopOpen):
+ bottomOpen(theBottomOpen),
+ topOpen(theTopOpen),
+ pBottom(pTheBottom),
+ pTop(pTheTop) {
+ }
+
+ ExpressionFieldRange::Range *ExpressionFieldRange::Range::intersect(
+ const Range *pRange) const {
+ /*
+ Find the max of the bottom end of the ranges.
+
+ Start by assuming the maximum is from pRange. Then, if we have
+ values of our own, see if they're greater.
+ */
+ intrusive_ptr<const Value> pMaxBottom(pRange->pBottom);
+ bool maxBottomOpen = pRange->bottomOpen;
+ if (pBottom.get()) {
+ if (!pRange->pBottom.get()) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ else {
+ const int cmp = Value::compare(pBottom, pRange->pBottom);
+ if (cmp == 0)
+ maxBottomOpen = bottomOpen || pRange->bottomOpen;
+ else if (cmp > 0) {
+ pMaxBottom = pBottom;
+ maxBottomOpen = bottomOpen;
+ }
+ }
+ }
+
+ /*
+ Find the minimum of the tops of the ranges.
+
+ Start by assuming the minimum is from pRange. Then, if we have
+ values of our own, see if they are less.
+ */
+ intrusive_ptr<const Value> pMinTop(pRange->pTop);
+ bool minTopOpen = pRange->topOpen;
+ if (pTop.get()) {
+ if (!pRange->pTop.get()) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ else {
+ const int cmp = Value::compare(pTop, pRange->pTop);
+ if (cmp == 0)
+ minTopOpen = topOpen || pRange->topOpen;
+ else if (cmp < 0) {
+ pMinTop = pTop;
+ minTopOpen = topOpen;
+ }
+ }
+ }
+
+ /*
+ If the intersections didn't create a disjoint set, create the
+ new range.
+ */
+ if (Value::compare(pMaxBottom, pMinTop) <= 0)
+ return new Range(pMaxBottom, maxBottomOpen, pMinTop, minTopOpen);
+
+ /* if we got here, the intersection is empty */
+ return NULL;
+ }
+
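+ /*
+ Worked example (illustrative): intersecting the range for $gt:3 with
+ the range for $lte:7 takes the larger bottom (3, open) and the
+ smaller top (7, closed), giving (3, 7]; intersecting $gt:3 with
+ $lte:2 leaves the max bottom above the min top, so the result is
+ NULL, the empty range.
+ */
+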
+ bool ExpressionFieldRange::Range::contains(
+ const intrusive_ptr<const Value> &pValue) const {
+ if (pBottom.get()) {
+ const int cmp = Value::compare(pValue, pBottom);
+ if (cmp < 0)
+ return false;
+ if (bottomOpen && (cmp == 0))
+ return false;
+ }
+
+ if (pTop.get()) {
+ const int cmp = Value::compare(pValue, pTop);
+ if (cmp > 0)
+ return false;
+ if (topOpen && (cmp == 0))
+ return false;
+ }
+
+ return true;
+ }
+
+ /* ------------------------- ExpressionMinute ----------------------------- */
+
+ ExpressionMinute::~ExpressionMinute() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMinute::create() {
+ intrusive_ptr<ExpressionMinute> pExpression(new ExpressionMinute());
+ return pExpression;
+ }
+
+ ExpressionMinute::ExpressionMinute():
+ ExpressionNary() {
+ }
+
+ void ExpressionMinute::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMinute::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_min);
+ }
+
+ const char *ExpressionMinute::getOpName() const {
+ return "$minute";
+ }
+
+ /* ----------------------- ExpressionMod ---------------------------- */
+
+ ExpressionMod::~ExpressionMod() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMod::create() {
+ intrusive_ptr<ExpressionMod> pExpression(new ExpressionMod());
+ return pExpression;
+ }
+
+ ExpressionMod::ExpressionMod():
+ ExpressionNary() {
+ }
+
+ void ExpressionMod::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMod::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(pRight->getType(), pLeft->getType());
+
+ long long right = pRight->coerceToLong();
+ if (right == 0)
+ return Value::getUndefined();
+
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left % right);
+ return Value::createInt((int)(left % right));
+ }
+
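+ /*
+ For example (illustrative): {$mod: [10, 3]} yields the int 1; if
+ either operand is a long the result is a long, and a zero divisor
+ yields the undefined value.
+ */
+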
+ const char *ExpressionMod::getOpName() const {
+ return "$mod";
+ }
+
+ /* ------------------------- ExpressionMonth ----------------------------- */
+
+ ExpressionMonth::~ExpressionMonth() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMonth::create() {
+ intrusive_ptr<ExpressionMonth> pExpression(new ExpressionMonth());
+ return pExpression;
+ }
+
+ ExpressionMonth::ExpressionMonth():
+ ExpressionNary() {
+ }
+
+ void ExpressionMonth::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionMonth::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_mon+1); // MySQL uses 1-12 tm uses 0-11
+ }
+
+ const char *ExpressionMonth::getOpName() const {
+ return "$month";
+ }
+
+ /* ------------------------- ExpressionMultiply ----------------------------- */
+
+ ExpressionMultiply::~ExpressionMultiply() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionMultiply::create() {
+ intrusive_ptr<ExpressionMultiply> pExpression(new ExpressionMultiply());
+ return pExpression;
+ }
+
+ ExpressionMultiply::ExpressionMultiply():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionMultiply::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ /*
+ We'll try to return the narrowest possible result value. To do that
+ without creating intermediate Values, do the arithmetic for double
+ and integral types in parallel, tracking the current narrowest
+ type.
+ */
+ double doubleProduct = 1;
+ long long longProduct = 1;
+ BSONType productType = NumberInt;
+
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+
+ productType = Value::getWidestNumeric(productType, pValue->getType());
+ doubleProduct *= pValue->coerceToDouble();
+ longProduct *= pValue->coerceToLong();
+ }
+
+ if (productType == NumberDouble)
+ return Value::createDouble(doubleProduct);
+ if (productType == NumberLong)
+ return Value::createLong(longProduct);
+ return Value::createInt((int)longProduct);
+ }
+
+ const char *ExpressionMultiply::getOpName() const {
+ return "$multiply";
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionMultiply::getFactory() const)() {
+ return ExpressionMultiply::create;
+ }
+
+ /* ------------------------- ExpressionHour ----------------------------- */
+
+ ExpressionHour::~ExpressionHour() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionHour::create() {
+ intrusive_ptr<ExpressionHour> pExpression(new ExpressionHour());
+ return pExpression;
+ }
+
+ ExpressionHour::ExpressionHour():
+ ExpressionNary() {
+ }
+
+ void ExpressionHour::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionHour::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_hour);
+ }
+
+ const char *ExpressionHour::getOpName() const {
+ return "$hour";
+ }
+
+ /* ----------------------- ExpressionIfNull ---------------------------- */
+
+ ExpressionIfNull::~ExpressionIfNull() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionIfNull::create() {
+ intrusive_ptr<ExpressionIfNull> pExpression(new ExpressionIfNull());
+ return pExpression;
+ }
+
+ ExpressionIfNull::ExpressionIfNull():
+ ExpressionNary() {
+ }
+
+ void ExpressionIfNull::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionIfNull::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ BSONType leftType = pLeft->getType();
+
+ if ((leftType != Undefined) && (leftType != jstNULL))
+ return pLeft;
+
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ return pRight;
+ }
+
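+ /*
+ For example (illustrative): {$ifNull: ["$maybe", "fallback"]} returns
+ the value of "$maybe" unless it is null or undefined, in which case
+ the second operand is evaluated and returned instead.
+ */
+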
+ const char *ExpressionIfNull::getOpName() const {
+ return "$ifNull";
+ }
+
+ /* ------------------------ ExpressionNary ----------------------------- */
+
+ ExpressionNary::ExpressionNary():
+ vpOperand() {
+ }
+
+ intrusive_ptr<Expression> ExpressionNary::optimize() {
+ unsigned constCount = 0; // count of constant operands
+ unsigned stringCount = 0; // count of constant string operands
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pNew(vpOperand[i]->optimize());
+
+ /* substitute the optimized expression */
+ vpOperand[i] = pNew;
+
+ /* check to see if the result was a constant */
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pNew.get());
+ if (pConst) {
+ ++constCount;
+ if (pConst->getValue()->getType() == String)
+ ++stringCount;
+ }
+ }
+
+ /*
+ If all the operands are constant, we can replace this expression
+ with a constant. We can find the value by evaluating this
+ expression over a NULL Document because evaluating the
+ ExpressionConstant never refers to the argument Document.
+ */
+ if (constCount == n) {
+ intrusive_ptr<const Value> pResult(
+ evaluate(intrusive_ptr<Document>()));
+ intrusive_ptr<Expression> pReplacement(
+ ExpressionConstant::create(pResult));
+ return pReplacement;
+ }
+
+ /*
+ If there are any strings, we can't re-arrange anything, so stop
+ now.
+
+ LATER: we could concatenate adjacent strings as a special case.
+ */
+ if (stringCount)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If there's no more than one constant, then we can't do any
+ constant folding, so don't bother going any further.
+ */
+ if (constCount <= 1)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ If the operator isn't commutative or associative, there's nothing
+ more we can do. We test that by seeing if we can get a factory;
+ if we can, we can use it to construct a temporary expression which
+ we'll evaluate to collapse as many constants as we can down to
+ a single one.
+ */
+ intrusive_ptr<ExpressionNary> (*const pFactory)() = getFactory();
+ if (!pFactory)
+ return intrusive_ptr<Expression>(this);
+
+ /*
+ Create a new Expression that will be the replacement for this one.
+ We actually create two: one to hold constant expressions, and
+ one to hold non-constants. Once we've got these, we evaluate
+ the constant expression to produce a single value, as above.
+ We then add this operand to the end of the non-constant expression,
+ and return that.
+ */
+ intrusive_ptr<ExpressionNary> pNew((*pFactory)());
+ intrusive_ptr<ExpressionNary> pConst((*pFactory)());
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<Expression> pE(vpOperand[i]);
+ if (dynamic_cast<ExpressionConstant *>(pE.get()))
+ pConst->addOperand(pE);
+ else {
+ /*
+ If the child operand is the same type as this, then we can
+ extract its operands and inline them here because we already
+ know this is commutative and associative because it has a
+ factory. We can detect sameness of the child operator by
+ checking for equality of the factory.
+
+ Note we don't have to do this recursively, because we
+ called optimize() on all the children first thing in
+ this call to optimize().
+ */
+ ExpressionNary *pNary =
+ dynamic_cast<ExpressionNary *>(pE.get());
+ if (!pNary)
+ pNew->addOperand(pE);
+ else {
+ intrusive_ptr<ExpressionNary> (*const pChildFactory)() =
+ pNary->getFactory();
+ if (pChildFactory != pFactory)
+ pNew->addOperand(pE);
+ else {
+ /* same factory, so flatten */
+ size_t nChild = pNary->vpOperand.size();
+ for(size_t iChild = 0; iChild < nChild; ++iChild) {
+ intrusive_ptr<Expression> pCE(
+ pNary->vpOperand[iChild]);
+ if (dynamic_cast<ExpressionConstant *>(pCE.get()))
+ pConst->addOperand(pCE);
+ else
+ pNew->addOperand(pCE);
+ }
+ }
+ }
+ }
+ }
+
+ /*
+ If there was only one constant, add it to the end of the expression
+ operand vector.
+ */
+ if (pConst->vpOperand.size() == 1)
+ pNew->addOperand(pConst->vpOperand[0]);
+ else if (pConst->vpOperand.size() > 1) {
+ /*
+ If there was more than one constant, collapse all the constants
+ together before adding the result to the end of the expression
+ operand vector.
+ */
+ intrusive_ptr<const Value> pResult(
+ pConst->evaluate(intrusive_ptr<Document>()));
+ pNew->addOperand(ExpressionConstant::create(pResult));
+ }
+
+ return pNew;
+ }
+
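+ /*
+ Illustrative example of the folding above: {$add: [1, "$x",
+ {$add: ["$y", 2]}, 3]} flattens the nested $add (same factory),
+ gathers the constants 1, 2, and 3, folds them to 6, and yields
+ {$add: ["$x", "$y", 6]}.
+ */
+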
+ void ExpressionNary::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ vpOperand.push_back(pExpression);
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionNary::getFactory() const)() {
+ return NULL;
+ }
+
+ void ExpressionNary::toBson(
+ BSONObjBuilder *pBuilder, const char *pOpName, unsigned depth) const {
+ const size_t nOperand = vpOperand.size();
+ assert(nOperand > 0);
+ if (nOperand == 1) {
+ vpOperand[0]->addToBsonObj(pBuilder, pOpName, depth + 1);
+ return;
+ }
+
+ /* build up the array */
+ BSONArrayBuilder arrBuilder;
+ for(size_t i = 0; i < nOperand; ++i)
+ vpOperand[i]->addToBsonArray(&arrBuilder, depth + 1);
+
+ pBuilder->append(pOpName, arrBuilder.arr());
+ }
+
+ void ExpressionNary::addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(fieldName, exprBuilder.done());
+ }
+
+ void ExpressionNary::addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder exprBuilder;
+ toBson(&exprBuilder, getOpName(), depth);
+ pBuilder->append(exprBuilder.done());
+ }
+
+ void ExpressionNary::checkArgLimit(unsigned maxArgs) const {
+ uassert(15993, str::stream() << getOpName() <<
+ " only takes " << maxArgs <<
+ " operand" << (maxArgs == 1 ? "" : "s"),
+ vpOperand.size() < maxArgs);
+ }
+
+ void ExpressionNary::checkArgCount(unsigned reqArgs) const {
+ uassert(15997, str::stream() << getOpName() <<
+ ": insufficient operands; " << reqArgs <<
+ " required, only got " << vpOperand.size(),
+ vpOperand.size() == reqArgs);
+ }
+
+ /* ----------------------- ExpressionNoOp ------------------------------ */
+
+ ExpressionNoOp::~ExpressionNoOp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNoOp::create() {
+ intrusive_ptr<ExpressionNoOp> pExpression(new ExpressionNoOp());
+ return pExpression;
+ }
+
+ intrusive_ptr<Expression> ExpressionNoOp::optimize() {
+ checkArgCount(1);
+ intrusive_ptr<Expression> pR(vpOperand[0]->optimize());
+ return pR;
+ }
+
+ ExpressionNoOp::ExpressionNoOp():
+ ExpressionNary() {
+ }
+
+ void ExpressionNoOp::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNoOp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pValue(vpOperand[0]->evaluate(pDocument));
+ return pValue;
+ }
+
+ const char *ExpressionNoOp::getOpName() const {
+ return "$noOp";
+ }
+
+ /* ------------------------- ExpressionNot ----------------------------- */
+
+ ExpressionNot::~ExpressionNot() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionNot::create() {
+ intrusive_ptr<ExpressionNot> pExpression(new ExpressionNot());
+ return pExpression;
+ }
+
+ ExpressionNot::ExpressionNot():
+ ExpressionNary() {
+ }
+
+ void ExpressionNot::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionNot::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pOp(vpOperand[0]->evaluate(pDocument));
+
+ bool b = pOp->coerceToBool();
+ if (b)
+ return Value::getFalse();
+ return Value::getTrue();
+ }
+
+ const char *ExpressionNot::getOpName() const {
+ return "$not";
+ }
+
+ /* -------------------------- ExpressionOr ----------------------------- */
+
+ ExpressionOr::~ExpressionOr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionOr::create() {
+ intrusive_ptr<ExpressionNary> pExpression(new ExpressionOr());
+ return pExpression;
+ }
+
+ ExpressionOr::ExpressionOr():
+ ExpressionNary() {
+ }
+
+ intrusive_ptr<const Value> ExpressionOr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i) {
+ intrusive_ptr<const Value> pValue(vpOperand[i]->evaluate(pDocument));
+ if (pValue->coerceToBool())
+ return Value::getTrue();
+ }
+
+ return Value::getFalse();
+ }
+
+ void ExpressionOr::toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const {
+ BSONObjBuilder opArray;
+ const size_t n = vpOperand.size();
+ for(size_t i = 0; i < n; ++i)
+ vpOperand[i]->toMatcherBson(&opArray, depth + 1);
+
+ pBuilder->append("$or", opArray.done());
+ }
+
+ intrusive_ptr<ExpressionNary> (*ExpressionOr::getFactory() const)() {
+ return ExpressionOr::create;
+ }
+
+ intrusive_ptr<Expression> ExpressionOr::optimize() {
+ /* optimize the disjunction as much as possible */
+ intrusive_ptr<Expression> pE(ExpressionNary::optimize());
+
+ /* if the result isn't a disjunction, we can't do anything */
+ ExpressionOr *pOr = dynamic_cast<ExpressionOr *>(pE.get());
+ if (!pOr)
+ return pE;
+
+ /*
+ Check the last argument on the result; if it's not constant (as
+ promised by ExpressionNary::optimize()), then there's nothing
+ we can do.
+ */
+ const size_t n = pOr->vpOperand.size();
+ intrusive_ptr<Expression> pLast(pOr->vpOperand[n - 1]);
+ const ExpressionConstant *pConst =
+ dynamic_cast<ExpressionConstant *>(pLast.get());
+ if (!pConst)
+ return pE;
+
+ /*
+ Evaluate and coerce the last argument to a boolean. If it's true,
+ then we can replace this entire expression.
+ */
+ bool last = pLast->evaluate(intrusive_ptr<Document>())->coerceToBool();
+ if (last) {
+ intrusive_ptr<ExpressionConstant> pFinal(
+ ExpressionConstant::create(Value::getTrue()));
+ return pFinal;
+ }
+
+ /*
+ If we got here, the final operand was false, so we don't need it
+ anymore. If there was only one other operand, we don't need the
+ disjunction either. Note we still need to keep the promise that
+ the result will be a boolean.
+ */
+ if (n == 2) {
+ intrusive_ptr<Expression> pFinal(
+ ExpressionCoerceToBool::create(pOr->vpOperand[0]));
+ return pFinal;
+ }
+
+ /*
+ Remove the final "false" value, and return the new expression.
+ */
+ pOr->vpOperand.resize(n - 1);
+ return pE;
+ }
+
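+ /*
+   Illustrative results of the folding above:
+     { $or : [ "$a", true ] }        becomes the constant true;
+     { $or : [ "$a", false ] }       becomes coerceToBool("$a");
+     { $or : [ "$a", "$b", false ] } drops the trailing false.
+ */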
+ const char *ExpressionOr::getOpName() const {
+ return "$or";
+ }
+
+ /* ------------------------- ExpressionSecond ----------------------------- */
+
+ ExpressionSecond::~ExpressionSecond() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSecond::create() {
+ intrusive_ptr<ExpressionSecond> pExpression(new ExpressionSecond());
+ return pExpression;
+ }
+
+ ExpressionSecond::ExpressionSecond():
+ ExpressionNary() {
+ }
+
+ void ExpressionSecond::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSecond::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_sec);
+ }
+
+ const char *ExpressionSecond::getOpName() const {
+ return "$second";
+ }
+
+ /* ----------------------- ExpressionStrcasecmp ---------------------------- */
+
+ ExpressionStrcasecmp::~ExpressionStrcasecmp() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionStrcasecmp::create() {
+ intrusive_ptr<ExpressionStrcasecmp> pExpression(new ExpressionStrcasecmp());
+ return pExpression;
+ }
+
+ ExpressionStrcasecmp::ExpressionStrcasecmp():
+ ExpressionNary() {
+ }
+
+ void ExpressionStrcasecmp::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionStrcasecmp::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(2);
+ intrusive_ptr<const Value> pString1(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pString2(vpOperand[1]->evaluate(pDocument));
+
+ /* boost::iequals only reports equality (a bool, not an ordering),
+ so make upper-cased copies and compare those to get -1/0/1 */
+ string str1 = boost::to_upper_copy( pString1->coerceToString() );
+ string str2 = boost::to_upper_copy( pString2->coerceToString() );
+ int result = str1.compare(str2);
+
+ if (result == 0)
+ return Value::getZero();
+ if (result > 0)
+ return Value::getOne();
+ return Value::getMinusOne();
+ }
+
+ const char *ExpressionStrcasecmp::getOpName() const {
+ return "$strcasecmp";
+ }
+
+ /* ----------------------- ExpressionSubstr ---------------------------- */
+
+ ExpressionSubstr::~ExpressionSubstr() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubstr::create() {
+ intrusive_ptr<ExpressionSubstr> pExpression(new ExpressionSubstr());
+ return pExpression;
+ }
+
+ ExpressionSubstr::ExpressionSubstr():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubstr::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(3);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubstr::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(3);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLower(vpOperand[1]->evaluate(pDocument));
+ intrusive_ptr<const Value> pLength(vpOperand[2]->evaluate(pDocument));
+
+ string str = pString->coerceToString();
+ uassert(16034, str::stream() << getOpName() <<
+ ": starting index must be a numeric type (is BSON type " <<
+ pLower->getType() << ")",
+ (pLower->getType() == NumberInt
+ || pLower->getType() == NumberLong
+ || pLower->getType() == NumberDouble));
+ uassert(16035, str::stream() << getOpName() <<
+ ": length must be a numeric type (is BSON type " <<
+ pLength->getType() << ")",
+ (pLength->getType() == NumberInt
+ || pLength->getType() == NumberLong
+ || pLength->getType() == NumberDouble));
+ string::size_type lower = static_cast< string::size_type >( pLower->coerceToLong() );
+ string::size_type length = static_cast< string::size_type >( pLength->coerceToLong() );
+ return Value::createString( str.substr(lower, length) );
+ }
+
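+ /*
+   For example, { $substr : [ "hello", 1, 3 ] } evaluates to "ell";
+   indexing is byte-based, following std::string::substr.
+ */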
+ const char *ExpressionSubstr::getOpName() const {
+ return "$substr";
+ }
+
+ /* ----------------------- ExpressionSubtract ---------------------------- */
+
+ ExpressionSubtract::~ExpressionSubtract() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionSubtract::create() {
+ intrusive_ptr<ExpressionSubtract> pExpression(new ExpressionSubtract());
+ return pExpression;
+ }
+
+ ExpressionSubtract::ExpressionSubtract():
+ ExpressionNary() {
+ }
+
+ void ExpressionSubtract::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(2);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionSubtract::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ BSONType productType;
+ checkArgCount(2);
+ intrusive_ptr<const Value> pLeft(vpOperand[0]->evaluate(pDocument));
+ intrusive_ptr<const Value> pRight(vpOperand[1]->evaluate(pDocument));
+ if (pLeft->getType() == Date) {
+ long long right;
+ long long left = pLeft->coerceToDate();
+ if (pRight->getType() == Date)
+ right = pRight->coerceToDate();
+ else
+ right = static_cast<long long>(pRight->coerceToDouble()*24*60*60*1000);
+ return Value::createDate(Date_t(left-right));
+ }
+
+ uassert(15996, "cannot subtract one date from another",
+ pRight->getType() != Date);
+
+ productType = Value::getWidestNumeric(
+ pRight->getType(), pLeft->getType());
+
+
+ if (productType == NumberDouble) {
+ double right = pRight->coerceToDouble();
+ double left = pLeft->coerceToDouble();
+ return Value::createDouble(left - right);
+ }
+
+ long long right = pRight->coerceToLong();
+ long long left = pLeft->coerceToLong();
+ if (productType == NumberLong)
+ return Value::createLong(left - right);
+ return Value::createInt((int)(left - right));
+ }
+
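+ /*
+   Note on the date arithmetic above: when only the left operand is a
+   date, the right operand is interpreted as a number of days, so
+   (date - 1.5) subtracts 36 hours (1.5 * 24 * 60 * 60 * 1000 ms).
+   Subtracting two dates yields their difference in milliseconds
+   (carried in a date-typed value).
+ */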
+ const char *ExpressionSubtract::getOpName() const {
+ return "$subtract";
+ }
+
+ /* ------------------------- ExpressionToLower ----------------------------- */
+
+ ExpressionToLower::~ExpressionToLower() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToLower::create() {
+ intrusive_ptr<ExpressionToLower> pExpression(new ExpressionToLower());
+ return pExpression;
+ }
+
+ ExpressionToLower::ExpressionToLower():
+ ExpressionNary() {
+ }
+
+ void ExpressionToLower::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToLower::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str = pString->coerceToString();
+ boost::to_lower(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToLower::getOpName() const {
+ return "$toLower";
+ }
+
+ /* ------------------------- ExpressionToUpper -------------------------- */
+
+ ExpressionToUpper::~ExpressionToUpper() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionToUpper::create() {
+ intrusive_ptr<ExpressionToUpper> pExpression(new ExpressionToUpper());
+ return pExpression;
+ }
+
+ ExpressionToUpper::ExpressionToUpper():
+ ExpressionNary() {
+ }
+
+ void ExpressionToUpper::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionToUpper::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pString(vpOperand[0]->evaluate(pDocument));
+ string str(pString->coerceToString());
+ boost::to_upper(str);
+ return Value::createString(str);
+ }
+
+ const char *ExpressionToUpper::getOpName() const {
+ return "$toUpper";
+ }
+
+ /* ------------------------- ExpressionWeek ----------------------------- */
+
+ ExpressionWeek::~ExpressionWeek() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionWeek::create() {
+ intrusive_ptr<ExpressionWeek> pExpression(new ExpressionWeek());
+ return pExpression;
+ }
+
+ ExpressionWeek::ExpressionWeek():
+ ExpressionNary() {
+ }
+
+ void ExpressionWeek::addOperand(const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionWeek::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ int dayOfWeek = date.tm_wday+1;
+ int dayOfYear = date.tm_yday;
+ int week = 0;
+ int janFirst = 0;
+ int offset = 0;
+
+ janFirst = dayOfWeek - dayOfYear % 7;
+ offset = (janFirst + 6) % 7;
+ week = (dayOfYear + offset) / 7;
+ return Value::createInt(week);
+ }
+
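+ /*
+   How the arithmetic above works: janFirst recovers the weekday
+   number of January 1st (Sunday == 1, modulo 7) from the current
+   weekday and day of year; offset then shifts the day of year so the
+   integer division by 7 increments exactly at Sunday boundaries, with
+   days before the year's first such boundary falling in week 0.
+ */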
+ const char *ExpressionWeek::getOpName() const {
+ return "$week";
+ }
+
+ /* ------------------------- ExpressionYear ----------------------------- */
+
+ ExpressionYear::~ExpressionYear() {
+ }
+
+ intrusive_ptr<ExpressionNary> ExpressionYear::create() {
+ intrusive_ptr<ExpressionYear> pExpression(new ExpressionYear());
+ return pExpression;
+ }
+
+ ExpressionYear::ExpressionYear():
+ ExpressionNary() {
+ }
+
+ void ExpressionYear::addOperand(
+ const intrusive_ptr<Expression> &pExpression) {
+ checkArgLimit(1);
+ ExpressionNary::addOperand(pExpression);
+ }
+
+ intrusive_ptr<const Value> ExpressionYear::evaluate(
+ const intrusive_ptr<Document> &pDocument) const {
+ checkArgCount(1);
+ intrusive_ptr<const Value> pDate(vpOperand[0]->evaluate(pDocument));
+ tm date;
+ (pDate->coerceToDate()).toTm(&date);
+ return Value::createInt(date.tm_year+1900); // tm_year is years since 1900
+ }
+
+ const char *ExpressionYear::getOpName() const {
+ return "$year";
+ }
+}
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h new file mode 100755 index 00000000000..c49e385a3c7 --- /dev/null +++ b/src/mongo/db/pipeline/expression.h @@ -0,0 +1,1223 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "db/pipeline/field_path.h"
+#include "util/intrusive_counter.h"
+
+
+namespace mongo {
+ class BSONArrayBuilder;
+ class BSONElement;
+ class BSONObjBuilder;
+ class Builder;
+ class Document;
+ class ExpressionContext;
+ class Value;
+
+ class Expression :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~Expression() {};
+
+ /*
+ Optimize the Expression.
+
+ This provides an opportunity to do constant folding, or to
+ collapse nested operators that have the same precedence, such as
+ $add, $and, or $or.
+
+ The Expression should be replaced with the return value, which may
+ or may not be the same object. In the case of constant folding,
+ a computed expression may be replaced by a constant.
+
+ @returns the optimized Expression
+ */
+ virtual intrusive_ptr<Expression> optimize() = 0;
+
+ /*
+ Evaluate the Expression using the given document as input.
+
+ @returns the computed value
+ */
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ object that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ @param fieldName the name the object should be given
+ */
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName,
+ unsigned depth) const = 0;
+
+ /*
+ Add the Expression (and any descendant Expressions) into a BSON
+ array that is under construction.
+
+ Unevaluated Expressions always materialize as objects. Evaluation
+ may produce a scalar or another object, either of which will be
+ substituted inline.
+
+ @param pBuilder the builder to add the expression to
+ */
+ virtual void addToBsonArray(BSONArrayBuilder *pBuilder,
+ unsigned depth) const = 0;
+
+ /*
+ Convert the expression into a BSONObj that corresponds to the
+ db.collection.find() predicate language. This is intended for
+ use by DocumentSourceFilter.
+
+ This is more limited than the full expression language supported
+ by all available expressions in a DocumentSource processing
+ pipeline, and will fail with an assertion if an attempt is made
+ to go outside the bounds of the recognized patterns, which don't
+ include full computed expressions. There are other methods available
+ on DocumentSourceFilter which can be used to analyze a filter
+ predicate and break it up into appropriate expressions which can
+ be translated within these constraints. As a result, the default
+ implementation is to fail with an assertion; only a subset of
+ operators will be able to fulfill this request.
+
+ @param pBuilder the builder to add the expression to.
+ */
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Utility class for parseObject() below.
+
+ Only one array can be unwound in a processing pipeline. If the
+ UNWIND_OK option is used, unwindOk() will return true, and a field
+ can be declared as unwound using unwind(), after which unwindUsed()
+ will return true. Only specify UNWIND_OK if it is OK to unwind an
+ array in the current context.
+
+ DOCUMENT_OK indicates that it is OK to use a Document in the current
+ context.
+ */
+ class ObjectCtx {
+ public:
+ ObjectCtx(int options);
+ static const int UNWIND_OK = 0x0001;
+ static const int DOCUMENT_OK = 0x0002;
+
+ bool unwindOk() const;
+ bool unwindUsed() const;
+ void unwind(string fieldName);
+
+ bool documentOk() const;
+
+ private:
+ int options;
+ string unwindField;
+ };
+
+ /*
+ Parse a BSONElement Object. The object could represent a functional
+ expression or a Document expression.
+
+ @param pBsonElement the element representing the object
+ @param pCtx an ObjectCtx representing the options above
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseObject(
+ BSONElement *pBsonElement, ObjectCtx *pCtx);
+
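+ /*
+   A usage sketch (illustrative): a caller that permits both
+   unwinding and nested documents would parse with
+
+     Expression::ObjectCtx objCtx(
+         Expression::ObjectCtx::UNWIND_OK |
+         Expression::ObjectCtx::DOCUMENT_OK);
+     intrusive_ptr<Expression> pExpr(
+         Expression::parseObject(&bsonElement, &objCtx));
+
+   where bsonElement is assumed to hold the object specification.
+ */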
+ static const char unwindName[];
+
+ /*
+ Parse a BSONElement Object which has already been determined to be
+ a functional expression.
+
+ @param pOpName the name of the (prefix) operator
+ @param pBsonElement the BSONElement to parse
+ @returns the parsed Expression
+ */
+ static intrusive_ptr<Expression> parseExpression(
+ const char *pOpName, BSONElement *pBsonElement);
+
+
+ /*
+ Parse a BSONElement which is an operand in an Expression.
+
+ @param pBsonElement the expected operand's BSONElement
+ @returns the parsed operand, as an Expression
+ */
+ static intrusive_ptr<Expression> parseOperand(
+ BSONElement *pBsonElement);
+
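+ /*
+   Illustrative dispatch: for a spec such as { $add : [ 1, "$a" ] },
+   parseExpression("$add", ...) creates the operator node, and
+   parseOperand() is applied to each array element, yielding an
+   ExpressionConstant for 1 and an ExpressionFieldPath for "$a".
+ */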
+ /*
+ Produce a field path string with the field prefix removed.
+
+ Throws an error if the field prefix is not present.
+
+ @param prefixedField the prefixed field
+ @returns the field path with the prefix removed
+ */
+ static string removeFieldPrefix(const string &prefixedField);
+
+ /*
+ Enumeration of comparison operators. These are shared between a
+ few expression implementations, so they are factored out here.
+
+ Any changes to these values require adjustment of the lookup
+ table in the implementation.
+ */
+ enum CmpOp {
+ EQ = 0, // return true for a == b, false otherwise
+ NE = 1, // return true for a != b, false otherwise
+ GT = 2, // return true for a > b, false otherwise
+ GTE = 3, // return true for a >= b, false otherwise
+ LT = 4, // return true for a < b, false otherwise
+ LTE = 5, // return true for a <= b, false otherwise
+ CMP = 6, // return -1, 0, 1 for a < b, a == b, a > b
+ };
+
+ static int signum(int i);
+ };
+
+
+ class ExpressionNary :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionNary> {
+ public:
+ // virtuals from Expression
+ virtual intrusive_ptr<Expression> optimize();
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Add an operand to the n-ary expression.
+
+ @param pExpression the expression to add
+ */
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Return a factory function that will make Expression nodes of
+ the same type as this. This will be used to create constant
+ expressions for constant folding for optimize(). Only return
+ a factory function if this operator is both associative and
+ commutative. The default implementation returns NULL; optimize()
+ will recognize that and stop.
+
+ Note that ExpressionNary::optimize() promises that if it uses this
+ to fold constants, then if optimize() returns an ExpressionNary,
+ any remaining constant will be the last one in vpOperand. Derived
+ classes may take advantage of this to do further optimizations in
+ their optimize().
+
+ @returns pointer to a factory function or NULL
+ */
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Get the name of the operator.
+
+ @returns the name of the operator; this string belongs to the class
+ implementation, and should not be deleted
+ */
+ virtual const char *getOpName() const = 0;
+
+ protected:
+ ExpressionNary();
+
+ vector<intrusive_ptr<Expression> > vpOperand;
+
+ /*
+ Add the expression to the builder.
+
+ If there is only one operand (a unary operator), then the operand
+ is added directly, without an array. For more than one operand,
+ a named array is created. In both cases, the result is an object.
+
+ @param pBuilder the (blank) builder to add the expression to
+ @param pOpName the name of the operator
+ */
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ /*
+ Checks the current size of vpOperand; if the size is equal to or
+ greater than maxArgs, fires a user assertion indicating that this
+ operator cannot have this many arguments.
+
+ Equality also triggers the assertion because this is intended to be
+ used in addOperand() to check the limit *before* adding the
+ requested argument.
+
+ @param maxArgs the maximum number of arguments the operator accepts
+ */
+ void checkArgLimit(unsigned maxArgs) const;
+
+ /*
+ Checks the current size of vpOperand; if the size is not equal to
+ reqArgs, fires a user assertion indicating that this must have
+ exactly reqArgs arguments.
+
+ This is meant to be used in evaluate(), *before* the evaluation
+ takes place.
+
+ @param reqArgs the number of arguments this operator requires
+ */
+ void checkArgCount(unsigned reqArgs) const;
+ };
+
+
+ class ExpressionAdd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAdd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the sum of n operands.
+
+ @returns addition expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ protected:
+ // virtuals from ExpressionNary
+ virtual void toBson(BSONObjBuilder *pBuilder,
+ const char *pOpName, unsigned depth) const;
+
+ private:
+ ExpressionAdd();
+
+ /*
+ If the operator can be optimized, we save the original here.
+
+ This is necessary because addition must follow its original operand
+ ordering strictly if a string is detected, otherwise string
+ concatenation may appear to have re-ordered the operands.
+ */
+ intrusive_ptr<ExpressionAdd> pAdd;
+ mutable bool useOriginal;
+ };
+
+
+ class ExpressionAnd :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionAnd();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the conjunction of n operands.
+ The conjunction uses short-circuit logic; the expressions are
+ evaluated in the order they were added to the conjunction, and
+ the evaluation stops and returns false on the first operand that
+ evaluates to false.
+
+ @returns conjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionAnd();
+ };
+
+
+ class ExpressionCoerceToBool :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionCoerceToBool> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionCoerceToBool();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionCoerceToBool> create(
+ const intrusive_ptr<Expression> &pExpression);
+
+ private:
+ ExpressionCoerceToBool(const intrusive_ptr<Expression> &pExpression);
+
+ intrusive_ptr<Expression> pExpression;
+ };
+
+
+ class ExpressionCompare :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCompare();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Shorthands for creating various comparison expressions.
+ These conform to the uniform function pointer signature
+ required for parsing.
+
+ Each creates a particular comparison operator without any
+ operands; those must be added via ExpressionNary::addOperand().
+ */
+ static intrusive_ptr<ExpressionNary> createCmp();
+ static intrusive_ptr<ExpressionNary> createEq();
+ static intrusive_ptr<ExpressionNary> createNe();
+ static intrusive_ptr<ExpressionNary> createGt();
+ static intrusive_ptr<ExpressionNary> createGte();
+ static intrusive_ptr<ExpressionNary> createLt();
+ static intrusive_ptr<ExpressionNary> createLte();
+
+ private:
+ friend class ExpressionFieldRange;
+ ExpressionCompare(CmpOp cmpOp);
+
+ CmpOp cmpOp;
+ };
+
+
+ class ExpressionCond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionCond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionCond();
+ };
+
+
+ class ExpressionConstant :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionConstant> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionConstant();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ static intrusive_ptr<ExpressionConstant> createFromBsonElement(
+ BSONElement *pBsonElement);
+ static intrusive_ptr<ExpressionConstant> create(
+ const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Get the constant value represented by this Expression.
+
+ @returns the value
+ */
+ intrusive_ptr<const Value> getValue() const;
+
+ private:
+ ExpressionConstant(BSONElement *pBsonElement);
+ ExpressionConstant(const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<const Value> pValue;
+ };
+
+
+ class ExpressionDayOfMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfMonth();
+ };
+
+
+ class ExpressionDayOfWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfWeek();
+ };
+
+
+ class ExpressionDayOfYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDayOfYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDayOfYear();
+ };
+
+
+ class ExpressionDivide :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionDivide();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionDivide();
+ };
+
+
+ class ExpressionFieldPath :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldPath> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionFieldPath();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field path expression.
+
+ Evaluation will extract the value associated with the given field
+ path from the source document.
+
+ @param fieldPath the field path string, without any leading document
+ indicator
+ @returns the newly created field path expression
+ */
+ static intrusive_ptr<ExpressionFieldPath> create(
+ const string &fieldPath);
+
+ /*
+ Return a string representation of the field path.
+
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ @returns the dot-delimited field path
+ */
+ string getFieldPath(bool fieldPrefix) const;
+
+ /*
+ Write a string representation of the field path to a stream.
+
+ @param outStream the stream to write to
+ @param fieldPrefix whether or not to include the document field
+ indicator prefix
+ */
+ void writeFieldPath(ostream &outStream, bool fieldPrefix) const;
+
+ private:
+ ExpressionFieldPath(const string &fieldPath);
+
+ /*
+ Internal implementation of evaluate(), used recursively.
+
+ The internal implementation doesn't just use a loop because of
+ the possibility that we need to skip over an array. If the path
+ is "a.b.c", and a is an array, then we fan out from there, and
+ traverse "b.c" for each element of a:[...]. This requires that
+ a be an array of objects in order to navigate more deeply.
+
+ @param index current path field index to extract
+ @param pathLength maximum number of fields on field path
+ @param pDocument current document traversed to (not the top-level one)
+ @returns the field found; could be an array
+ */
+ intrusive_ptr<const Value> evaluatePath(
+ size_t index, const size_t pathLength,
+ intrusive_ptr<Document> pDocument) const;
+
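+ /*
+   For example, evaluating the path "a.b" against
+   { a : [ { b : 1 }, { b : 2 } ] } fans out across the array and
+   yields the array value [ 1, 2 ].
+ */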
+ FieldPath fieldPath;
+ };
+
+
+ class ExpressionFieldRange :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionFieldRange> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionFieldRange();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ Create a field range expression.
+
+ Field ranges are meant to match up with classic Matcher semantics,
+ and therefore are conjunctions. For example, these appear in
+ mongo shell predicates in one of these forms:
+ { a : C } -> (a == C) // degenerate "point" range
+ { a : { $lt : C } } -> (a < C) // open range
+ { a : { $gt : C1, $lte : C2 } } -> ((a > C1) && (a <= C2)) // closed
+
+ When initially created, a field range only includes one end of
+ the range. Additional points may be added via intersect().
+
+ Note that NE and CMP are not supported.
+
+ @param pFieldPath the field path for extracting the field value
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ @returns the newly created field range expression
+ */
+ static intrusive_ptr<ExpressionFieldRange> create(
+ const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ /*
+ Add an intersecting range.
+
+ This can be done any number of times after creation. The
+ range is internally optimized for each new addition. If the new
+ intersection narrows the set of values within the range, the
+ internal representation is adjusted to reflect that.
+
+ Note that NE and CMP are not supported.
+
+ @param cmpOp the comparison operator
+ @param pValue the value to compare against
+ */
+ void intersect(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+
+ private:
+ ExpressionFieldRange(const intrusive_ptr<ExpressionFieldPath> &pFieldPath,
+ CmpOp cmpOp,
+ const intrusive_ptr<const Value> &pValue);
+
+ intrusive_ptr<ExpressionFieldPath> pFieldPath;
+
+ class Range {
+ public:
+ Range(CmpOp cmpOp, const intrusive_ptr<const Value> &pValue);
+ Range(const Range &rRange);
+
+ Range *intersect(const Range *pRange) const;
+ bool contains(const intrusive_ptr<const Value> &pValue) const;
+
+ Range(const intrusive_ptr<const Value> &pBottom, bool bottomOpen,
+ const intrusive_ptr<const Value> &pTop, bool topOpen);
+
+ bool bottomOpen;
+ bool topOpen;
+ intrusive_ptr<const Value> pBottom;
+ intrusive_ptr<const Value> pTop;
+ };
+
+ scoped_ptr<Range> pRange;
+
+ /*
+ Add to a generic Builder.
+
+ The methods to append items to an object and an array differ by
+ their inclusion of a field name. For more complicated objects,
+ it makes sense to abstract that out and use a generic builder that
+ always looks the same, and then implement addToBsonObj() and
+ addToBsonArray() by using the common method.
+ */
+ void addToBson(Builder *pBuilder, unsigned depth) const;
+ };
+
+
+ class ExpressionHour :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionHour();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionHour();
+ };
+
+
+ class ExpressionIfNull :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionIfNull();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionIfNull();
+ };
+
+
+ class ExpressionMinute :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMinute();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMinute();
+ };
+
+
+ class ExpressionMod :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMod();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMod();
+ };
+
+
+ class ExpressionMultiply :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionMultiply();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the product of n operands.
+
+ @returns multiplication expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMultiply();
+ };
+
+
+ class ExpressionMonth :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionMonth();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionMonth();
+ };
+
+
+ class ExpressionNoOp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNoOp();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNoOp();
+ };
+
+
+ class ExpressionNot :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionNot();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionNot();
+ };
+
+
+ class ExpressionObject :
+ public Expression,
+ public boost::enable_shared_from_this<ExpressionObject> {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionObject();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual void addToBsonObj(
+ BSONObjBuilder *pBuilder, string fieldName, unsigned depth) const;
+ virtual void addToBsonArray(
+ BSONArrayBuilder *pBuilder, unsigned depth) const;
+
+ /*
+ evaluate(), but return a Document instead of a Value-wrapped
+ Document.
+
+ @param pDocument the input Document
+ @returns the result document
+ */
+ intrusive_ptr<Document> evaluateDocument(
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ evaluate(), but add the evaluated fields to a given document
+ instead of creating a new one.
+
+ @param pResult the Document to add the evaluated expressions to
+ @param pDocument the input Document
+ */
+ void addToDocument(const intrusive_ptr<Document> &pResult,
+ const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Estimate the number of fields that will result from evaluating
+ this over pDocument. Does not include _id. This is an estimate
+ (really an upper bound) because we can't account for undefined
+ fields without actually doing the evaluation. But this is still
+ useful as an argument to Document::create(), if you plan to use
+ addToDocument().
+
+ @param pDocument the input document
+ @returns estimated number of fields that will result
+ */
+ size_t getSizeHint(const intrusive_ptr<Document> &pDocument) const;
+
+ /*
+ Create an empty expression. Until fields are added, this
+ will evaluate to an empty document (object).
+ */
+ static intrusive_ptr<ExpressionObject> create();
+
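+ /*
+   A usage sketch (illustrative): to compute the equivalent of the
+   projection { x : "$a.b" } over an input document pDocument,
+
+     intrusive_ptr<ExpressionObject> pObj(ExpressionObject::create());
+     pObj->addField("x", ExpressionFieldPath::create("a.b"));
+     intrusive_ptr<Document> pResult(pObj->evaluateDocument(pDocument));
+ */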
+ /*
+ Add a field to the document expression.
+
+ @param fieldPath the path the evaluated expression will have in the
+ result Document
+ @param pExpression the expression to evaluate to obtain this field's
+ Value in the result Document
+ */
+ void addField(const string &fieldPath,
+ const intrusive_ptr<Expression> &pExpression);
+
+ /*
+ Add a field path to the set of those to be included.
+
+ Note that including a nested field implies including everything on
+ the path leading down to it.
+
+ @param fieldPath the name of the field to be included
+ */
+ void includePath(const string &fieldPath);
+
+ /*
+ Add a field path to the set of those to be excluded.
+
+ Note that excluding a nested field implies including everything on
+ the path leading down to it (because you're stating you want to see
+ all the other fields that aren't being excluded).
+
+ @param fieldPath the path of the field to be excluded
+ */
+ void excludePath(const string &fieldPath);
+
+ /*
+ Return the expression for a field.
+
+ @param fieldName the field name for the expression to return
+ @returns the expression used to compute the field, if it is present,
+ otherwise NULL.
+ */
+ intrusive_ptr<Expression> getField(const string &fieldName) const;
+
+ /*
+ Get a count of the added fields.
+
+ @returns how many fields have been added
+ */
+ size_t getFieldCount() const;
+
+ /*
+ Get a count of the exclusions.
+
+ @returns how many fields have been excluded.
+ */
+ size_t getExclusionCount() const;
+
+ /*
+ Specialized BSON conversion that allows for writing out a
+ $project specification. This creates a standalone object, which must
+ be added to a containing object with a name.
+
+ @param pBuilder where to write the object to
+ */
+ void documentToBson(BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ private:
+ ExpressionObject();
+
+ void includePath(
+ const FieldPath *pPath, size_t pathi, size_t pathn,
+ bool excludeLast);
+
+ bool excludePaths;
+ set<string> path;
+
+ /* these two vectors are maintained in parallel */
+ vector<string> vFieldName;
+ vector<intrusive_ptr<Expression> > vpExpression;
+
+ /*
+ Utility function used by documentToBson(). Emits inclusion
+ and exclusion paths by recursively walking down the nested
+ ExpressionObject trees these have created.
+
+ @param pBuilder the builder to write boolean valued path "fields" to
+ @param pvPath pointer to a vector of strings describing the path on
+ descent; the top-level call should pass an empty vector
+ */
+ void emitPaths(BSONObjBuilder *pBuilder, vector<string> *pvPath) const;
+
+ /* utility class used by emitPaths() */
+ class PathPusher :
+ boost::noncopyable {
+ public:
+ PathPusher(vector<string> *pvPath, const string &s);
+ ~PathPusher();
+
+ private:
+ vector<string> *pvPath;
+ };
+ };
+
+
+ class ExpressionOr :
+ public ExpressionNary {
+ public:
+ // virtuals from Expression
+ virtual ~ExpressionOr();
+ virtual intrusive_ptr<Expression> optimize();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void toMatcherBson(
+ BSONObjBuilder *pBuilder, unsigned depth) const;
+
+ // virtuals from ExpressionNary
+ virtual intrusive_ptr<ExpressionNary> (*getFactory() const)();
+
+ /*
+ Create an expression that finds the disjunction of n operands.
+ The disjunction uses short-circuit logic; the expressions are
+ evaluated in the order they were added to the disjunction, and
+ the evaluation stops and returns true on the first operand that
+ evaluates to true.
+
+ @returns disjunction expression
+ */
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionOr();
+ };
+
+
+ class ExpressionSecond :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSecond();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSecond();
+ };
+
+
+ class ExpressionStrcasecmp :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionStrcasecmp();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionStrcasecmp();
+ };
+
+
+ class ExpressionSubstr :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubstr();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubstr();
+ };
+
+
+ class ExpressionSubtract :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionSubtract();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionSubtract();
+ };
+
+
+ class ExpressionToLower :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToLower();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToLower();
+ };
+
+
+ class ExpressionToUpper :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionToUpper();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionToUpper();
+ };
+
+
+ class ExpressionWeek :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionWeek();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionWeek();
+ };
+
+
+ class ExpressionYear :
+ public ExpressionNary {
+ public:
+ // virtuals from ExpressionNary
+ virtual ~ExpressionYear();
+ virtual intrusive_ptr<const Value> evaluate(
+ const intrusive_ptr<Document> &pDocument) const;
+ virtual const char *getOpName() const;
+ virtual void addOperand(const intrusive_ptr<Expression> &pExpression);
+
+ static intrusive_ptr<ExpressionNary> create();
+
+ private:
+ ExpressionYear();
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline bool Expression::ObjectCtx::unwindOk() const {
+ return ((options & UNWIND_OK) != 0);
+ }
+
+ inline bool Expression::ObjectCtx::unwindUsed() const {
+ return (unwindField.size() != 0);
+ }
+
+ inline int Expression::signum(int i) {
+ if (i < 0)
+ return -1;
+ if (i > 0)
+ return 1;
+ return 0;
+ }
+
+ inline intrusive_ptr<const Value> ExpressionConstant::getValue() const {
+ return pValue;
+ }
+
+ inline string ExpressionFieldPath::getFieldPath(bool fieldPrefix) const {
+ return fieldPath.getPath(fieldPrefix);
+ }
+
+ inline void ExpressionFieldPath::writeFieldPath(
+ ostream &outStream, bool fieldPrefix) const {
+ return fieldPath.writePath(outStream, fieldPrefix);
+ }
+
+ inline size_t ExpressionObject::getFieldCount() const {
+ return vFieldName.size();
+ }
+
+ inline ExpressionObject::PathPusher::PathPusher(
+ vector<string> *pTheVPath, const string &s):
+ pvPath(pTheVPath) {
+ pvPath->push_back(s);
+ }
+
+ inline ExpressionObject::PathPusher::~PathPusher() {
+ pvPath->pop_back();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.cpp b/src/mongo/db/pipeline/expression_context.cpp new file mode 100755 index 00000000000..4835dcfa5a9 --- /dev/null +++ b/src/mongo/db/pipeline/expression_context.cpp @@ -0,0 +1,35 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+
+#include "db/pipeline/expression_context.h"
+
+namespace mongo {
+
+ ExpressionContext::~ExpressionContext() {
+ }
+
+ ExpressionContext::ExpressionContext():
+ inShard(false),
+ inRouter(false) {
+ }
+
+ ExpressionContext *ExpressionContext::create() {
+ return new ExpressionContext();
+ }
+
+}
diff --git a/src/mongo/db/pipeline/expression_context.h b/src/mongo/db/pipeline/expression_context.h new file mode 100755 index 00000000000..0277039c80b --- /dev/null +++ b/src/mongo/db/pipeline/expression_context.h @@ -0,0 +1,67 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+
+ class ExpressionContext :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ExpressionContext();
+
+ void setInShard(bool b);
+ void setInRouter(bool b);
+
+ bool getInShard() const;
+ bool getInRouter() const;
+
+ static ExpressionContext *create();
+
+ private:
+ ExpressionContext();
+
+ bool inShard;
+ bool inRouter;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline void ExpressionContext::setInShard(bool b) {
+ inShard = b;
+ }
+
+ inline void ExpressionContext::setInRouter(bool b) {
+ inRouter = b;
+ }
+
+ inline bool ExpressionContext::getInShard() const {
+ return inShard;
+ }
+
+ inline bool ExpressionContext::getInRouter() const {
+ return inRouter;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.cpp b/src/mongo/db/pipeline/field_path.cpp new file mode 100755 index 00000000000..96e1fc92f83 --- /dev/null +++ b/src/mongo/db/pipeline/field_path.cpp @@ -0,0 +1,87 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/field_path.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ FieldPath::~FieldPath() {
+ }
+
+ FieldPath::FieldPath():
+ vFieldName() {
+ }
+
+ FieldPath::FieldPath(const string &fieldPath):
+ vFieldName() {
+ /*
+ The field path could be using dot notation.
+ Break the field path up by peeling off successive pieces.
+ */
+ size_t startpos = 0;
+ while(true) {
+ /* find the next dot */
+ const size_t dotpos = fieldPath.find('.', startpos);
+
+ /* if there are no more dots, use the remainder of the string */
+ if (dotpos == fieldPath.npos) {
+ vFieldName.push_back(fieldPath.substr(startpos, dotpos));
+ break;
+ }
+
+ /* use the string up to the dot */
+ const size_t length = dotpos - startpos;
+ uassert(15998, str::stream() <<
+ "field names cannot be zero length (in path \"" <<
+ fieldPath << "\")",
+ length > 0);
+
+ vFieldName.push_back(fieldPath.substr(startpos, length));
+
+ /* next time, search starting one spot after that */
+ startpos = dotpos + 1;
+ }
+ }
+
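+ /*
+   For example, FieldPath("a.b.c") splits into three elements:
+   getPathLength() == 3, getFieldName(1) == "b", and getPath(true)
+   yields "$a.b.c".
+ */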
+ string FieldPath::getPath(bool fieldPrefix) const {
+ stringstream ss;
+ writePath(ss, fieldPrefix);
+ return ss.str();
+ }
+
+ void FieldPath::writePath(ostream &outStream, bool fieldPrefix) const {
+ if (fieldPrefix)
+ outStream << "$";
+
+ outStream << vFieldName[0];
+
+ const size_t n = vFieldName.size();
+ for(size_t i = 1; i < n; ++i)
+ outStream << "." << vFieldName[i];
+ }
+
+ FieldPath &FieldPath::operator=(const FieldPath &rRHS) {
+ if (this != &rRHS) {
+ vFieldName = rRHS.vFieldName;
+ }
+
+ return *this;
+ }
+
+}
diff --git a/src/mongo/db/pipeline/field_path.h b/src/mongo/db/pipeline/field_path.h new file mode 100755 index 00000000000..810c5d0c7ea --- /dev/null +++ b/src/mongo/db/pipeline/field_path.h @@ -0,0 +1,82 @@ +/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+
+namespace mongo {
+
+ class FieldPath {
+ public:
+ virtual ~FieldPath();
+
+ FieldPath(const string &fieldPath);
+ FieldPath();
+
+ /*
+ Get the number of path elements in the field path.
+
+ @returns the number of path elements
+ */
+ size_t getPathLength() const;
+
+ /*
+ Get a particular path element from the path.
+
+ @param i the index of the path element
+ @returns the path element
+ */
+ string getFieldName(size_t i) const;
+
+ /*
+ Get the full path.
+
+ @param fieldPrefix whether or not to include the field prefix
+ @returns the complete field path
+ */
+ string getPath(bool fieldPrefix) const;
+
+ /*
+ Write the full path.
+
+ @param outStream where to write the path to
+ @param fieldPrefix whether or not to include the field prefix
+ */
+ void writePath(ostream &outStream, bool fieldPrefix) const;
+
+ FieldPath &operator=(const FieldPath &rRHS);
+
+ private:
+ vector<string> vFieldName;
+ };
+}
+
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline size_t FieldPath::getPathLength() const {
+ return vFieldName.size();
+ }
+
+ inline string FieldPath::getFieldName(size_t i) const {
+ return vFieldName[i];
+ }
+
+}
+
diff --git a/src/mongo/db/pipeline/value.cpp b/src/mongo/db/pipeline/value.cpp
new file mode 100755
index 00000000000..b83dec359cf
--- /dev/null
+++ b/src/mongo/db/pipeline/value.cpp
@@ -0,0 +1,1034 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "db/pipeline/value.h"
+
+#include <boost/functional/hash.hpp>
+#include "db/jsobj.h"
+#include "db/pipeline/builder.h"
+#include "db/pipeline/document.h"
+#include "util/mongoutils/str.h"
+
+namespace mongo {
+ using namespace mongoutils;
+
+ const intrusive_ptr<const Value> Value::pFieldUndefined(
+ new ValueStatic(Undefined));
+ const intrusive_ptr<const Value> Value::pFieldNull(new ValueStatic());
+ const intrusive_ptr<const Value> Value::pFieldTrue(new ValueStatic(true));
+ const intrusive_ptr<const Value> Value::pFieldFalse(new ValueStatic(false));
+ const intrusive_ptr<const Value> Value::pFieldMinusOne(new ValueStatic(-1));
+ const intrusive_ptr<const Value> Value::pFieldZero(new ValueStatic(0));
+ const intrusive_ptr<const Value> Value::pFieldOne(new ValueStatic(1));
+
+ Value::~Value() {
+ }
+
+ Value::Value():
+ type(jstNULL),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ }
+
+ Value::Value(BSONType theType):
+ type(theType),
+ oidValue(),
+ dateValue(),
+ stringValue(),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case Undefined:
+ case jstNULL:
+ case Object: // empty
+ case Array: // empty
+ break;
+
+ case NumberDouble:
+ simple.doubleValue = 0;
+ break;
+
+ case Bool:
+ simple.boolValue = false;
+ break;
+
+ case NumberInt:
+ simple.intValue = 0;
+ break;
+
+ case Timestamp:
+ simple.timestampValue = 0;
+ break;
+
+ case NumberLong:
+ simple.longValue = 0;
+ break;
+
+ default:
+ // nothing else is allowed
+ uassert(16001, str::stream() <<
+ "can't create empty Value of type " << type, false);
+ break;
+ }
+ }
+
+ Value::Value(bool boolValue):
+ type(Bool),
+ pDocumentValue(),
+ vpValue() {
+ simple.boolValue = boolValue;
+ }
+
+ intrusive_ptr<const Value> Value::createFromBsonElement(
+ BSONElement *pBsonElement) {
+ intrusive_ptr<const Value> pValue(new Value(pBsonElement));
+ return pValue;
+ }
+
+ Value::Value(BSONElement *pBsonElement):
+ type(pBsonElement->type()),
+ pDocumentValue(),
+ vpValue() {
+ switch(type) {
+ case NumberDouble:
+ simple.doubleValue = pBsonElement->Double();
+ break;
+
+ case String:
+ stringValue = pBsonElement->String();
+ break;
+
+ case Object: {
+ BSONObj document(pBsonElement->embeddedObject());
+ pDocumentValue = Document::createFromBsonObj(&document);
+ break;
+ }
+
+ case Array: {
+ vector<BSONElement> vElement(pBsonElement->Array());
+ const size_t n = vElement.size();
+
+ vpValue.reserve(n); // save on realloc()ing
+
+ for(size_t i = 0; i < n; ++i) {
+ vpValue.push_back(
+ Value::createFromBsonElement(&vElement[i]));
+ }
+ break;
+ }
+
+ case jstOID:
+ oidValue = pBsonElement->OID();
+ break;
+
+ case Bool:
+ simple.boolValue = pBsonElement->Bool();
+ break;
+
+ case Date:
+ dateValue = pBsonElement->Date();
+ break;
+
+ case RegEx:
+ stringValue = pBsonElement->regex();
+ // TODO pBsonElement->regexFlags();
+ break;
+
+ case NumberInt:
+ simple.intValue = pBsonElement->numberInt();
+ break;
+
+ case Timestamp:
+ dateValue = pBsonElement->timestampTime();
+ break;
+
+ case NumberLong:
+ simple.longValue = pBsonElement->numberLong();
+ break;
+
+ case jstNULL:
+ break;
+
+ case BinData:
+ case Symbol:
+ case CodeWScope:
+ uassert(16002, str::stream() <<
+ "can't create Value of type " << type, false);
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ Value::Value(int intValue):
+ type(NumberInt),
+ pDocumentValue(),
+ vpValue() {
+ simple.intValue = intValue;
+ }
+
+ intrusive_ptr<const Value> Value::createInt(int value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(long long longValue):
+ type(NumberLong),
+ pDocumentValue(),
+ vpValue() {
+ simple.longValue = longValue;
+ }
+
+ intrusive_ptr<const Value> Value::createLong(long long value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(double value):
+ type(NumberDouble),
+ pDocumentValue(),
+ vpValue() {
+ simple.doubleValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDouble(double value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const Date_t &value):
+ type(Date),
+ pDocumentValue(),
+ vpValue() {
+ dateValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createDate(const Date_t &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const string &value):
+ type(String),
+ pDocumentValue(),
+ vpValue() {
+ stringValue = value;
+ }
+
+ intrusive_ptr<const Value> Value::createString(const string &value) {
+ intrusive_ptr<const Value> pValue(new Value(value));
+ return pValue;
+ }
+
+ Value::Value(const intrusive_ptr<Document> &pDocument):
+ type(Object),
+ pDocumentValue(pDocument),
+ vpValue() {
+ }
+
+ intrusive_ptr<const Value> Value::createDocument(
+ const intrusive_ptr<Document> &pDocument) {
+ intrusive_ptr<const Value> pValue(new Value(pDocument));
+ return pValue;
+ }
+
+ Value::Value(const vector<intrusive_ptr<const Value> > &thevpValue):
+ type(Array),
+ pDocumentValue(),
+ vpValue(thevpValue) {
+ }
+
+ intrusive_ptr<const Value> Value::createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue) {
+ intrusive_ptr<const Value> pValue(new Value(vpValue));
+ return pValue;
+ }
+
+ double Value::getDouble() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+ if (type == NumberLong)
+ return static_cast< double >( simple.longValue );
+
+ assert(type == NumberDouble);
+ return simple.doubleValue;
+ }
+
+ string Value::getString() const {
+ assert(getType() == String);
+ return stringValue;
+ }
+
+ intrusive_ptr<Document> Value::getDocument() const {
+ assert(getType() == Object);
+ return pDocumentValue;
+ }
+
+ ValueIterator::~ValueIterator() {
+ }
+
+ Value::vi::~vi() {
+ }
+
+ bool Value::vi::more() const {
+ return (nextIndex < size);
+ }
+
+ intrusive_ptr<const Value> Value::vi::next() {
+ assert(more());
+ return (*pvpValue)[nextIndex++];
+ }
+
+ Value::vi::vi(const intrusive_ptr<const Value> &pValue,
+ const vector<intrusive_ptr<const Value> > *thepvpValue):
+ size(thepvpValue->size()),
+ nextIndex(0),
+ pvpValue(thepvpValue) {
+ }
+
+ intrusive_ptr<ValueIterator> Value::getArray() const {
+ assert(getType() == Array);
+ intrusive_ptr<ValueIterator> pVI(
+ new vi(intrusive_ptr<const Value>(this), &vpValue));
+ return pVI;
+ }
+
+ OID Value::getOid() const {
+ assert(getType() == jstOID);
+ return oidValue;
+ }
+
+ bool Value::getBool() const {
+ assert(getType() == Bool);
+ return simple.boolValue;
+ }
+
+ Date_t Value::getDate() const {
+ assert(getType() == Date);
+ return dateValue;
+ }
+
+ string Value::getRegex() const {
+ assert(getType() == RegEx);
+ return stringValue;
+ }
+
+ string Value::getSymbol() const {
+ assert(getType() == Symbol);
+ return stringValue;
+ }
+
+ int Value::getInt() const {
+ assert(getType() == NumberInt);
+ return simple.intValue;
+ }
+
+ unsigned long long Value::getTimestamp() const {
+ assert(getType() == Timestamp);
+ return dateValue;
+ }
+
+ long long Value::getLong() const {
+ BSONType type = getType();
+ if (type == NumberInt)
+ return simple.intValue;
+
+ assert(type == NumberLong);
+ return simple.longValue;
+ }
+
+ void Value::addToBson(Builder *pBuilder) const {
+ switch(getType()) {
+ case NumberDouble:
+ pBuilder->append(getDouble());
+ break;
+
+ case String:
+ pBuilder->append(getString());
+ break;
+
+ case Object: {
+ intrusive_ptr<Document> pDocument(getDocument());
+ BSONObjBuilder subBuilder;
+ pDocument->toBson(&subBuilder);
+ subBuilder.done();
+ pBuilder->append(&subBuilder);
+ break;
+ }
+
+ case Array: {
+ const size_t n = vpValue.size();
+ BSONArrayBuilder arrayBuilder(n);
+ for(size_t i = 0; i < n; ++i) {
+ vpValue[i]->addToBsonArray(&arrayBuilder);
+ }
+
+ pBuilder->append(&arrayBuilder);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ pBuilder->append(getOid());
+ break;
+
+ case Bool:
+ pBuilder->append(getBool());
+ break;
+
+ case Date:
+ pBuilder->append(getDate());
+ break;
+
+ case RegEx:
+ pBuilder->append(getRegex());
+ break;
+
+ case Symbol:
+ pBuilder->append(getSymbol());
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ pBuilder->append(getInt());
+ break;
+
+ case Timestamp:
+ pBuilder->append((long long)getTimestamp());
+ break;
+
+ case NumberLong:
+ pBuilder->append(getLong());
+ break;
+
+ case jstNULL:
+ pBuilder->append();
+ break;
+
+ /* these shouldn't appear in this context */
+ case MinKey:
+ case EOO:
+ case Undefined:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+ }
+
+ void Value::addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const {
+ BuilderObj objBuilder(pBuilder, fieldName);
+ addToBson(&objBuilder);
+ }
+
+ void Value::addToBsonArray(BSONArrayBuilder *pBuilder) const {
+ BuilderArray arrBuilder(pBuilder);
+ addToBson(&arrBuilder);
+ }
+
+ bool Value::coerceToBool() const {
+ BSONType type = getType();
+ switch(type) {
+ case NumberDouble:
+ if (simple.doubleValue != 0)
+ return true;
+ break;
+
+ case String:
+ case Object:
+ case Array:
+ case BinData:
+ case jstOID:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case Timestamp:
+ return true;
+
+ case Bool:
+ if (simple.boolValue)
+ return true;
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (simple.intValue != 0)
+ return true;
+ break;
+
+ case NumberLong:
+ if (simple.longValue != 0)
+ return true;
+ break;
+
+ case jstNULL:
+ case Undefined:
+ /* nothing to do */
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ }
+
+ return false;
+ }
+
+ intrusive_ptr<const Value> Value::coerceToBoolean() const {
+ bool result = coerceToBool();
+
+ /* always normalize to the singletons */
+ if (result)
+ return Value::getTrue();
+ return Value::getFalse();
+ }
+
+ int Value::coerceToInt() const {
+ switch(type) {
+ case NumberDouble:
+ return (int)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return (int)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16003, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to int",
+ false);
+ } // switch(type)
+
+ return (int)0;
+ }
+
+ long long Value::coerceToLong() const {
+ switch(type) {
+ case NumberDouble:
+ return (long long)simple.doubleValue;
+
+ case NumberInt:
+ return simple.intValue;
+
+ case NumberLong:
+ return simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16004, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to long",
+ false);
+ } // switch(type)
+
+ return (long long)0;
+ }
+
+ double Value::coerceToDouble() const {
+ switch(type) {
+ case NumberDouble:
+ return simple.doubleValue;
+
+ case NumberInt:
+ return (double)simple.intValue;
+
+ case NumberLong:
+ return (double)simple.longValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ case String:
+ default:
+ uassert(16005, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to double",
+ false);
+ } // switch(type)
+
+ return (double)0;
+ }
+
+ Date_t Value::coerceToDate() const {
+ switch(type) {
+
+ case Date:
+ return dateValue;
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16006, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to date",
+ false);
+ } // switch(type)
+
+ assert(false); // CW TODO no conversion available
+ return jstNULL;
+ }
+
+ string Value::coerceToString() const {
+ stringstream ss;
+ switch(type) {
+ case NumberDouble:
+ ss << simple.doubleValue;
+ return ss.str();
+
+ case NumberInt:
+ ss << simple.intValue;
+ return ss.str();
+
+ case NumberLong:
+ ss << simple.longValue;
+ return ss.str();
+
+ case String:
+ return stringValue;
+
+ case Date:
+ return dateValue.toString();
+
+ case jstNULL:
+ case Undefined:
+ break;
+
+ default:
+ uassert(16007, str::stream() <<
+ "can't convert from BSON type " << type <<
+ " to string",
+ false);
+ } // switch(type)
+
+ return "";
+ }
+
+ int Value::compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR) {
+ BSONType lType = rL->getType();
+ BSONType rType = rR->getType();
+
+ /*
+ Special handling for Undefined and NULL values; these are types,
+ so it's easier to handle them here before we go below to handle
+ values of the same types. This allows us to compare Undefined and
+ NULL values with everything else.
+ As coded now:
+ (*) Undefined is less than everything except itself (which is equal)
+ (*) NULL is less than everything except Undefined and itself
+ */
+ if (lType == Undefined) {
+ if (rType == Undefined)
+ return 0;
+
+ /* if rType is anything else, the left value is less */
+ return -1;
+ }
+
+ if (lType == jstNULL) {
+ if (rType == Undefined)
+ return 1;
+ if (rType == jstNULL)
+ return 0;
+
+ return -1;
+ }
+
+ if ((rType == Undefined) || (rType == jstNULL)) {
+ /*
+ We know the left value isn't Undefined, because of the above.
+ Count a NULL value as greater than an undefined one.
+ */
+ return 1;
+ }
+
+ // CW TODO for now, only compare like values
+ uassert(16016, str::stream() <<
+ "can't compare values of BSON types " << lType <<
+ " and " << rType,
+ lType == rType);
+
+ switch(lType) {
+ case NumberDouble:
+ if (rL->simple.doubleValue < rR->simple.doubleValue)
+ return -1;
+ if (rL->simple.doubleValue > rR->simple.doubleValue)
+ return 1;
+ return 0;
+
+ case String:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Object:
+ return Document::compare(rL->getDocument(), rR->getDocument());
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pli(rL->getArray());
+ intrusive_ptr<ValueIterator> pri(rR->getArray());
+
+ while(true) {
+ /* have we run out of left array? */
+ if (!pli->more()) {
+ if (!pri->more())
+ return 0; // the arrays are the same length
+
+ return -1; // the left array is shorter
+ }
+
+ /* have we run out of right array? */
+ if (!pri->more())
+ return 1; // the right array is shorter
+
+ /* compare the two corresponding elements */
+ intrusive_ptr<const Value> plv(pli->next());
+ intrusive_ptr<const Value> prv(pri->next());
+ const int cmp = Value::compare(plv, prv);
+ if (cmp)
+ return cmp; // values are unequal
+ }
+
+ /* NOTREACHED */
+ assert(false);
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ if (rL->oidValue < rR->oidValue)
+ return -1;
+ if (rL->oidValue == rR->oidValue)
+ return 0;
+ return 1;
+
+ case Bool:
+ if (rL->simple.boolValue == rR->simple.boolValue)
+ return 0;
+ if (rL->simple.boolValue)
+ return 1;
+ return -1;
+
+ case Date:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case RegEx:
+ return rL->stringValue.compare(rR->stringValue);
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ if (rL->simple.intValue < rR->simple.intValue)
+ return -1;
+ if (rL->simple.intValue > rR->simple.intValue)
+ return 1;
+ return 0;
+
+ case Timestamp:
+ if (rL->dateValue < rR->dateValue)
+ return -1;
+ if (rL->dateValue > rR->dateValue)
+ return 1;
+ return 0;
+
+ case NumberLong:
+ if (rL->simple.longValue < rR->simple.longValue)
+ return -1;
+ if (rL->simple.longValue > rR->simple.longValue)
+ return 1;
+ return 0;
+
+ case Undefined:
+ case jstNULL:
+ return 0; // treat two Undefined or NULL values as equal
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(lType)
+
+ /* NOTREACHED */
+ return 0;
+ }
+
+ void Value::hash_combine(size_t &seed) const {
+ BSONType type = getType();
+ boost::hash_combine(seed, (int)type);
+
+ switch(type) {
+ case NumberDouble:
+ boost::hash_combine(seed, simple.doubleValue);
+ break;
+
+ case String:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Object:
+ getDocument()->hash_combine(seed);
+ break;
+
+ case Array: {
+ intrusive_ptr<ValueIterator> pIter(getArray());
+ while(pIter->more()) {
+ intrusive_ptr<const Value> pValue(pIter->next());
+ pValue->hash_combine(seed);
+ };
+ break;
+ }
+
+ case BinData:
+ // pBuilder->appendBinData(fieldName, ...);
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case jstOID:
+ oidValue.hash_combine(seed);
+ break;
+
+ case Bool:
+ boost::hash_combine(seed, simple.boolValue);
+ break;
+
+ case Date:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case RegEx:
+ boost::hash_combine(seed, stringValue);
+ break;
+
+ case Symbol:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case CodeWScope:
+ assert(false); // CW TODO unimplemented
+ break;
+
+ case NumberInt:
+ boost::hash_combine(seed, simple.intValue);
+ break;
+
+ case Timestamp:
+ boost::hash_combine(seed, (unsigned long long)dateValue);
+ break;
+
+ case NumberLong:
+ boost::hash_combine(seed, simple.longValue);
+ break;
+
+ case Undefined:
+ case jstNULL:
+ break;
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ break;
+ } // switch(type)
+ }
+
+ BSONType Value::getWidestNumeric(BSONType lType, BSONType rType) {
+ if (lType == NumberDouble) {
+ switch(rType) {
+ case NumberDouble:
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberDouble;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberLong) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberLong;
+
+ default:
+ break;
+ }
+ }
+ else if (lType == NumberInt) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ case jstNULL:
+ case Undefined:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+ else if ((lType == jstNULL) || (lType == Undefined)) {
+ switch(rType) {
+ case NumberDouble:
+ return NumberDouble;
+
+ case NumberLong:
+ return NumberLong;
+
+ case NumberInt:
+ return NumberInt;
+
+ default:
+ break;
+ }
+ }
+
+ /* NOTREACHED */
+ return Undefined;
+ }
+
+ size_t Value::getApproximateSize() const {
+ switch(type) {
+ case String:
+ return sizeof(Value) + stringValue.length();
+
+ case Object:
+ return sizeof(Value) + pDocumentValue->getApproximateSize();
+
+ case Array: {
+ size_t size = sizeof(Value);
+ const size_t n = vpValue.size();
+ for(size_t i = 0; i < n; ++i) {
+ size += vpValue[i]->getApproximateSize();
+ }
+ return size;
+ }
+
+ case NumberDouble:
+ case BinData:
+ case jstOID:
+ case Bool:
+ case Date:
+ case RegEx:
+ case Symbol:
+ case CodeWScope:
+ case NumberInt:
+ case Timestamp:
+ case NumberLong:
+ case jstNULL:
+ case Undefined:
+ return sizeof(Value);
+
+ /* these shouldn't happen in this context */
+ case MinKey:
+ case EOO:
+ case DBRef:
+ case Code:
+ case MaxKey:
+ assert(false); // CW TODO better message
+ return sizeof(Value);
+ }
+
+ /*
+ We shouldn't get here. In order to make the implementor think about
+ these cases, they are all listed explicitly, above. The compiler
+ should complain if they aren't all listed, because there's no
+ default. However, not all the compilers seem to do that. Therefore,
+ this final catch-all is here.
+ */
+ assert(false);
+ return sizeof(Value);
+ }
+
+
+ void ValueStatic::addRef() const {
+ }
+
+ void ValueStatic::release() const {
+ }
+
+}
diff --git a/src/mongo/db/pipeline/value.h b/src/mongo/db/pipeline/value.h
new file mode 100755
index 00000000000..8bd1bcbbbfd
--- /dev/null
+++ b/src/mongo/db/pipeline/value.h
@@ -0,0 +1,468 @@
+/**
+ * Copyright (c) 2011 10gen Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "bson/bsontypes.h"
+#include "util/intrusive_counter.h"
+
+namespace mongo {
+ class BSONElement;
+ class Builder;
+ class Document;
+ class Value;
+
+ class ValueIterator :
+ public IntrusiveCounterUnsigned {
+ public:
+ virtual ~ValueIterator();
+
+ /*
+ Ask if there are more values to return.
+
+ @returns true if there are more values, false otherwise
+ */
+ virtual bool more() const = 0;
+
+ /*
+ Move the iterator to point to the next value and return it.
+
+ @returns the next Value
+ */
+ virtual intrusive_ptr<const Value> next() = 0;
+ };
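
A typical caller drains the iterator like this (a minimal sketch; pArrayValue is a hypothetical intrusive_ptr<const Value> already known to hold an Array):

    // assumes pArrayValue holds an Array-typed Value
    intrusive_ptr<ValueIterator> pIter(pArrayValue->getArray());
    while (pIter->more()) {
        intrusive_ptr<const Value> pElement(pIter->next());
        // ... use pElement ...
    }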
+
+
+ /*
+ Values are immutable, so these are passed around as
+ intrusive_ptr<const Value>.
+ */
+ class Value :
+ public IntrusiveCounterUnsigned {
+ public:
+ ~Value();
+
+ /*
+ Construct a Value from a BSONElement.
+
+ This ignores the name of the element, and only uses the value,
+ whatever type it is.
+
+ @returns a new Value initialized from the bsonElement
+ */
+ static intrusive_ptr<const Value> createFromBsonElement(
+ BSONElement *pBsonElement);
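
For illustration, a minimal sketch of wrapping a BSONElement; the field name "a" is ignored and only the value is kept:

    BSONObj obj = BSON("a" << 5);
    BSONElement elem = obj.firstElement();
    intrusive_ptr<const Value> pValue(Value::createFromBsonElement(&elem));
    // pValue->getType() == NumberInt and pValue->getInt() == 5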
+
+ /*
+ Construct an integer-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createInt(int value);
+
+ /*
+ Construct a long long-valued Value.
+
+ For commonly used values, consider using one of the singleton
+ instances defined below.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createLong(long long value);
+
+ /*
+ Construct a double-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDouble(double value);
+
+ /*
+ Construct a string-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createString(const string &value);
+
+ /*
+ Construct a date-valued Value.
+
+ @param value the value
+ @returns a Value with the given value
+ */
+ static intrusive_ptr<const Value> createDate(const Date_t &value);
+
+ /*
+ Construct a document-valued Value.
+
+ @param pDocument the document
+ @returns a Value with the given document
+ */
+ static intrusive_ptr<const Value> createDocument(
+ const intrusive_ptr<Document> &pDocument);
+
+ /*
+ Construct an array-valued Value.
+
+ @param vpValue the vector of element Values
+ @returns an array-valued Value with the given elements
+ */
+ static intrusive_ptr<const Value> createArray(
+ const vector<intrusive_ptr<const Value> > &vpValue);
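
A sketch of composing the factories above to build the array [1, 2.5, "three"]:

    vector<intrusive_ptr<const Value> > vpElements;
    vpElements.push_back(Value::createInt(1));
    vpElements.push_back(Value::createDouble(2.5));
    vpElements.push_back(Value::createString("three"));
    intrusive_ptr<const Value> pArray(Value::createArray(vpElements));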
+
+ /*
+ Get the BSON type of the value.
+
+ If the type is jstNULL, no value getter will work.
+
+ @return the BSON type of the value.
+ */
+ BSONType getType() const;
+
+ /*
+ Getters.
+
+ @returns the Value's value; asserts if the requested value type is
+ incorrect.
+ */
+ double getDouble() const;
+ string getString() const;
+ intrusive_ptr<Document> getDocument() const;
+ intrusive_ptr<ValueIterator> getArray() const;
+ OID getOid() const;
+ bool getBool() const;
+ Date_t getDate() const;
+ string getRegex() const;
+ string getSymbol() const;
+ int getInt() const;
+ unsigned long long getTimestamp() const;
+ long long getLong() const;
+
+ /*
+ Get the length of an array value.
+
+ @returns the length of the array, if this is array-valued; otherwise
+ throws an error
+ */
+ size_t getArrayLength() const;
+
+ /*
+ Add this value to the BSON object under construction.
+ */
+ void addToBsonObj(BSONObjBuilder *pBuilder, string fieldName) const;
+
+ /*
+ Add this value to the BSON array under construction.
+
+ Array elements have no field names, so none is needed here.
+ */
+ void addToBsonArray(BSONArrayBuilder *pBuilder) const;
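
A sketch of serializing a Value back into BSON with addToBsonObj() (pValue is a hypothetical non-null Value):

    BSONObjBuilder builder;
    pValue->addToBsonObj(&builder, "result");
    BSONObj out = builder.obj(); // { result: <the value> }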
+
+ /*
+ Get references to singleton instances of commonly used field values.
+ */
+ static intrusive_ptr<const Value> getUndefined();
+ static intrusive_ptr<const Value> getNull();
+ static intrusive_ptr<const Value> getTrue();
+ static intrusive_ptr<const Value> getFalse();
+ static intrusive_ptr<const Value> getMinusOne();
+ static intrusive_ptr<const Value> getZero();
+ static intrusive_ptr<const Value> getOne();
+
+ /*
+ Coerce (cast) a value to a native bool, using JSON rules.
+
+ @returns the bool value
+ */
+ bool coerceToBool() const;
+
+ /*
+ Coerce (cast) a value to a Boolean Value, using JSON rules.
+
+ @returns the Value, normalized to the shared true/false singletons
+ */
+ intrusive_ptr<const Value> coerceToBoolean() const;
+
+ /*
+ Coerce (cast) a value to an int, using JSON rules.
+
+ @returns the int value
+ */
+ int coerceToInt() const;
+
+ /*
+ Coerce (cast) a value to a long long, using JSON rules.
+
+ @returns the long value
+ */
+ long long coerceToLong() const;
+
+ /*
+ Coerce (cast) a value to a double, using JSON rules.
+
+ @returns the double value
+ */
+ double coerceToDouble() const;
+
+ /*
+ Coerce (cast) a value to a date, using JSON rules.
+
+ @returns the date value
+ */
+ Date_t coerceToDate() const;
+
+ /*
+ Coerce (cast) a value to a string, using JSON rules.
+
+ @returns the string value
+ */
+ string coerceToString() const;
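
A sketch of the coercion rules above applied to one numeric Value:

    intrusive_ptr<const Value> pNum(Value::createDouble(3.7));
    bool b = pNum->coerceToBool();     // true; nonzero numbers are truthy
    int i = pNum->coerceToInt();       // 3; the fractional part is truncated
    string s = pNum->coerceToString(); // "3.7"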
+
+ /*
+ Compare two Values.
+
+ @param rL left value
+ @param rR right value
+ @returns an integer less than zero, zero, or an integer greater than
+ zero, depending on whether rL < rR, rL == rR, or rL > rR
+ */
+ static int compare(const intrusive_ptr<const Value> &rL,
+ const intrusive_ptr<const Value> &rR);
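
compare() yields a strict weak ordering, so a thin wrapper adapts it to std::sort(); note that, per the implementation in value.cpp, it uasserts on mismatched types other than null/undefined. A sketch:

    struct ValueLess {
        bool operator()(const intrusive_ptr<const Value> &rL,
                        const intrusive_ptr<const Value> &rR) const {
            return Value::compare(rL, rR) < 0;
        }
    };
    // std::sort(vpValues.begin(), vpValues.end(), ValueLess());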
+
+
+ /*
+ Figure out what the widest of two numeric types is.
+
+ Widest can be thought of as "most capable," or "able to hold the
+ largest or most precise value." The progression is Int, Long, Double.
+
+ @param lType the left operand's type
+ @param rType the right operand's type
+ @returns a BSONType of NumberInt, NumberLong, or NumberDouble
+ */
+ static BSONType getWidestNumeric(BSONType lType, BSONType rType);
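
A sketch of using the widest type to pick the right factory when combining two numeric Values (pL and pR are hypothetical operands):

    BSONType widest = Value::getWidestNumeric(pL->getType(), pR->getType());
    if (widest == NumberDouble) {
        // build the result at double precision
        intrusive_ptr<const Value> pSum(
            Value::createDouble(pL->coerceToDouble() + pR->coerceToDouble()));
    }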
+
+ /*
+ Get the approximate storage size of the value, in bytes.
+
+ @returns approximate storage size of the value.
+ */
+ size_t getApproximateSize() const;
+
+ /*
+ Calculate a hash value.
+
+ Meant to be used to create composite hashes suitable for
+ boost classes such as unordered_map<>.
+
+ @param seed the seed to combine with this Value's hash
+ */
+ void hash_combine(size_t &seed) const;
+
+ /*
+ struct Hash is defined to enable the use of Values as
+ keys in boost::unordered_map<>.
+
+ Values are always referenced as immutables in the form
+ intrusive_ptr<const Value>, so these operate on that construction.
+ */
+ struct Hash :
+ unary_function<intrusive_ptr<const Value>, size_t> {
+ size_t operator()(const intrusive_ptr<const Value> &rV) const;
+ };
+
+ protected:
+ Value(); // creates null value
+ Value(BSONType type); // creates an empty (uninitialized) value of the given type
+ // mostly useful for Undefined
+ Value(bool boolValue);
+ Value(int intValue);
+
+ private:
+ Value(BSONElement *pBsonElement);
+
+ Value(long long longValue);
+ Value(double doubleValue);
+ Value(const Date_t &dateValue);
+ Value(const string &stringValue);
+ Value(const intrusive_ptr<Document> &pDocument);
+ Value(const vector<intrusive_ptr<const Value> > &vpValue);
+
+ void addToBson(Builder *pBuilder) const;
+
+ BSONType type;
+
+ /* store value in one of these */
+ union {
+ double doubleValue;
+ bool boolValue;
+ int intValue;
+ unsigned long long timestampValue;
+ long long longValue;
+
+ } simple; // values that don't need a ctor/dtor
+ OID oidValue;
+ Date_t dateValue;
+ string stringValue; // String, Regex, Symbol
+ intrusive_ptr<Document> pDocumentValue;
+ vector<intrusive_ptr<const Value> > vpValue; // for arrays
+
+
+ /*
+ These are often used as the result of boolean or comparison
+ expressions.
+
+ These are obtained via public static getters defined above.
+ */
+ static const intrusive_ptr<const Value> pFieldUndefined;
+ static const intrusive_ptr<const Value> pFieldNull;
+ static const intrusive_ptr<const Value> pFieldTrue;
+ static const intrusive_ptr<const Value> pFieldFalse;
+ static const intrusive_ptr<const Value> pFieldMinusOne;
+ static const intrusive_ptr<const Value> pFieldZero;
+ static const intrusive_ptr<const Value> pFieldOne;
+
+ /* this implementation is used for getArray() */
+ class vi :
+ public ValueIterator {
+ public:
+ // virtuals from ValueIterator
+ virtual ~vi();
+ virtual bool more() const;
+ virtual intrusive_ptr<const Value> next();
+
+ private:
+ friend class Value;
+ vi(const intrusive_ptr<const Value> &pSource,
+ const vector<intrusive_ptr<const Value> > *pvpValue);
+
+ size_t size;
+ size_t nextIndex;
+ const vector<intrusive_ptr<const Value> > *pvpValue;
+ }; /* class vi */
+
+ };
+
+ /*
+ Equality operator for values.
+
+ Useful for unordered_map<>, etc.
+ */
+ inline bool operator==(const intrusive_ptr<const Value> &v1,
+ const intrusive_ptr<const Value> &v2) {
+ return (Value::compare(v1, v2) == 0);
+ }
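
Together, this operator and Value::Hash supply what boost::unordered_map needs to key on Values by content rather than by pointer. A sketch (requires <boost/unordered_map.hpp>; the counting use is hypothetical):

    typedef boost::unordered_map<intrusive_ptr<const Value>,
                                 long long, Value::Hash> ValueCountMap;
    ValueCountMap counts;
    counts[Value::getOne()] += 1; // equal-valued keys collide as intended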
+
+ /*
+ For performance reasons, there are various sharable static values
+ defined in class Value, obtainable by methods such as getUndefined(),
+ getTrue(), getOne(), etc. We don't want these to go away as they are
+ used by a multitude of threads evaluating pipelines. In order to avoid
+ having to use atomic integers in the intrusive reference counter, this
+ class overrides the reference counting methods to do nothing, making it
+ safe to use for static Values.
+
+ At this point, only the constructors necessary for the static Values in
+ common use have been defined. The remainder can be defined if necessary.
+ */
+ class ValueStatic :
+ public Value {
+ public:
+ // virtuals from IntrusiveCounterUnsigned
+ virtual void addRef() const;
+ virtual void release() const;
+
+ // constructors
+ ValueStatic();
+ ValueStatic(BSONType type);
+ ValueStatic(bool boolValue);
+ ValueStatic(int intValue);
+ };
+}
+
+/* ======================= INLINED IMPLEMENTATIONS ========================== */
+
+namespace mongo {
+
+ inline BSONType Value::getType() const {
+ return type;
+ }
+
+ inline size_t Value::getArrayLength() const {
+ assert(getType() == Array);
+ return vpValue.size();
+ }
+
+ inline intrusive_ptr<const Value> Value::getUndefined() {
+ return pFieldUndefined;
+ }
+
+ inline intrusive_ptr<const Value> Value::getNull() {
+ return pFieldNull;
+ }
+
+ inline intrusive_ptr<const Value> Value::getTrue() {
+ return pFieldTrue;
+ }
+
+ inline intrusive_ptr<const Value> Value::getFalse() {
+ return pFieldFalse;
+ }
+
+ inline intrusive_ptr<const Value> Value::getMinusOne() {
+ return pFieldMinusOne;
+ }
+
+ inline intrusive_ptr<const Value> Value::getZero() {
+ return pFieldZero;
+ }
+
+ inline intrusive_ptr<const Value> Value::getOne() {
+ return pFieldOne;
+ }
+
+ inline size_t Value::Hash::operator()(
+ const intrusive_ptr<const Value> &rV) const {
+ size_t seed = 0xf0afbeef;
+ rV->hash_combine(seed);
+ return seed;
+ }
+
+ inline ValueStatic::ValueStatic():
+ Value() {
+ }
+
+ inline ValueStatic::ValueStatic(BSONType type):
+ Value(type) {
+ }
+
+ inline ValueStatic::ValueStatic(bool boolValue):
+ Value(boolValue) {
+ }
+
+ inline ValueStatic::ValueStatic(int intValue):
+ Value(intValue) {
+ }
+
+};
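
Because ValueStatic's addRef()/release() are no-ops, handing out the shared singletons never touches a reference count; repeated getters return the same object. A sketch:

    intrusive_ptr<const Value> a(Value::getTrue());
    intrusive_ptr<const Value> b(Value::getTrue());
    // a.get() == b.get(); no reference-count traffic occurred on either line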
diff --git a/src/mongo/db/projection.cpp b/src/mongo/db/projection.cpp
new file mode 100644
index 00000000000..d07e56527af
--- /dev/null
+++ b/src/mongo/db/projection.cpp
@@ -0,0 +1,301 @@
+// projection.cpp
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "pch.h"
+#include "projection.h"
+#include "../util/mongoutils/str.h"
+
+namespace mongo {
+
+ void Projection::init( const BSONObj& o ) {
+ massert( 10371 , "can only add to Projection once", _source.isEmpty());
+ _source = o;
+
+ BSONObjIterator i( o );
+ int true_false = -1;
+ while ( i.more() ) {
+ BSONElement e = i.next();
+
+ if ( ! e.isNumber() )
+ _hasNonSimple = true;
+
+ if (e.type() == Object) {
+ BSONObj obj = e.embeddedObject();
+ BSONElement e2 = obj.firstElement();
+ if ( strcmp(e2.fieldName(), "$slice") == 0 ) {
+ if (e2.isNumber()) {
+ int i = e2.numberInt();
+ if (i < 0)
+ add(e.fieldName(), i, -i); // limit is now positive
+ else
+ add(e.fieldName(), 0, i);
+
+ }
+ else if (e2.type() == Array) {
+ BSONObj arr = e2.embeddedObject();
+ uassert(13099, "$slice array wrong size", arr.nFields() == 2 );
+
+ BSONObjIterator it(arr);
+ int skip = it.next().numberInt();
+ int limit = it.next().numberInt();
+ uassert(13100, "$slice limit must be positive", limit > 0 );
+ add(e.fieldName(), skip, limit);
+
+ }
+ else {
+ uassert(13098, "$slice only supports numbers and [skip, limit] arrays", false);
+ }
+ }
+ else {
+ uassert(13097, string("Unsupported projection option: ") + obj.firstElementFieldName(), false);
+ }
+
+ }
+ else if (!strcmp(e.fieldName(), "_id") && !e.trueValue()) {
+ _includeID = false;
+
+ }
+ else {
+
+ add (e.fieldName(), e.trueValue());
+
+ // validate input
+ if (true_false == -1) {
+ true_false = e.trueValue();
+ _include = !e.trueValue();
+ }
+ else {
+ uassert( 10053 , "You cannot currently mix including and excluding fields. Contact us if this is an issue." ,
+ (bool)true_false == e.trueValue() );
+ }
+ }
+ }
+ }
+
+ void Projection::add(const string& field, bool include) {
+ if (field.empty()) { // this is the field the user referred to
+ _include = include;
+ }
+ else {
+ _include = !include;
+
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, include);
+ }
+ }
+
+ void Projection::add(const string& field, int skip, int limit) {
+ _special = true; // can't include or exclude whole object
+
+ if (field.empty()) { // this is the field the user referred to
+ _skip = skip;
+ _limit = limit;
+ }
+ else {
+ const size_t dot = field.find('.');
+ const string subfield = field.substr(0,dot);
+ const string rest = (dot == string::npos ? "" : field.substr(dot+1,string::npos));
+
+ boost::shared_ptr<Projection>& fm = _fields[subfield];
+ if (!fm)
+ fm.reset(new Projection());
+
+ fm->add(rest, skip, limit);
+ }
+ }
+
+ void Projection::transform( const BSONObj& in , BSONObjBuilder& b ) const {
+ BSONObjIterator i(in);
+ while ( i.more() ) {
+ BSONElement e = i.next();
+ if ( mongoutils::str::equals( "_id" , e.fieldName() ) ) {
+ if ( _includeID )
+ b.append( e );
+ }
+ else {
+ append( b , e );
+ }
+ }
+ }
+
+ BSONObj Projection::transform( const BSONObj& in ) const {
+ BSONObjBuilder b;
+ transform( in , b );
+ return b.obj();
+ }
+
+
+ //b will be the value part of an array-typed BSONElement
+ void Projection::appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested) const {
+ int skip = nested ? 0 : _skip;
+ int limit = nested ? -1 : _limit;
+
+ if (skip < 0) {
+ skip = max(0, skip + a.nFields());
+ }
+
+ int i=0;
+ BSONObjIterator it(a);
+ while (it.more()) {
+ BSONElement e = it.next();
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (limit != -1 && (limit-- == 0)) {
+ break;
+ }
+
+ switch(e.type()) {
+ case Array: {
+ BSONObjBuilder subb;
+ appendArray(subb , e.embeddedObject(), true);
+ b.appendArray(b.numStr(i++), subb.obj());
+ break;
+ }
+ case Object: {
+ BSONObjBuilder subb;
+ BSONObjIterator jt(e.embeddedObject());
+ while (jt.more()) {
+ append(subb , jt.next());
+ }
+ b.append(b.numStr(i++), subb.obj());
+ break;
+ }
+ default:
+ if (_include)
+ b.appendAs(e, b.numStr(i++));
+ }
+ }
+ }
+
+ void Projection::append( BSONObjBuilder& b , const BSONElement& e ) const {
+ FieldMap::const_iterator field = _fields.find( e.fieldName() );
+
+ if (field == _fields.end()) {
+ if (_include)
+ b.append(e);
+ }
+ else {
+ Projection& subfm = *field->second;
+
+ if ((subfm._fields.empty() && !subfm._special) || !(e.type()==Object || e.type()==Array) ) {
+ if (subfm._include)
+ b.append(e);
+ }
+ else if (e.type() == Object) {
+ BSONObjBuilder subb;
+ BSONObjIterator it(e.embeddedObject());
+ while (it.more()) {
+ subfm.append(subb, it.next());
+ }
+ b.append(e.fieldName(), subb.obj());
+
+ }
+ else { //Array
+ BSONObjBuilder subb;
+ subfm.appendArray(subb, e.embeddedObject());
+ b.appendArray(e.fieldName(), subb.obj());
+ }
+ }
+ }
+
+ Projection::KeyOnly* Projection::checkKey( const BSONObj& keyPattern ) const {
+ if ( _include ) {
+ // if we default to including then we can't
+ // use an index because we don't know what we're missing
+ return 0;
+ }
+
+ if ( _hasNonSimple )
+ return 0;
+
+ if ( _includeID && keyPattern["_id"].eoo() )
+ return 0;
+
+ // at this point we know its all { x : 1 } style
+
+ auto_ptr<KeyOnly> p( new KeyOnly() );
+
+ int got = 0;
+ BSONObjIterator i( keyPattern );
+ while ( i.more() ) {
+ BSONElement k = i.next();
+
+ if ( _source[k.fieldName()].type() ) {
+
+ if ( strchr( k.fieldName() , '.' ) ) {
+ // TODO we currently don't support dotted fields
+ // SERVER-2104
+ return 0;
+ }
+
+ if ( ! _includeID && mongoutils::str::equals( k.fieldName() , "_id" ) ) {
+ p->addNo();
+ }
+ else {
+ p->addYes( k.fieldName() );
+ got++;
+ }
+ }
+ else if ( mongoutils::str::equals( "_id" , k.fieldName() ) && _includeID ) {
+ p->addYes( "_id" );
+ }
+ else {
+ p->addNo();
+ }
+
+ }
+
+ int need = _source.nFields();
+ if ( ! _includeID )
+ need--;
+
+ if ( got == need )
+ return p.release();
+
+ return 0;
+ }
+
+ BSONObj Projection::KeyOnly::hydrate( const BSONObj& key ) const {
+ assert( _include.size() == _names.size() );
+
+ BSONObjBuilder b( key.objsize() + _stringSize + 16 );
+
+ BSONObjIterator i(key);
+ unsigned n=0;
+ while ( i.more() ) {
+ assert( n < _include.size() );
+ BSONElement e = i.next();
+ if ( _include[n] ) {
+ b.appendAs( e , _names[n] );
+ }
+ n++;
+ }
+
+ return b.obj();
+ }
+}
diff --git a/src/mongo/db/projection.h b/src/mongo/db/projection.h
new file mode 100644
index 00000000000..b5e0a0c4289
--- /dev/null
+++ b/src/mongo/db/projection.h
@@ -0,0 +1,129 @@
+// projection.h
+
+/* Copyright 2009 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "pch.h"
+#include "jsobj.h"
+
+namespace mongo {
+
+ /**
+ * given a document and a projection specification
+ * can transform the document
+ * currently supports specifying which fields and $slice
+ */
+ class Projection {
+ public:
+
+ class KeyOnly {
+ public:
+
+ KeyOnly() : _stringSize(0) {}
+
+ BSONObj hydrate( const BSONObj& key ) const;
+
+ void addNo() { _add( false , "" ); }
+ void addYes( const string& name ) { _add( true , name ); }
+
+ private:
+
+ void _add( bool b , const string& name ) {
+ _include.push_back( b );
+ _names.push_back( name );
+ _stringSize += name.size();
+ }
+
+ vector<bool> _include; // one entry per field in key. true iff should be in output
+ vector<string> _names; // name of field since key doesn't have names
+
+ int _stringSize;
+ };
+
+ Projection() :
+ _include(true) ,
+ _special(false) ,
+ _includeID(true) ,
+ _skip(0) ,
+ _limit(-1) ,
+ _hasNonSimple(false) {
+ }
+
+ /**
+ * called once per lifetime
+ * e.g. { "x" : 1 , "a.y" : 1 }
+ */
+ void init( const BSONObj& spec );
+
+ /**
+ * @return the spec init was called with
+ */
+ BSONObj getSpec() const { return _source; }
+
+ /**
+ * transforms in according to spec
+ */
+ BSONObj transform( const BSONObj& in ) const;
+
+
+ /**
+ * transforms in according to spec
+ */
+ void transform( const BSONObj& in , BSONObjBuilder& b ) const;
+
+
+ /**
+ * @return if the keyPattern has all the information needed to return then
+ * return a new KeyOnly otherwise null
+ * NOTE: a key may have modified the actual data
+ * which has to be handled above this (arrays, geo)
+ */
+ KeyOnly* checkKey( const BSONObj& keyPattern ) const;
+
+ bool includeID() const { return _includeID; }
+
+ private:
+
+ /**
+ * appends e to b if user wants it
+ * will descend into e if needed
+ */
+ void append( BSONObjBuilder& b , const BSONElement& e ) const;
+
+
+ void add( const string& field, bool include );
+ void add( const string& field, int skip, int limit );
+ void appendArray( BSONObjBuilder& b , const BSONObj& a , bool nested=false) const;
+
+ bool _include; // true if default at this level is to include
+ bool _special; // true if this level can't be skipped or included without recursing
+
+ //TODO: benchmark vector<pair> vs map
+ typedef map<string, boost::shared_ptr<Projection> > FieldMap;
+ FieldMap _fields;
+ BSONObj _source;
+ bool _includeID;
+
+ // used for $slice operator
+ int _skip;
+ int _limit;
+
+ bool _hasNonSimple;
+ };
+
+
+}
diff --git a/src/mongo/db/queryoptimizer.cpp b/src/mongo/db/queryoptimizer.cpp
new file mode 100644
index 00000000000..9d9040d51e2
--- /dev/null
+++ b/src/mongo/db/queryoptimizer.cpp
@@ -0,0 +1,1337 @@
+// @file queryoptimizer.cpp
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+
+#include "db.h"
+#include "btree.h"
+#include "pdfile.h"
+#include "queryoptimizer.h"
+#include "cmdline.h"
+#include "clientcursor.h"
+
+//#define DEBUGQO(x) cout << x << endl;
+#define DEBUGQO(x)
+
+namespace mongo {
+
+ void checkTableScanAllowed( const char * ns ) {
+ if ( ! cmdLine.noTableScan )
+ return;
+
+ if ( strstr( ns , ".system." ) ||
+ strstr( ns , "local." ) )
+ return;
+
+ if ( ! nsdetails( ns ) )
+ return;
+
+ uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.noTableScan );
+ }
+
+ double elementDirection( const BSONElement &e ) {
+ if ( e.isNumber() )
+ return e.number();
+ return 1;
+ }
+
+ QueryPlan::QueryPlan(
+ NamespaceDetails *d, int idxNo,
+ const FieldRangeSetPair &frsp, const FieldRangeSetPair *originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONObj &startKey, const BSONObj &endKey , string special ) :
+ _d(d), _idxNo(idxNo),
+ _frs( frsp.frsForIndex( _d, _idxNo ) ),
+ _frsMulti( frsp.frsForIndex( _d, -1 ) ),
+ _originalQuery( originalQuery ),
+ _order( order ),
+ _index( 0 ),
+ _optimal( false ),
+ _scanAndOrderRequired( true ),
+ _exactKeyMatch( false ),
+ _direction( 0 ),
+ _endKeyInclusive( endKey.isEmpty() ),
+ _unhelpful( false ),
+ _impossible( false ),
+ _special( special ),
+ _type(0),
+ _startOrEndSpec( !startKey.isEmpty() || !endKey.isEmpty() ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+
+ BSONObj idxKey = _idxNo < 0 ? BSONObj() : d->idx( _idxNo ).keyPattern();
+
+ if ( !_frs.matchPossibleForIndex( idxKey ) ) {
+ _impossible = true;
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ if ( willScanTable() ) {
+ if ( _order.isEmpty() || !strcmp( _order.firstElementFieldName(), "$natural" ) )
+ _scanAndOrderRequired = false;
+ return;
+ }
+
+ _index = &d->idx(_idxNo);
+
+ // If the parsing or index indicates this is a special query, don't continue the processing
+ if ( _special.size() ||
+ ( _index->getSpec().getType() && _index->getSpec().getType()->suitability( originalQuery, order ) != USELESS ) ) {
+
+ if( _special.size() ) _optimal = true;
+
+ _type = _index->getSpec().getType();
+ if( !_special.size() ) _special = _index->getSpec().getType()->getPlugin()->getName();
+
+ massert( 13040 , (string)"no type for special: " + _special , _type );
+ // hopefully safe to use original query in these contexts - don't think we can mix special with $or clause separation yet
+ _scanAndOrderRequired = _type->scanAndOrderRequired( _originalQuery , order );
+ return;
+ }
+
+ const IndexSpec &idxSpec = _index->getSpec();
+ BSONObjIterator o( order );
+ BSONObjIterator k( idxKey );
+ if ( !o.moreWithEOO() )
+ _scanAndOrderRequired = false;
+ while( o.moreWithEOO() ) {
+ BSONElement oe = o.next();
+ if ( oe.eoo() ) {
+ _scanAndOrderRequired = false;
+ break;
+ }
+ if ( !k.moreWithEOO() )
+ break;
+ BSONElement ke;
+ while( 1 ) {
+ ke = k.next();
+ if ( ke.eoo() )
+ goto doneCheckOrder;
+ if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 )
+ break;
+ if ( !_frs.range( ke.fieldName() ).equality() )
+ goto doneCheckOrder;
+ }
+ int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1;
+ if ( _direction == 0 )
+ _direction = d;
+ else if ( _direction != d )
+ break;
+ }
+doneCheckOrder:
+ if ( _scanAndOrderRequired )
+ _direction = 0;
+ BSONObjIterator i( idxKey );
+ int exactIndexedQueryCount = 0;
+ int optimalIndexedQueryCount = 0;
+ bool stillOptimalIndexedQueryCount = true;
+ set<string> orderFieldsUnindexed;
+ order.getFieldNames( orderFieldsUnindexed );
+ while( i.moreWithEOO() ) {
+ BSONElement e = i.next();
+ if ( e.eoo() )
+ break;
+ const FieldRange &fr = _frs.range( e.fieldName() );
+ if ( stillOptimalIndexedQueryCount ) {
+ if ( fr.nontrivial() )
+ ++optimalIndexedQueryCount;
+ if ( !fr.equality() )
+ stillOptimalIndexedQueryCount = false;
+ }
+ else {
+ if ( fr.nontrivial() )
+ optimalIndexedQueryCount = -1;
+ }
+ if ( fr.equality() ) {
+ BSONElement e = fr.max();
+ if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx )
+ ++exactIndexedQueryCount;
+ }
+ orderFieldsUnindexed.erase( e.fieldName() );
+ }
+ if ( !_scanAndOrderRequired &&
+ ( optimalIndexedQueryCount == _frs.nNontrivialRanges() ) )
+ _optimal = true;
+ if ( exactIndexedQueryCount == _frs.nNontrivialRanges() &&
+ orderFieldsUnindexed.size() == 0 &&
+ exactIndexedQueryCount == idxKey.nFields() &&
+ exactIndexedQueryCount == _originalQuery.nFields() ) {
+ _exactKeyMatch = true;
+ }
+ _frv.reset( new FieldRangeVector( _frs, idxSpec, _direction ) );
+ if ( originalFrsp ) {
+ _originalFrv.reset( new FieldRangeVector( originalFrsp->frsForIndex( _d, _idxNo ), idxSpec, _direction ) );
+ }
+ else {
+ _originalFrv = _frv;
+ }
+ if ( _startOrEndSpec ) {
+ BSONObj newStart, newEnd;
+ if ( !startKey.isEmpty() )
+ _startKey = startKey;
+ else
+ _startKey = _frv->startKey();
+ if ( !endKey.isEmpty() )
+ _endKey = endKey;
+ else
+ _endKey = _frv->endKey();
+ }
+
+ if ( ( _scanAndOrderRequired || _order.isEmpty() ) &&
+ !_frs.range( idxKey.firstElementFieldName() ).nontrivial() ) {
+ _unhelpful = true;
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const {
+
+ if ( _type ) {
+ // hopefully safe to use original query in these contexts - don't think we can mix type with $or clause separation yet
+ return _type->newCursor( _originalQuery , _order , numWanted );
+ }
+
+ if ( _impossible ) {
+ // TODO We might want to allow this dummy table scan even in no table
+ // scan mode, since it won't scan anything.
+ if ( _frs.nNontrivialRanges() )
+ checkTableScanAllowed( _frs.ns() );
+ return shared_ptr<Cursor>( new BasicCursor( DiskLoc() ) );
+ }
+
+ if ( willScanTable() ) {
+ if ( _frs.nNontrivialRanges() ) {
+ checkTableScanAllowed( _frs.ns() );
+
+ // if we are doing a table scan on _id
+ // and it's a capped collection
+ // we warn /*disallow*/ as it's a common user error
+ // .system. and local collections are exempt
+ if ( _d && _d->capped && _frs.range( "_id" ).nontrivial() ) {
+ if ( cc().isSyncThread() ||
+ str::contains( _frs.ns() , ".system." ) ||
+ str::startsWith( _frs.ns() , "local." ) ) {
+ // ok
+ }
+ else {
+ warning() << "_id query on capped collection without an _id index, performance will be poor collection: " << _frs.ns() << endl;
+ //uassert( 14820, str::stream() << "doing _id query on a capped collection without an index is not allowed: " << _frs.ns() ,
+ }
+ }
+ }
+ return findTableScan( _frs.ns(), _order, startLoc );
+ }
+
+ massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() );
+
+ if ( _startOrEndSpec ) {
+ // we are sure to spec _endKeyInclusive
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _startKey, _endKey, _endKeyInclusive, _direction >= 0 ? 1 : -1 ) );
+ }
+ else if ( _index->getSpec().getType() ) {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv->startKey(), _frv->endKey(), true, _direction >= 0 ? 1 : -1 ) );
+ }
+ else {
+ return shared_ptr<Cursor>( BtreeCursor::make( _d, _idxNo, *_index, _frv, _direction >= 0 ? 1 : -1 ) );
+ }
+ }
+
+ shared_ptr<Cursor> QueryPlan::newReverseCursor() const {
+ if ( willScanTable() ) {
+ int orderSpec = _order.getIntField( "$natural" );
+ if ( orderSpec == INT_MIN )
+ orderSpec = 1;
+ return findTableScan( _frs.ns(), BSON( "$natural" << -orderSpec ) );
+ }
+ massert( 10364 , "newReverseCursor() not implemented for indexed plans", false );
+ return shared_ptr<Cursor>();
+ }
+
+ BSONObj QueryPlan::indexKey() const {
+ if ( !_index )
+ return BSON( "$natural" << 1 );
+ return _index->keyPattern();
+ }
+
+ void QueryPlan::registerSelf( long long nScanned ) const {
+ // Impossible query constraints can be detected before scanning, and we
+ // don't have a reserved pattern enum value for impossible constraints.
+ if ( _impossible ) {
+ return;
+ }
+
+ SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex);
+ NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( _frs.pattern( _order ), indexKey(), nScanned );
+ }
+
+ /**
+ * @return a copy of the inheriting class, which will be run with its own
+ * query plan. If multiple plan sets are required for an $or query, the
+ * QueryOp of the winning plan from a given set will be cloned to generate
+ * QueryOps for the subsequent plan set. This function should only be called
+ * after the query op has completed executing.
+ */
+ QueryOp *QueryOp::createChild() {
+ if( _orConstraint.get() ) {
+ _matcher->advanceOrClause( _orConstraint );
+ _orConstraint.reset();
+ }
+ QueryOp *ret = _createChild();
+ ret->_oldMatcher = _matcher;
+ return ret;
+ }
+
+ bool QueryPlan::isMultiKey() const {
+ if ( _idxNo < 0 )
+ return false;
+ return _d->isMultikey( _idxNo );
+ }
+
+ void QueryOp::init() {
+ if ( _oldMatcher.get() ) {
+ _matcher.reset( _oldMatcher->nextClauseMatcher( qp().indexKey() ) );
+ }
+ else {
+ _matcher.reset( new CoveredIndexMatcher( qp().originalQuery(), qp().indexKey(), alwaysUseRecord() ) );
+ }
+ _init();
+ }
+
+ QueryPlanSet::QueryPlanSet( const char *ns, auto_ptr<FieldRangeSetPair> frsp, auto_ptr<FieldRangeSetPair> originalFrsp, const BSONObj &originalQuery, const BSONObj &order, bool mustAssertOnYieldFailure, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max, bool bestGuessOnly, bool mayYield ) :
+ _ns(ns),
+ _originalQuery( originalQuery ),
+ _frsp( frsp ),
+ _originalFrsp( originalFrsp ),
+ _mayRecordPlan( false ),
+ _usingCachedPlan( false ),
+ _hint( BSONObj() ),
+ _order( order.getOwned() ),
+ _oldNScanned( 0 ),
+ _honorRecordedPlan( honorRecordedPlan ),
+ _min( min.getOwned() ),
+ _max( max.getOwned() ),
+ _bestGuessOnly( bestGuessOnly ),
+ _mayYield( mayYield ),
+ _yieldSometimesTracker( 256, 20 ),
+ _mustAssertOnYieldFailure( mustAssertOnYieldFailure ) {
+ if ( hint && !hint->eoo() ) {
+ _hint = hint->wrap();
+ }
+ init();
+ }
+
+ bool QueryPlanSet::modifiedKeys() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+ bool QueryPlanSet::hasMultiKey() const {
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i )
+ if ( (*i)->isMultiKey() )
+ return true;
+ return false;
+ }
+
+
+ void QueryPlanSet::addHint( IndexDetails &id ) {
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern = id.keyPattern();
+ // This reformats _min and _max to be used for index lookup.
+ massert( 10365 , errmsg, indexDetailsForRange( _frsp->ns(), errmsg, _min, _max, keyPattern ) );
+ }
+ NamespaceDetails *d = nsdetails(_ns);
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(id), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ }
+
+ // returns an IndexDetails * for a hint, 0 if hint is $natural.
+ // hint must not be eoo()
+ IndexDetails *parseHint( const BSONElement &hint, NamespaceDetails *d ) {
+ massert( 13292, "hint eoo", !hint.eoo() );
+ if( hint.type() == String ) {
+ string hintstr = hint.valuestr();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if ( ii.indexName() == hintstr ) {
+ return &ii;
+ }
+ }
+ }
+ else if( hint.type() == Object ) {
+ BSONObj hintobj = hint.embeddedObject();
+ uassert( 10112 , "bad hint", !hintobj.isEmpty() );
+ if ( !strcmp( hintobj.firstElementFieldName(), "$natural" ) ) {
+ return 0;
+ }
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(hintobj) == 0 ) {
+ return &ii;
+ }
+ }
+ }
+ uassert( 10113 , "bad hint", false );
+ return 0;
+ }
+
+ void QueryPlanSet::init() {
+ DEBUGQO( "QueryPlanSet::init " << ns << "\t" << _originalQuery );
+ _runner.reset();
+ _plans.clear();
+ _usingCachedPlan = false;
+
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d || !_frsp->matchPossible() ) {
+ // Table scan plan, when no matches are possible
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ BSONElement hint = _hint.firstElement();
+ if ( !hint.eoo() ) {
+ IndexDetails *id = parseHint( hint, d );
+ if ( id ) {
+ addHint( *id );
+ }
+ else {
+ massert( 10366 , "natural order cannot be specified with $min/$max", _min.isEmpty() && _max.isEmpty() );
+ // Table scan plan
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ }
+ return;
+ }
+
+ if ( !_min.isEmpty() || !_max.isEmpty() ) {
+ string errmsg;
+ BSONObj keyPattern;
+ IndexDetails *idx = indexDetailsForRange( ns, errmsg, _min, _max, keyPattern );
+ massert( 10367 , errmsg, idx );
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, d->idxNo(*idx), *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure, _min, _max ) ) );
+ return;
+ }
+
+ if ( isSimpleIdQuery( _originalQuery ) ) {
+ int idx = d->findIdIndex();
+ if ( idx >= 0 ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , idx , *_frsp , _originalFrsp.get() , _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+ }
+
+ if ( _originalQuery.isEmpty() && _order.isEmpty() ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ) );
+ return;
+ }
+
+ DEBUGQO( "\t special : " << _frsp->getSpecial() );
+ if ( _frsp->getSpecial().size() ) {
+ _special = _frsp->getSpecial();
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ const IndexSpec& spec = ii.getSpec();
+ if ( spec.getTypeName() == _special && spec.suitability( _originalQuery , _order ) ) {
+ _plans.push_back( QueryPlanPtr( new QueryPlan( d , j , *_frsp , _originalFrsp.get() , _originalQuery, _order ,
+ _mustAssertOnYieldFailure , BSONObj() , BSONObj() , _special ) ) );
+ return;
+ }
+ }
+ uassert( 13038 , (string)"can't find special index: " + _special + " for: " + _originalQuery.toString() , 0 );
+ }
+
+ if ( _honorRecordedPlan ) {
+ pair< BSONObj, long long > best = QueryUtilIndexed::bestIndexForPatterns( *_frsp, _order );
+ BSONObj bestIndex = best.first;
+ long long oldNScanned = best.second;
+ if ( !bestIndex.isEmpty() ) {
+ QueryPlanPtr p;
+ _oldNScanned = oldNScanned;
+ if ( !strcmp( bestIndex.firstElementFieldName(), "$natural" ) ) {
+ // Table scan plan
+ p.reset( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+
+ NamespaceDetails::IndexIterator i = d->ii();
+ while( i.more() ) {
+ int j = i.pos();
+ IndexDetails& ii = i.next();
+ if( ii.keyPattern().woCompare(bestIndex) == 0 ) {
+ p.reset( new QueryPlan( d, j, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ }
+ }
+
+ massert( 10368 , "Unable to locate previously recorded index", p.get() );
+ if ( !( _bestGuessOnly && p->scanAndOrderRequired() ) ) {
+ _usingCachedPlan = true;
+ _plans.push_back( p );
+ return;
+ }
+ }
+ }
+
+ addOtherPlans( false );
+ }
+
+ void QueryPlanSet::addOtherPlans( bool checkFirst ) {
+ const char *ns = _frsp->ns();
+ NamespaceDetails *d = nsdetails( ns );
+ if ( !d )
+ return;
+
+ // If table scan is optimal or natural order requested or tailable cursor requested
+ if ( !_frsp->matchPossible() || ( _frsp->noNontrivialRanges() && _order.isEmpty() ) ||
+ ( !_order.isEmpty() && !strcmp( _order.firstElementFieldName(), "$natural" ) ) ) {
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ return;
+ }
+
+ bool normalQuery = _hint.isEmpty() && _min.isEmpty() && _max.isEmpty();
+
+ PlanSet plans;
+ QueryPlanPtr optimalPlan;
+ QueryPlanPtr specialPlan;
+ for( int i = 0; i < d->nIndexes; ++i ) {
+ if ( normalQuery ) {
+ BSONObj keyPattern = d->idx( i ).keyPattern();
+ if ( !_frsp->matchPossibleForIndex( d, i, keyPattern ) ) {
+ // If no match is possible, only generate a trivial plan that won't
+ // scan any documents.
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ addPlan( p, checkFirst );
+ return;
+ }
+ if ( !QueryUtilIndexed::indexUseful( *_frsp, d, i, _order ) ) {
+ continue;
+ }
+ }
+
+ QueryPlanPtr p( new QueryPlan( d, i, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) );
+ if ( p->optimal() ) {
+ if ( !optimalPlan.get() ) {
+ optimalPlan = p;
+ }
+ }
+ else if ( !p->unhelpful() ) {
+ if ( p->special().empty() ) {
+ plans.push_back( p );
+ }
+ else {
+ specialPlan = p;
+ }
+ }
+ }
+ if ( optimalPlan.get() ) {
+ addPlan( optimalPlan, checkFirst );
+ return;
+ }
+ for( PlanSet::const_iterator i = plans.begin(); i != plans.end(); ++i ) {
+ addPlan( *i, checkFirst );
+ }
+
+ // Only add a special plan if no standard btree plans have been added. SERVER-4531
+ if ( plans.empty() && specialPlan ) {
+ addPlan( specialPlan, checkFirst );
+ return;
+ }
+
+ // Table scan plan
+ addPlan( QueryPlanPtr( new QueryPlan( d, -1, *_frsp, _originalFrsp.get(), _originalQuery, _order, _mustAssertOnYieldFailure ) ), checkFirst );
+ _mayRecordPlan = true;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::runOp( QueryOp &op ) {
+ if ( _usingCachedPlan ) {
+ Runner r( *this, op );
+ shared_ptr<QueryOp> res = r.runUntilFirstCompletes();
+ // _plans.size() > 1 if addOtherPlans was called in Runner::runUntilFirstCompletes().
+ if ( _bestGuessOnly || res->complete() || _plans.size() > 1 )
+ return res;
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+ // Careful here, as the namespace may have been dropped.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ }
+ Runner r( *this, op );
+ return r.runUntilFirstCompletes();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::nextOp( QueryOp &originalOp, bool retried ) {
+ if ( !_runner ) {
+ _runner.reset( new Runner( *this, originalOp ) );
+ shared_ptr<QueryOp> op = _runner->init();
+ if ( op->complete() ) {
+ return op;
+ }
+ }
+ shared_ptr<QueryOp> op = _runner->nextNonError();
+ if ( !op->error() ) {
+ return op;
+ }
+ if ( !_usingCachedPlan || _bestGuessOnly || _plans.size() > 1 ) {
+ return op;
+ }
+
+ // Avoid an infinite loop here - this should never occur.
+ verify( 15878, !retried );
+
+ // A cached plan was used, so clear the plan for this query pattern and retry the query without a cached plan.
+ QueryUtilIndexed::clearIndexesForPatterns( *_frsp, _order );
+ init();
+ return nextOp( originalOp, true );
+ }
+
+ bool QueryPlanSet::prepareToYield() {
+ return _runner ? _runner->prepareToYield() : true;
+ }
+
+ void QueryPlanSet::recoverFromYield() {
+ if ( _runner ) {
+ _runner->recoverFromYield();
+ }
+ }
+
+ void QueryPlanSet::clearRunner() {
+ if ( _runner ) {
+ _runner.reset();
+ }
+ }
+
+ BSONObj QueryPlanSet::explain() const {
+ vector<BSONObj> arr;
+ for( PlanSet::const_iterator i = _plans.begin(); i != _plans.end(); ++i ) {
+ shared_ptr<Cursor> c = (*i)->newCursor();
+ BSONObjBuilder explain;
+ explain.append( "cursor", c->toString() );
+ explain.append( "indexBounds", c->prettyIndexBounds() );
+ arr.push_back( explain.obj() );
+ }
+ BSONObjBuilder b;
+ b.append( "allPlans", arr );
+ return b.obj();
+ }
+
+ QueryPlanSet::QueryPlanPtr QueryPlanSet::getBestGuess() const {
+ assert( _plans.size() );
+ if ( _plans[ 0 ]->scanAndOrderRequired() ) {
+ for ( unsigned i=1; i<_plans.size(); i++ ) {
+ if ( ! _plans[i]->scanAndOrderRequired() )
+ return _plans[i];
+ }
+
+ warning() << "best guess query plan requested, but scan and order are required for all plans "
+ << " query: " << _originalQuery
+ << " order: " << _order
+ << " choices: ";
+
+ for ( unsigned i=0; i<_plans.size(); i++ )
+ warning() << _plans[i]->indexKey() << " ";
+ warning() << endl;
+
+ return QueryPlanPtr();
+ }
+ return _plans[0];
+ }
+
+ QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) :
+ _op( op ),
+ _plans( plans ) {
+ }
+
+ bool QueryPlanSet::Runner::prepareToYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !prepareToYieldOp( **i ) ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ void QueryPlanSet::Runner::recoverFromYield() {
+ for( vector<shared_ptr<QueryOp> >::const_iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ recoverFromYieldOp( **i );
+ }
+ }
+
+ void QueryPlanSet::Runner::mayYield() {
+ if ( ! _plans._mayYield )
+ return;
+
+ if ( ! _plans._yieldSometimesTracker.intervalHasElapsed() )
+ return;
+
+ int micros = ClientCursor::suggestYieldMicros();
+ if ( micros <= 0 )
+ return;
+
+ if ( !prepareToYield() )
+ return;
+
+ ClientCursor::staticYield( micros , _plans._ns , 0 );
+ recoverFromYield();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::init() {
+ massert( 10369 , "no plans", _plans._plans.size() > 0 );
+
+ if ( _plans._bestGuessOnly ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ shared_ptr<QueryPlan> plan = _plans.getBestGuess();
+ massert( 15894, "no index matches QueryPlanSet's sort with _bestGuessOnly", plan.get() );
+ op->setQueryPlan( plan.get() );
+ _ops.push_back( op );
+ }
+ else {
+ if ( _plans._plans.size() > 1 )
+ log(1) << " running multiple plans" << endl;
+ for( PlanSet::iterator i = _plans._plans.begin(); i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ }
+ }
+
+ // Initialize ops.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ initOp( **i );
+ if ( (*i)->complete() )
+ return *i;
+ }
+
+ // Put runnable ops in the priority queue.
+ for( vector<shared_ptr<QueryOp> >::iterator i = _ops.begin(); i != _ops.end(); ++i ) {
+ if ( !(*i)->error() ) {
+ _queue.push( *i );
+ }
+ }
+
+ return *_ops.begin();
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::nextNonError() {
+ if ( _queue.empty() ) {
+ return *_ops.begin();
+ }
+ shared_ptr<QueryOp> ret;
+ do {
+ ret = next();
+ } while( ret->error() && !_queue.empty() );
+ return ret;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::next() {
+ mayYield();
+ dassert( !_queue.empty() );
+ OpHolder holder = _queue.pop();
+ QueryOp &op = *holder._op;
+ nextOp( op );
+ if ( op.complete() ) {
+ if ( _plans._mayRecordPlan && op.mayRecordPlan() ) {
+ op.qp().registerSelf( op.nscanned() );
+ }
+ return holder._op;
+ }
+ if ( op.error() ) {
+ return holder._op;
+ }
+ if ( !_plans._bestGuessOnly && _plans._usingCachedPlan && op.nscanned() > _plans._oldNScanned * 10 && _plans._special.empty() ) {
+ holder._offset = -op.nscanned();
+ _plans.addOtherPlans( /* avoid duplicating the initial plan */ true );
+ PlanSet::iterator i = _plans._plans.begin();
+ ++i;
+ for( ; i != _plans._plans.end(); ++i ) {
+ shared_ptr<QueryOp> op( _op.createChild() );
+ op->setQueryPlan( i->get() );
+ _ops.push_back( op );
+ initOp( *op );
+ if ( op->complete() )
+ return op;
+ _queue.push( op );
+ }
+ _plans._usingCachedPlan = false;
+ }
+ _queue.push( holder );
+ return holder._op;
+ }
+
+ shared_ptr<QueryOp> QueryPlanSet::Runner::runUntilFirstCompletes() {
+ shared_ptr<QueryOp> potentialFinisher = init();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+
+ while( !_queue.empty() ) {
+ shared_ptr<QueryOp> potentialFinisher = next();
+ if ( potentialFinisher->complete() ) {
+ return potentialFinisher;
+ }
+ }
+ return _ops[ 0 ];
+ }
+
+#define GUARD_OP_EXCEPTION( op, expression ) \
+ try { \
+ expression; \
+ } \
+ catch ( DBException& e ) { \
+ op.setException( e.getInfo() ); \
+ } \
+ catch ( const std::exception &e ) { \
+ op.setException( ExceptionInfo( e.what() , 0 ) ); \
+ } \
+ catch ( ...
) { \ + op.setException( ExceptionInfo( "Caught unknown exception" , 0 ) ); \ + } + + + void QueryPlanSet::Runner::initOp( QueryOp &op ) { + GUARD_OP_EXCEPTION( op, op.init() ); + } + + void QueryPlanSet::Runner::nextOp( QueryOp &op ) { + GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.next(); } ); + } + + bool QueryPlanSet::Runner::prepareToYieldOp( QueryOp &op ) { + GUARD_OP_EXCEPTION( op, + if ( op.error() ) { + return true; + } + else { + return op.prepareToYield(); + } ); + return true; + } + + void QueryPlanSet::Runner::recoverFromYieldOp( QueryOp &op ) { + GUARD_OP_EXCEPTION( op, if ( !op.error() ) { op.recoverFromYield(); } ); + } + + /** + * NOTE on our $or implementation: In our current qo implementation we don't + * keep statistics on our data, but we can conceptualize the problem of + * selecting an index when statistics exist for all index ranges. The + * d-hitting set problem on k sets and n elements can be reduced to the + * problem of index selection on k $or clauses and n index ranges (where + * d is the max number of indexes, and the number of ranges n is unbounded). + * In light of the fact that d-hitting set is np complete, and we don't even + * track statistics (so cost calculations are expensive) our first + * implementation uses the following greedy approach: We take one $or clause + * at a time and treat each as a separate query for index selection purposes. + * But if an index range is scanned for a particular $or clause, we eliminate + * that range from all subsequent clauses. One could imagine an opposite + * implementation where we select indexes based on the union of index ranges + * for all $or clauses, but this can have much poorer worst case behavior. + * (An index range that suits one $or clause may not suit another, and this + * is worse than the typical case of index range choice staleness because + * with $or the clauses may likely be logically distinct.) The greedy + * implementation won't do any worse than all the $or clauses individually, + * and it can often do better. In the first cut we are intentionally using + * QueryPattern tracking to record successful plans on $or clauses for use by + * subsequent $or clauses, even though there may be a significant aggregate + * $nor component that would not be represented in QueryPattern. + */ + + MultiPlanScanner::MultiPlanScanner( const char *ns, + const BSONObj &query, + const BSONObj &order, + const BSONElement *hint, + bool honorRecordedPlan, + const BSONObj &min, + const BSONObj &max, + bool bestGuessOnly, + bool mayYield ) : + _ns( ns ), + _or( !query.getField( "$or" ).eoo() ), + _query( query.getOwned() ), + _i(), + _honorRecordedPlan( honorRecordedPlan ), + _bestGuessOnly( bestGuessOnly ), + _hint( ( hint && !hint->eoo() ) ? hint->wrap() : BSONObj() ), + _mayYield( mayYield ), + _tableScanned() { + if ( !order.isEmpty() || !min.isEmpty() || !max.isEmpty() ) { + _or = false; + } + if ( _or ) { + // Only construct an OrRangeGenerator if we may handle $or clauses. 
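+            // Added commentary: for a query such as
+            //     { $or : [ { a : 1 }, { b : 2 } ] }
+            // the OrRangeGenerator yields one field range clause per $or branch, and
+            // each branch is then planned independently by its own QueryPlanSet. A
+            // special (e.g. geo) range or a hint that is useless across the branches
+            // demotes the query to ordinary non-$or planning, per the checks below.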
+ _org.reset( new OrRangeGenerator( ns, _query ) ); + if ( !_org->getSpecial().empty() ) { + _or = false; + } + else if ( uselessOr( _hint.firstElement() ) ) { + _or = false; + } + } + // if _or == false, don't use or clauses for index selection + if ( !_or ) { + auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, _query, true ) ); + _currentQps.reset( new QueryPlanSet( ns, frsp, auto_ptr<FieldRangeSetPair>(), _query, order, false, hint, honorRecordedPlan, min, max, _bestGuessOnly, _mayYield ) ); + } + else { + BSONElement e = _query.getField( "$or" ); + massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 ); + } + } + + shared_ptr<QueryOp> MultiPlanScanner::runOpOnce( QueryOp &op ) { + assertMayRunMore(); + if ( !_or ) { + ++_i; + return _currentQps->runOp( op ); + } + ++_i; + auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() ); + auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() ); + BSONElement hintElt = _hint.firstElement(); + _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) ); + shared_ptr<QueryOp> ret( _currentQps->runOp( op ) ); + if ( ! ret->complete() ) + throw MsgAssertionException( ret->exception() ); + if ( ret->qp().willScanTable() ) { + _tableScanned = true; + } else { + // If the full table was scanned, don't bother popping the last or clause. + _org->popOrClause( ret->qp().nsd(), ret->qp().idxNo(), ret->qp().indexed() ? ret->qp().indexKey() : BSONObj() ); + } + return ret; + } + + shared_ptr<QueryOp> MultiPlanScanner::runOp( QueryOp &op ) { + shared_ptr<QueryOp> ret = runOpOnce( op ); + while( !ret->stopRequested() && mayRunMore() ) { + ret = runOpOnce( *ret ); + } + return ret; + } + + shared_ptr<QueryOp> MultiPlanScanner::nextOpHandleEndOfClause() { + shared_ptr<QueryOp> op = _currentQps->nextOp( *_baseOp ); + if ( !op->complete() ) { + return op; + } + if ( op->qp().willScanTable() ) { + _tableScanned = true; + } else { + _org->popOrClause( op->qp().nsd(), op->qp().idxNo(), op->qp().indexed() ? op->qp().indexKey() : BSONObj() ); + } + return op; + } + + shared_ptr<QueryOp> MultiPlanScanner::nextOpBeginningClause() { + assertMayRunMore(); + shared_ptr<QueryOp> op; + while( mayRunMore() ) { + ++_i; + auto_ptr<FieldRangeSetPair> frsp( _org->topFrsp() ); + auto_ptr<FieldRangeSetPair> originalFrsp( _org->topFrspOriginal() ); + BSONElement hintElt = _hint.firstElement(); + _currentQps.reset( new QueryPlanSet( _ns, frsp, originalFrsp, _query, BSONObj(), true, &hintElt, _honorRecordedPlan, BSONObj(), BSONObj(), _bestGuessOnly, _mayYield ) ); + op = nextOpHandleEndOfClause(); + if ( !op->complete() ) { + return op; + } + _baseOp = op; + } + return op; + } + + shared_ptr<QueryOp> MultiPlanScanner::nextOp() { + if ( !_or ) { + if ( _i == 0 ) { + assertMayRunMore(); + ++_i; + } + return _currentQps->nextOp( *_baseOp ); + } + if ( _i == 0 ) { + return nextOpBeginningClause(); + } + shared_ptr<QueryOp> op = nextOpHandleEndOfClause(); + if ( !op->complete() ) { + return op; + } + if ( !op->stopRequested() && mayRunMore() ) { + // Finished scanning the clause, but stop hasn't been requested. + // Start scanning the next clause. + _baseOp = op; + return nextOpBeginningClause(); + } + return op; + } + + bool MultiPlanScanner::prepareToYield() { + return _currentQps.get() ? 
_currentQps->prepareToYield() : true; + } + + void MultiPlanScanner::recoverFromYield() { + if ( _currentQps.get() ) { + _currentQps->recoverFromYield(); + } + } + + void MultiPlanScanner::clearRunner() { + if ( _currentQps.get() ) { + _currentQps->clearRunner(); + } + } + + int MultiPlanScanner::currentNPlans() const { + return _currentQps.get() ? _currentQps->nPlans() : 0; + } + + shared_ptr<Cursor> MultiPlanScanner::singleCursor() const { + const QueryPlan *qp = singlePlan(); + if ( !qp ) { + return shared_ptr<Cursor>(); + } + // If there is only one plan and it does not require an in memory + // sort, we do not expect its cursor op to throw an exception and + // so do not need a QueryOptimizerCursor to handle this case. + return qp->newCursor(); + } + + const QueryPlan *MultiPlanScanner::singlePlan() const { + if ( _or || _currentQps->nPlans() != 1 || _currentQps->firstPlan()->scanAndOrderRequired() || _currentQps->usingCachedPlan() ) { + return 0; + } + return _currentQps->firstPlan().get(); + } + + bool MultiPlanScanner::uselessOr( const BSONElement &hint ) const { + NamespaceDetails *nsd = nsdetails( _ns ); + if ( !nsd ) { + return true; + } + if ( !hint.eoo() ) { + IndexDetails *id = parseHint( hint, nsd ); + if ( !id ) { + return true; + } + return QueryUtilIndexed::uselessOr( *_org, nsd, nsd->idxNo( *id ) ); + } + return QueryUtilIndexed::uselessOr( *_org, nsd, -1 ); + } + + MultiCursor::MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op, bool mayYield ) + : _mps( new MultiPlanScanner( ns, pattern, order, 0, true, BSONObj(), BSONObj(), !op.get(), mayYield ) ), _nscanned() { + if ( op.get() ) { + _op = op; + } + else { + _op.reset( new NoOp() ); + } + if ( _mps->mayRunMore() ) { + nextClause(); + if ( !ok() ) { + advance(); + } + } + else { + _c.reset( new BasicCursor( DiskLoc() ) ); + } + } + + MultiCursor::MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned ) + : _op( new NoOp( op ) ), _c( c ), _mps( mps ), _matcher( matcher ), _nscanned( nscanned ) { + _mps->setBestGuessOnly(); + _mps->mayYield( false ); // with a NoOp, there's no need to yield in QueryPlanSet + if ( !ok() ) { + // would have been advanced by UserQueryOp if possible + advance(); + } + } + + void MultiCursor::nextClause() { + if ( _nscanned >= 0 && _c.get() ) { + _nscanned += _c->nscanned(); + } + shared_ptr<CursorOp> best = _mps->runOpOnce( *_op ); + if ( ! best->complete() ) + throw MsgAssertionException( best->exception() ); + _c = best->newCursor(); + _matcher = best->matcher( _c ); + _op = best; + } + + bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) { + BSONObjIterator p( idxPattern ); + BSONObjIterator k( sampleKey ); + int i = 0; + while( 1 ) { + BSONElement pe = p.next(); + BSONElement ke = k.next(); + if ( pe.eoo() && ke.eoo() ) + return true; + if ( pe.eoo() || ke.eoo() ) + return false; + if ( strcmp( pe.fieldName(), ke.fieldName() ) != 0 ) + return false; + if ( ( i == firstSignificantField ) && !( ( direction > 0 ) == ( pe.number() > 0 ) ) ) + return false; + ++i; + } + return false; + } + + BSONObj extremeKeyForIndex( const BSONObj &idxPattern, int baseDirection ) { + BSONObjIterator i( idxPattern ); + BSONObjBuilder b; + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + int idxDirection = e.number() >= 0 ? 
1 : -1;
+            int direction = idxDirection * baseDirection;
+            switch( direction ) {
+            case 1:
+                b.appendMaxKey( e.fieldName() );
+                break;
+            case -1:
+                b.appendMinKey( e.fieldName() );
+                break;
+            default:
+                assert( false );
+            }
+        }
+        return b.obj();
+    }
+
+    pair<int,int> keyAudit( const BSONObj &min, const BSONObj &max ) {
+        int direction = 0;
+        int firstSignificantField = 0;
+        BSONObjIterator i( min );
+        BSONObjIterator a( max );
+        while( 1 ) {
+            BSONElement ie = i.next();
+            BSONElement ae = a.next();
+            if ( ie.eoo() && ae.eoo() )
+                break;
+            if ( ie.eoo() || ae.eoo() || strcmp( ie.fieldName(), ae.fieldName() ) != 0 ) {
+                return make_pair( -1, -1 );
+            }
+            int cmp = ie.woCompare( ae );
+            if ( cmp < 0 )
+                direction = 1;
+            if ( cmp > 0 )
+                direction = -1;
+            if ( direction != 0 )
+                break;
+            ++firstSignificantField;
+        }
+        return make_pair( direction, firstSignificantField );
+    }
+
+    pair<int,int> flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) {
+        if ( min.isEmpty() || max.isEmpty() ) {
+            return make_pair( 1, -1 );
+        }
+        else {
+            return keyAudit( min, max );
+        }
+    }
+
+    // NOTE min, max, and keyPattern will be updated to be consistent with the selected index.
+    IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) {
+        if ( min.isEmpty() && max.isEmpty() ) {
+            errmsg = "one of min or max must be specified";
+            return 0;
+        }
+
+        Client::Context ctx( ns );
+        IndexDetails *id = 0;
+        NamespaceDetails *d = nsdetails( ns );
+        if ( !d ) {
+            errmsg = "ns not found";
+            return 0;
+        }
+
+        pair<int,int> ret = flexibleKeyAudit( min, max );
+        if ( ret == make_pair( -1, -1 ) ) {
+            errmsg = "min and max keys do not share pattern";
+            return 0;
+        }
+        if ( keyPattern.isEmpty() ) {
+            NamespaceDetails::IndexIterator i = d->ii();
+            while( i.more() ) {
+                IndexDetails& ii = i.next();
+                if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+                    if ( ii.getSpec().getType() == 0 ) {
+                        id = &ii;
+                        keyPattern = ii.keyPattern();
+                        break;
+                    }
+                }
+            }
+
+        }
+        else {
+            if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) {
+                errmsg = "requested keyPattern does not match specified keys";
+                return 0;
+            }
+            NamespaceDetails::IndexIterator i = d->ii();
+            while( i.more() ) {
+                IndexDetails& ii = i.next();
+                if( ii.keyPattern().woCompare(keyPattern) == 0 ) {
+                    id = &ii;
+                    break;
+                }
+                if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 &&
+                        IndexDetails::isIdIndexPattern( keyPattern ) &&
+                        ii.isIdIndex() ) {
+                    id = &ii;
+                    break;
+                }
+
+            }
+        }
+
+        if ( min.isEmpty() ) {
+            min = extremeKeyForIndex( keyPattern, -1 );
+        }
+        else if ( max.isEmpty() ) {
+            max = extremeKeyForIndex( keyPattern, 1 );
+        }
+
+        if ( !id ) {
+            errmsg = str::stream() << "no index found for specified keyPattern: " << keyPattern.toString()
+                                   << " min: " << min << " max: " << max;
+            return 0;
+        }
+
+        min = min.extractFieldsUnDotted( keyPattern );
+        max = max.extractFieldsUnDotted( keyPattern );
+
+        return id;
+    }
+
+    bool isSimpleIdQuery( const BSONObj& query ) {
+        BSONObjIterator i(query);
+
+        if( !i.more() )
+            return false;
+
+        BSONElement e = i.next();
+
+        if( i.more() )
+            return false;
+
+        if( strcmp("_id", e.fieldName()) != 0 )
+            return false;
+
+        if ( e.isSimpleType() ) // e.g. not something like { _id : { $gt : ...
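+        // Added commentary, expanding the example above (illustrative only):
+        //     { _id : 5 }             -> true   (plain value, isSimpleType())
+        //     { _id : { $gt : 5 } }   -> false  ($-operator subobject, see below)
+        //     { _id : { a : 1 } }     -> true   (plain subobject, Object case below)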
+ return true; + + if ( e.type() == Object ) + return e.Obj().firstElementFieldName()[0] != '$'; + + return false; + } + + shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ) { + if( !query.getField( "$or" ).eoo() ) { + return shared_ptr<Cursor>( new MultiCursor( ns, query, sort ) ); + } + else { + auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) ); + auto_ptr<FieldRangeSetPair> origFrsp( new FieldRangeSetPair( *frsp ) ); + + QueryPlanSet qps( ns, frsp, origFrsp, query, sort, false ); + QueryPlanSet::QueryPlanPtr qpp = qps.getBestGuess(); + if( ! qpp.get() ) return shared_ptr<Cursor>(); + + shared_ptr<Cursor> ret = qpp->newCursor(); + + // If we don't already have a matcher, supply one. + if ( !query.isEmpty() && ! ret->matcher() ) { + shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, ret->indexKeyPattern() ) ); + ret->setMatcher( matcher ); + } + return ret; + } + } + + bool QueryUtilIndexed::indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ) { + DEV frsp.assertValidIndex( d, idxNo ); + BSONObj keyPattern = d->idx( idxNo ).keyPattern(); + if ( !frsp.matchPossibleForIndex( d, idxNo, keyPattern ) ) { + // No matches are possible in the index so the index may be useful. + return true; + } + return d->idx( idxNo ).getSpec().suitability( frsp.simplifiedQueryForIndex( d, idxNo, keyPattern ), order ) != USELESS; + } + + void QueryUtilIndexed::clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) { + SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex); + NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() ); + nsd.registerIndexForPattern( frsp._singleKey.pattern( order ), BSONObj(), 0 ); + nsd.registerIndexForPattern( frsp._multiKey.pattern( order ), BSONObj(), 0 ); + } + + pair< BSONObj, long long > QueryUtilIndexed::bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ) { + SimpleMutex::scoped_lock lk(NamespaceDetailsTransient::_qcMutex); + NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( frsp.ns() ); + // TODO Maybe it would make sense to return the index with the lowest + // nscanned if there are two possibilities. 
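+        // Added commentary: the cache is keyed by QueryPattern, an abstraction of
+        // a query's shape plus its sort spec. Both the single key and multi key
+        // FieldRangeSets are probed below; whichever has a recorded winner yields
+        // its ( index key pattern, recorded nscanned ) pair, which feeds the
+        // cached-plan branch of QueryPlanSet::init().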
+ if ( frsp._singleKey.matchPossible() ) { + QueryPattern pattern = frsp._singleKey.pattern( order ); + BSONObj oldIdx = nsd.indexForPattern( pattern ); + if ( !oldIdx.isEmpty() ) { + long long oldNScanned = nsd.nScannedForPattern( pattern ); + return make_pair( oldIdx, oldNScanned ); + } + } + if ( frsp._multiKey.matchPossible() ) { + QueryPattern pattern = frsp._multiKey.pattern( order ); + BSONObj oldIdx = nsd.indexForPattern( pattern ); + if ( !oldIdx.isEmpty() ) { + long long oldNScanned = nsd.nScannedForPattern( pattern ); + return make_pair( oldIdx, oldNScanned ); + } + } + return make_pair( BSONObj(), 0 ); + } + + bool QueryUtilIndexed::uselessOr( const OrRangeGenerator &org, NamespaceDetails *d, int hintIdx ) { + for( list<FieldRangeSetPair>::const_iterator i = org._originalOrSets.begin(); i != org._originalOrSets.end(); ++i ) { + if ( hintIdx != -1 ) { + if ( !indexUseful( *i, d, hintIdx, BSONObj() ) ) { + return true; + } + } + else { + bool useful = false; + for( int j = 0; j < d->nIndexes; ++j ) { + if ( indexUseful( *i, d, j, BSONObj() ) ) { + useful = true; + break; + } + } + if ( !useful ) { + return true; + } + } + } + return false; + } + +} // namespace mongo diff --git a/src/mongo/db/queryoptimizer.h b/src/mongo/db/queryoptimizer.h new file mode 100644 index 00000000000..297c6fe9505 --- /dev/null +++ b/src/mongo/db/queryoptimizer.h @@ -0,0 +1,599 @@ +// @file queryoptimizer.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "cursor.h" +#include "jsobj.h" +#include "queryutil.h" +#include "matcher.h" +#include "../util/net/listen.h" +#include <queue> + +namespace mongo { + + class IndexDetails; + class IndexType; + class ElapsedTracker; + + /** A plan for executing a query using the given index spec and FieldRangeSet. */ + class QueryPlan : boost::noncopyable { + public: + + /** + * @param originalFrsp - original constraints for this query clause. If null, frsp will be used instead. + */ + QueryPlan(NamespaceDetails *d, + int idxNo, // -1 = no index + const FieldRangeSetPair &frsp, + const FieldRangeSetPair *originalFrsp, + const BSONObj &originalQuery, + const BSONObj &order, + bool mustAssertOnYieldFailure = true, + const BSONObj &startKey = BSONObj(), + const BSONObj &endKey = BSONObj(), + string special="" ); + + /** @return true iff no other plans should be considered. */ + bool optimal() const { return _optimal; } + /* @return true iff this plan should not be considered at all. */ + bool unhelpful() const { return _unhelpful; } + /** @return true iff ScanAndOrder processing will be required for result set. */ + bool scanAndOrderRequired() const { return _scanAndOrderRequired; } + /** + * @return true iff the index we are using has keys such that it can completely resolve the + * query expression to match by itself without ever checking the main object. 
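+         * (Added illustrative note, not from the original change: with an index
+         * { a : 1 }, a single equality such as { a : 4 } can typically be checked
+         * against index keys alone, while { a : /foo/ } or a predicate on an
+         * unindexed field forces a fetch of the main object.)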
+ */ + bool exactKeyMatch() const { return _exactKeyMatch; } + /** @return true iff this QueryPlan would perform an unindexed scan. */ + bool willScanTable() const { return _idxNo < 0 && !_impossible; } + /** @return 'special' attribute of the plan, which was either set explicitly or generated from the index. */ + const string &special() const { return _special; } + + /** @return a new cursor based on this QueryPlan's index and FieldRangeSet. */ + shared_ptr<Cursor> newCursor( const DiskLoc &startLoc = DiskLoc() , int numWanted=0 ) const; + /** @return a new reverse cursor if this is an unindexed plan. */ + shared_ptr<Cursor> newReverseCursor() const; + /** Register this plan as a winner for its QueryPattern, with specified 'nscanned'. */ + void registerSelf( long long nScanned ) const; + + int direction() const { return _direction; } + BSONObj indexKey() const; + bool indexed() const { return _index; } + int idxNo() const { return _idxNo; } + const char *ns() const { return _frs.ns(); } + NamespaceDetails *nsd() const { return _d; } + BSONObj originalQuery() const { return _originalQuery; } + BSONObj simplifiedQuery( const BSONObj& fields = BSONObj() ) const { return _frs.simplifiedQuery( fields ); } + const FieldRange &range( const char *fieldName ) const { return _frs.range( fieldName ); } + shared_ptr<FieldRangeVector> originalFrv() const { return _originalFrv; } + + const FieldRangeSet &multikeyFrs() const { return _frsMulti; } + + bool mustAssertOnYieldFailure() const { return _mustAssertOnYieldFailure; } + + /** The following member functions are just for testing. */ + + shared_ptr<FieldRangeVector> frv() const { return _frv; } + bool isMultiKey() const; + + private: + NamespaceDetails * _d; + int _idxNo; + const FieldRangeSet &_frs; + const FieldRangeSet &_frsMulti; + const BSONObj &_originalQuery; + const BSONObj &_order; + const IndexDetails * _index; + bool _optimal; + bool _scanAndOrderRequired; + bool _exactKeyMatch; + int _direction; + shared_ptr<FieldRangeVector> _frv; + shared_ptr<FieldRangeVector> _originalFrv; + BSONObj _startKey; + BSONObj _endKey; + bool _endKeyInclusive; + bool _unhelpful; + bool _impossible; + string _special; + IndexType * _type; + bool _startOrEndSpec; + bool _mustAssertOnYieldFailure; + }; + + /** + * Inherit from this interface to implement a new query operation. + * The query optimizer will clone the QueryOp that is provided, giving + * each clone its own query plan. + * + * Normal sequence of events: + * 1) A new QueryOp is generated using createChild(). + * 2) A QueryPlan is assigned to this QueryOp with setQueryPlan(). + * 3) _init() is called on the QueryPlan. + * 4) next() is called repeatedly, with nscanned() checked after each call. + * 5) In one of these calls to next(), setComplete() is called. + * 6) The QueryPattern for the QueryPlan may be recorded as a winner. + */ + class QueryOp { + public: + QueryOp() : _complete(), _stopRequested(), _qp(), _error() {} + + /** Used when handing off from one QueryOp to another. */ + QueryOp( const QueryOp &other ) : + _complete(), _stopRequested(), _qp(), _error(), _matcher( other._matcher ), + _orConstraint( other._orConstraint ) {} + + virtual ~QueryOp() {} + + /** @return QueryPlan assigned to this QueryOp by the query optimizer. */ + const QueryPlan &qp() const { return *_qp; } + + /** Advance to next potential matching document (eg using a cursor). */ + virtual void next() = 0; + /** + * @return current 'nscanned' metric for this QueryOp. Used to compare + * cost to other QueryOps. 
+         */
+        virtual long long nscanned() = 0;
+        /** Take any steps necessary before the db mutex is yielded. */
+        virtual bool prepareToYield() { massert( 13335, "yield not supported", false ); return false; }
+        /** Recover once the db mutex is regained. */
+        virtual void recoverFromYield() { massert( 13336, "yield not supported", false ); }
+
+        /**
+         * @return true iff the QueryPlan for this QueryOp may be registered
+         * as a winning plan.
+         */
+        virtual bool mayRecordPlan() const = 0;
+
+        /** @return true iff the implementation called setComplete() or setStop(). */
+        bool complete() const { return _complete; }
+        /** @return true iff the implementation called setStop(). */
+        bool stopRequested() const { return _stopRequested; }
+        /** @return true iff the implementation threw an exception. */
+        bool error() const { return _error; }
+        /** @return the exception thrown by implementation if one was thrown. */
+        ExceptionInfo exception() const { return _exception; }
+
+        /** To be called by QueryPlanSet::Runner only. */
+
+        QueryOp *createChild();
+        void setQueryPlan( const QueryPlan *qp ) { _qp = qp; assert( _qp != NULL ); }
+        void init();
+        void setException( const DBException &e ) {
+            _error = true;
+            _exception = e.getInfo();
+        }
+
+        shared_ptr<CoveredIndexMatcher> matcher( const shared_ptr<Cursor>& c ) const {
+            return matcher( c.get() );
+        }
+        shared_ptr<CoveredIndexMatcher> matcher( Cursor* c ) const {
+            if( ! c ) return _matcher;
+            return c->matcher() ? c->matcherPtr() : _matcher;
+        }
+
+    protected:
+        /** Call if all results have been found. */
+        void setComplete() {
+            _orConstraint = qp().originalFrv();
+            _complete = true;
+        }
+        /** Call if the scan is complete even if not all results have been found. */
+        void setStop() { setComplete(); _stopRequested = true; }
+
+        /** Handle initialization after a QueryPlan has been set. */
+        virtual void _init() = 0;
+
+        /** @return a copy of the inheriting class, which will be run with its own query plan. */
+        virtual QueryOp *_createChild() const = 0;
+
+        virtual bool alwaysUseRecord() const { return false; }
+
+    private:
+        bool _complete;
+        bool _stopRequested;
+        ExceptionInfo _exception;
+        const QueryPlan *_qp;
+        bool _error;
+        shared_ptr<CoveredIndexMatcher> _matcher;
+        shared_ptr<CoveredIndexMatcher> _oldMatcher;
+        shared_ptr<FieldRangeVector> _orConstraint;
+    };
+
+    // temp. this class works if T::operator< is variant unlike a regular stl priority queue.
+    // but it's very slow. however if v.size() is always very small, it would be fine,
+    // maybe even faster than a smart impl that does more memory allocations.
+    template<class T>
+    class our_priority_queue : boost::noncopyable {
+        vector<T> v;
+    public:
+        our_priority_queue() {
+            v.reserve(4);
+        }
+        int size() const { return v.size(); }
+        bool empty() const { return v.empty(); }
+        void push(const T & x) {
+            v.push_back(x);
+        }
+        T pop() {
+            size_t t = 0;
+            for( size_t i = 1; i < v.size(); i++ ) {
+                if( v[t] < v[i] )
+                    t = i;
+            }
+            T ret = v[t];
+            v.erase(v.begin()+t);
+            return ret;
+        }
+    };
+
+    /**
+     * A set of candidate query plans for a query. This class can return a best guess plan or run a
+     * QueryOp on all the plans.
+     */
+    class QueryPlanSet {
+    public:
+
+        typedef boost::shared_ptr<QueryPlan> QueryPlanPtr;
+        typedef vector<QueryPlanPtr> PlanSet;
+
+        /**
+         * @param originalFrsp - original constraints for this query clause; if null, frsp will be used.
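+         * (Added illustrative sketch, not from the original change; it mirrors the
+         * call in bestGuessCursor() in queryoptimizer.cpp:
+         *     auto_ptr<FieldRangeSetPair> frsp( new FieldRangeSetPair( ns, query, true ) );
+         *     QueryPlanSet qps( ns, frsp, auto_ptr<FieldRangeSetPair>(), query, sort, false );
+         *     QueryPlanSet::QueryPlanPtr best = qps.getBestGuess();
+         * where the trailing 'false' is mustAssertOnYieldFailure.)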
+ */ + QueryPlanSet( const char *ns, + auto_ptr<FieldRangeSetPair> frsp, + auto_ptr<FieldRangeSetPair> originalFrsp, + const BSONObj &originalQuery, + const BSONObj &order, + bool mustAssertOnYieldFailure = true, + const BSONElement *hint = 0, + bool honorRecordedPlan = true, + const BSONObj &min = BSONObj(), + const BSONObj &max = BSONObj(), + bool bestGuessOnly = false, + bool mayYield = false); + + /** @return number of candidate plans. */ + int nPlans() const { return _plans.size(); } + + /** + * Clone op for each query plan, and @return the first cloned op to call + * setComplete() or setStop(). + */ + + shared_ptr<QueryOp> runOp( QueryOp &op ); + template<class T> + shared_ptr<T> runOp( T &op ) { + return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) ); + } + + /** Initialize or iterate a runner generated from @param originalOp. */ + shared_ptr<QueryOp> nextOp( QueryOp &originalOp, bool retried = false ); + + /** Yield the runner member. */ + + bool prepareToYield(); + void recoverFromYield(); + + /** Clear the runner member. */ + void clearRunner(); + + QueryPlanPtr firstPlan() const { return _plans[ 0 ]; } + + /** @return metadata about cursors and index bounds for all plans, suitable for explain output. */ + BSONObj explain() const; + /** @return true iff a plan is selected based on previous success of this plan. */ + bool usingCachedPlan() const { return _usingCachedPlan; } + /** @return a single plan that may work well for the specified query. */ + QueryPlanPtr getBestGuess() const; + + //for testing + const FieldRangeSetPair &frsp() const { return *_frsp; } + const FieldRangeSetPair *originalFrsp() const { return _originalFrsp.get(); } + bool modifiedKeys() const; + bool hasMultiKey() const; + + private: + void addOtherPlans( bool checkFirst ); + void addPlan( QueryPlanPtr plan, bool checkFirst ) { + if ( checkFirst && plan->indexKey().woCompare( _plans[ 0 ]->indexKey() ) == 0 ) + return; + _plans.push_back( plan ); + } + void init(); + void addHint( IndexDetails &id ); + class Runner { + public: + Runner( QueryPlanSet &plans, QueryOp &op ); + + /** + * Iterate interactively through candidate documents on all plans. + * QueryOp objects are returned at each interleaved step. + */ + + /** @return a plan that has completed, otherwise an arbitrary plan. */ + shared_ptr<QueryOp> init(); + /** + * Move the Runner forward one iteration, and @return the plan for + * this iteration. + */ + shared_ptr<QueryOp> next(); + /** @return next non error op if there is one, otherwise an error op. */ + shared_ptr<QueryOp> nextNonError(); + + bool prepareToYield(); + void recoverFromYield(); + + /** Run until first op completes. 
*/ + shared_ptr<QueryOp> runUntilFirstCompletes(); + + void mayYield(); + QueryOp &_op; + QueryPlanSet &_plans; + static void initOp( QueryOp &op ); + static void nextOp( QueryOp &op ); + static bool prepareToYieldOp( QueryOp &op ); + static void recoverFromYieldOp( QueryOp &op ); + private: + vector<shared_ptr<QueryOp> > _ops; + struct OpHolder { + OpHolder( const shared_ptr<QueryOp> &op ) : _op( op ), _offset() {} + shared_ptr<QueryOp> _op; + long long _offset; + bool operator<( const OpHolder &other ) const { + return _op->nscanned() + _offset > other._op->nscanned() + other._offset; + } + }; + our_priority_queue<OpHolder> _queue; + }; + + const char *_ns; + BSONObj _originalQuery; + auto_ptr<FieldRangeSetPair> _frsp; + auto_ptr<FieldRangeSetPair> _originalFrsp; + PlanSet _plans; + bool _mayRecordPlan; + bool _usingCachedPlan; + BSONObj _hint; + BSONObj _order; + long long _oldNScanned; + bool _honorRecordedPlan; + BSONObj _min; + BSONObj _max; + string _special; + bool _bestGuessOnly; + bool _mayYield; + ElapsedTracker _yieldSometimesTracker; + shared_ptr<Runner> _runner; + bool _mustAssertOnYieldFailure; + }; + + /** Handles $or type queries by generating a QueryPlanSet for each $or clause. */ + class MultiPlanScanner { + public: + MultiPlanScanner( const char *ns, + const BSONObj &query, + const BSONObj &order, + const BSONElement *hint = 0, + bool honorRecordedPlan = true, + const BSONObj &min = BSONObj(), + const BSONObj &max = BSONObj(), + bool bestGuessOnly = false, + bool mayYield = false); + + /** + * Clone op for each query plan of a single $or clause, and @return the first cloned op + * to call setComplete() or setStop(). + */ + + shared_ptr<QueryOp> runOpOnce( QueryOp &op ); + template<class T> + shared_ptr<T> runOpOnce( T &op ) { + return dynamic_pointer_cast<T>( runOpOnce( static_cast<QueryOp&>( op ) ) ); + } + + /** + * For each $or clause, calls runOpOnce on the child QueryOp cloned from the winning QueryOp + * of the previous $or clause (or from the supplied 'op' for the first $or clause). + */ + + shared_ptr<QueryOp> runOp( QueryOp &op ); + template<class T> + shared_ptr<T> runOp( T &op ) { + return dynamic_pointer_cast<T>( runOp( static_cast<QueryOp&>( op ) ) ); + } + + /** Initialize or iterate a runner generated from @param originalOp. */ + + void initialOp( const shared_ptr<QueryOp> &originalOp ) { _baseOp = originalOp; } + shared_ptr<QueryOp> nextOp(); + + /** Yield the runner member. */ + + bool prepareToYield(); + void recoverFromYield(); + + /** Clear the runner member. */ + void clearRunner(); + + int currentNPlans() const; + + /** + * @return a single simple cursor if the scanner would run a single cursor + * for this query, otherwise return an empty shared_ptr. + */ + shared_ptr<Cursor> singleCursor() const; + + /** + * @return the query plan that would be used if the scanner would run a single + * cursor for this query, otherwise 0. The returned plan is invalid if this + * MultiPlanScanner is destroyed, hence we return a raw pointer. + */ + const QueryPlan *singlePlan() const; + + /** @return true iff more $or clauses need to be scanned. */ + bool mayRunMore() const { return _or ? ( !_tableScanned && !_org->orFinished() ) : _i == 0; } + /** @return non-$or version of explain output. */ + BSONObj oldExplain() const { assertNotOr(); return _currentQps->explain(); } + /** @return true iff this is not a $or query and a plan is selected based on previous success of this plan. 
*/ + bool usingCachedPlan() const { return !_or && _currentQps->usingCachedPlan(); } + /** Don't attempt to scan multiple plans, just use the best guess. */ + void setBestGuessOnly() { _bestGuessOnly = true; } + /** Yielding is allowed while running each QueryPlan. */ + void mayYield( bool val ) { _mayYield = val; } + bool modifiedKeys() const { return _currentQps->modifiedKeys(); } + bool hasMultiKey() const { return _currentQps->hasMultiKey(); } + + private: + void assertNotOr() const { + massert( 13266, "not implemented for $or query", !_or ); + } + void assertMayRunMore() const { + massert( 13271, "can't run more ops", mayRunMore() ); + } + shared_ptr<QueryOp> nextOpBeginningClause(); + shared_ptr<QueryOp> nextOpHandleEndOfClause(); + bool uselessOr( const BSONElement &hint ) const; + const char * _ns; + bool _or; + BSONObj _query; + shared_ptr<OrRangeGenerator> _org; // May be null in certain non $or query cases. + auto_ptr<QueryPlanSet> _currentQps; + int _i; + bool _honorRecordedPlan; + bool _bestGuessOnly; + BSONObj _hint; + bool _mayYield; + bool _tableScanned; + shared_ptr<QueryOp> _baseOp; + }; + + /** Provides a cursor interface for certain limited uses of a MultiPlanScanner. */ + class MultiCursor : public Cursor { + public: + class CursorOp : public QueryOp { + public: + CursorOp() {} + CursorOp( const QueryOp &other ) : QueryOp( other ) {} + virtual shared_ptr<Cursor> newCursor() const = 0; + }; + /** takes ownership of 'op' */ + MultiCursor( const char *ns, const BSONObj &pattern, const BSONObj &order, shared_ptr<CursorOp> op = shared_ptr<CursorOp>(), bool mayYield = false ); + /** + * Used + * 1. To handoff a query to a getMore() + * 2. To handoff a QueryOptimizerCursor + * @param nscanned is an optional initial value, if not supplied nscanned() + * will always return -1 + */ + MultiCursor( auto_ptr<MultiPlanScanner> mps, const shared_ptr<Cursor> &c, const shared_ptr<CoveredIndexMatcher> &matcher, const QueryOp &op, long long nscanned = -1 ); + + virtual bool ok() { return _c->ok(); } + virtual Record* _current() { return _c->_current(); } + virtual BSONObj current() { return _c->current(); } + virtual DiskLoc currLoc() { return _c->currLoc(); } + virtual bool advance() { + _c->advance(); + while( !ok() && _mps->mayRunMore() ) { + nextClause(); + } + return ok(); + } + virtual BSONObj currKey() const { return _c->currKey(); } + virtual DiskLoc refLoc() { return _c->refLoc(); } + virtual void noteLocation() { _c->noteLocation(); } + virtual void checkLocation() { _c->checkLocation(); } + virtual bool supportGetMore() { return true; } + virtual bool supportYields() { return _c->supportYields(); } + virtual BSONObj indexKeyPattern() { return _c->indexKeyPattern(); } + + /** + * with update we could potentially get the same document on multiple + * indexes, but update appears to already handle this with seenObjects + * so we don't have to do anything special here. + */ + virtual bool getsetdup(DiskLoc loc) { return _c->getsetdup( loc ); } + + virtual bool autoDedup() const { return _c->autoDedup(); } + + virtual bool modifiedKeys() const { return _mps->modifiedKeys(); } + + virtual bool isMultiKey() const { return _mps->hasMultiKey(); } + + virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; } + virtual CoveredIndexMatcher* matcher() const { return _matcher.get(); } + + virtual bool capped() const { return _c->capped(); } + + /** return -1 if we're a getmore handoff */ + virtual long long nscanned() { return _nscanned >= 0 ? 
_nscanned + _c->nscanned() : _nscanned; } + /** just for testing */ + shared_ptr<Cursor> sub_c() const { return _c; } + private: + class NoOp : public CursorOp { + public: + NoOp() {} + NoOp( const QueryOp &other ) : CursorOp( other ) {} + virtual void _init() { setComplete(); } + virtual void next() {} + virtual bool mayRecordPlan() const { return false; } + virtual QueryOp *_createChild() const { return new NoOp(); } + virtual shared_ptr<Cursor> newCursor() const { return qp().newCursor(); } + virtual long long nscanned() { assert( false ); return 0; } + }; + void nextClause(); + shared_ptr<CursorOp> _op; + shared_ptr<Cursor> _c; + auto_ptr<MultiPlanScanner> _mps; + shared_ptr<CoveredIndexMatcher> _matcher; + long long _nscanned; + }; + + /** NOTE min, max, and keyPattern will be updated to be consistent with the selected index. */ + IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ); + + bool isSimpleIdQuery( const BSONObj& query ); + + /** + * @return a single cursor that may work well for the given query. + * It is possible no cursor is returned if the sort is not supported by an index. Clients are responsible + * for checking this if they are not sure an index for a sort exists, and defaulting to a non-sort if + * no suitable indices exist. + */ + shared_ptr<Cursor> bestGuessCursor( const char *ns, const BSONObj &query, const BSONObj &sort ); + + /** + * Add-on functionality for queryutil classes requiring access to indexing + * functionality not currently linked to mongos. + * TODO Clean this up a bit, possibly with separate sharded and non sharded + * implementations for the appropriate queryutil classes or by pulling index + * related functionality into separate wrapper classes. + */ + struct QueryUtilIndexed { + /** @return true if the index may be useful according to its KeySpec. */ + static bool indexUseful( const FieldRangeSetPair &frsp, NamespaceDetails *d, int idxNo, const BSONObj &order ); + /** Clear any indexes recorded as the best for either the single or multi key pattern. */ + static void clearIndexesForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ); + /** Return a recorded best index for the single or multi key pattern. */ + static pair< BSONObj, long long > bestIndexForPatterns( const FieldRangeSetPair &frsp, const BSONObj &order ); + static bool uselessOr( const OrRangeGenerator& org, NamespaceDetails *d, int hintIdx ); + }; + +} // namespace mongo diff --git a/src/mongo/db/queryoptimizercursor.cpp b/src/mongo/db/queryoptimizercursor.cpp new file mode 100644 index 00000000000..07f8df12815 --- /dev/null +++ b/src/mongo/db/queryoptimizercursor.cpp @@ -0,0 +1,530 @@ +// @file queryoptimizercursor.cpp + +/** + * Copyright (C) 2011 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "pch.h" +#include "queryoptimizer.h" +#include "pdfile.h" +#include "clientcursor.h" +#include "btree.h" +#include "queryoptimizercursor.h" + +namespace mongo { + + static const int OutOfOrderDocumentsAssertionCode = 14810; + + /** + * A QueryOp implementation utilized by the QueryOptimizerCursor + */ + class QueryOptimizerCursorOp : public QueryOp { + public: + /** + * @param aggregateNscanned - shared long long counting total nscanned for + * query ops for all cursors. + * @param requireIndex - if unindexed scans should be prohibited. + */ + QueryOptimizerCursorOp( long long &aggregateNscanned, bool requireIndex, int cumulativeCount = 0 ) : _matchCounter( aggregateNscanned, cumulativeCount ), _countingMatches(), _mustAdvance(), _capped(), _yieldRecoveryFailed(), _requireIndex( requireIndex ) {} + + virtual void _init() { + if ( qp().scanAndOrderRequired() ) { + throw MsgAssertionException( OutOfOrderDocumentsAssertionCode, "order spec cannot be satisfied with index" ); + } + if ( _requireIndex && strcmp( qp().indexKey().firstElementFieldName(), "$natural" ) == 0 ) { + throw MsgAssertionException( 9011, "Not an index cursor" ); + } + _c = qp().newCursor(); + + // The QueryOptimizerCursor::prepareToTouchEarlierIterate() implementation requires _c->prepareToYield() to work. + verify( 15940, _c->supportYields() ); + _capped = _c->capped(); + + // TODO This violates the current Cursor interface abstraction, but for now it's simpler to keep our own set of + // dups rather than avoid poisoning the cursor's dup set with unreturned documents. Deduping documents + // matched in this QueryOptimizerCursorOp will run against the takeover cursor. + _matchCounter.setCheckDups( _c->isMultiKey() ); + + _matchCounter.updateNscanned( _c->nscanned() ); + } + + virtual long long nscanned() { + return _c ? _c->nscanned() : _matchCounter.nscanned(); + } + + virtual bool prepareToYield() { + if ( _c && !_cc ) { + _cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , _c , qp().ns() ) ); + } + if ( _cc ) { + recordCursorLocation(); + return _cc->prepareToYield( _yieldData ); + } + // no active cursor - ok to yield + return true; + } + + virtual void recoverFromYield() { + if ( _cc && !ClientCursor::recoverFromYield( _yieldData ) ) { + _yieldRecoveryFailed = true; + _c.reset(); + _cc.reset(); + + if ( _capped ) { + msgassertedNoTrace( 13338, str::stream() << "capped cursor overrun: " << qp().ns() ); + } + else if ( qp().mustAssertOnYieldFailure() ) { + msgassertedNoTrace( 15892, str::stream() << "QueryOptimizerCursorOp::recoverFromYield() failed to recover" ); + } + else { + // we don't fail query since we're fine with returning partial data if collection dropped + // also, see SERVER-2454 + } + } + else { + checkCursorAdvanced(); + } + } + + void prepareToTouchEarlierIterate() { + recordCursorLocation(); + if ( _c ) { + _c->prepareToTouchEarlierIterate(); + } + } + + void recoverFromTouchingEarlierIterate() { + if ( _c ) { + _c->recoverFromTouchingEarlierIterate(); + } + checkCursorAdvanced(); + } + + virtual void next() { + mayAdvance(); + + if ( _matchCounter.enoughCumulativeMatchesToChooseAPlan() ) { + setStop(); + return; + } + if ( !_c || !_c->ok() ) { + setComplete(); + return; + } + + _mustAdvance = true; + } + virtual QueryOp *_createChild() const { + return new QueryOptimizerCursorOp( _matchCounter.aggregateNscanned(), _requireIndex, _matchCounter.cumulativeCount() ); + } + DiskLoc currLoc() const { return _c ? _c->currLoc() : DiskLoc(); } + BSONObj currKey() const { return _c ? 
_c->currKey() : BSONObj(); } + bool currentMatches( MatchDetails *details ) { + bool ret = ( _c && _c->ok() ) ? matcher( _c.get() )->matchesCurrent( _c.get(), details ) : false; + // Cache the match, so we can count it in mayAdvance(). + _matchCounter.setMatch( ret ); + return ret; + } + virtual bool mayRecordPlan() const { + return !_yieldRecoveryFailed && complete() && ( !stopRequested() || _matchCounter.enoughMatchesToRecordPlan() ); + } + shared_ptr<Cursor> cursor() const { return _c; } + private: + void mayAdvance() { + if ( !_c ) { + return; + } + if ( countingMatches() ) { + // Check match if not yet known. + if ( !_matchCounter.knowMatch() ) { + currentMatches( 0 ); + } + _matchCounter.countMatch( currLoc() ); + } + if ( _mustAdvance ) { + _c->advance(); + handleCursorAdvanced(); + } + _matchCounter.updateNscanned( _c->nscanned() ); + } + // Don't count matches on the first call to next(), which occurs before the first result is returned. + bool countingMatches() { + if ( _countingMatches ) { + return true; + } + _countingMatches = true; + return false; + } + + void recordCursorLocation() { + _posBeforeYield = currLoc(); + } + void checkCursorAdvanced() { + // This check will not correctly determine if we are looking at a different document in + // all cases, but it is adequate for updating the query plan's match count (just used to pick + // plans, not returned to the client) and adjust iteration via _mustAdvance. + if ( _posBeforeYield != currLoc() ) { + // If the yield advanced our position, the next next() will be a no op. + handleCursorAdvanced(); + } + } + void handleCursorAdvanced() { + _mustAdvance = false; + _matchCounter.resetMatch(); + } + + CachedMatchCounter _matchCounter; + bool _countingMatches; + bool _mustAdvance; + bool _capped; + shared_ptr<Cursor> _c; + ClientCursor::CleanupPointer _cc; + DiskLoc _posBeforeYield; + ClientCursor::YieldData _yieldData; + bool _yieldRecoveryFailed; + bool _requireIndex; + }; + + /** + * This cursor runs a MultiPlanScanner iteratively and returns results from + * the scanner's cursors as they become available. Once the scanner chooses + * a single plan, this cursor becomes a simple wrapper around that single + * plan's cursor (called the 'takeover' cursor). + */ + class QueryOptimizerCursor : public Cursor { + public: + QueryOptimizerCursor( auto_ptr<MultiPlanScanner> &mps, bool requireIndex ) : + _mps( mps ), + _originalOp( new QueryOptimizerCursorOp( _nscanned, requireIndex ) ), + _currOp(), + _nscanned() { + _mps->initialOp( _originalOp ); + shared_ptr<QueryOp> op = _mps->nextOp(); + rethrowOnError( op ); + if ( !op->complete() ) { + _currOp = dynamic_cast<QueryOptimizerCursorOp*>( op.get() ); + } + } + + virtual bool ok() { return _takeover ? _takeover->ok() : !currLoc().isNull(); } + + virtual Record* _current() { + if ( _takeover ) { + return _takeover->_current(); + } + assertOk(); + return currLoc().rec(); + } + + virtual BSONObj current() { + if ( _takeover ) { + return _takeover->current(); + } + assertOk(); + return currLoc().obj(); + } + + virtual DiskLoc currLoc() { return _takeover ? _takeover->currLoc() : _currLoc(); } + + DiskLoc _currLoc() const { + dassert( !_takeover ); + return _currOp ? 
_currOp->currLoc() : DiskLoc();
+        }
+
+        virtual bool advance() {
+            return _advance( false );
+        }
+
+        virtual BSONObj currKey() const {
+            if ( _takeover ) {
+                return _takeover->currKey();
+            }
+            assertOk();
+            return _currOp->currKey();
+        }
+
+        /**
+         * When return value isNull(), our cursor will be ignored for yielding by the client cursor implementation.
+         * In such cases, an internal ClientCursor will update the position of component cursors when necessary.
+         */
+        virtual DiskLoc refLoc() { return _takeover ? _takeover->refLoc() : DiskLoc(); }
+
+        virtual BSONObj indexKeyPattern() {
+            if ( _takeover ) {
+                return _takeover->indexKeyPattern();
+            }
+            assertOk();
+            return _currOp->cursor()->indexKeyPattern();
+        }
+
+        virtual bool supportGetMore() { return false; }
+
+        virtual bool supportYields() { return _takeover ? _takeover->supportYields() : true; }
+
+        virtual void prepareToTouchEarlierIterate() {
+            if ( _takeover ) {
+                _takeover->prepareToTouchEarlierIterate();
+            }
+            else if ( _currOp ) {
+                if ( _mps->currentNPlans() == 1 ) {
+                    // This single plan version is a bit more performant, so we use it when possible.
+                    _currOp->prepareToTouchEarlierIterate();
+                }
+                else {
+                    // With multiple plans, the 'earlier iterate' could be the current iterate of one of
+                    // the component plans. We do a full yield of all plans, using ClientCursors.
+                    verify( 15941, _mps->prepareToYield() );
+                }
+            }
+        }
+
+        virtual void recoverFromTouchingEarlierIterate() {
+            if ( _takeover ) {
+                _takeover->recoverFromTouchingEarlierIterate();
+            }
+            else if ( _currOp ) {
+                if ( _mps->currentNPlans() == 1 ) {
+                    _currOp->recoverFromTouchingEarlierIterate();
+                }
+                else {
+                    recoverFromYield();
+                }
+            }
+        }
+
+        virtual bool prepareToYield() {
+            if ( _takeover ) {
+                return _takeover->prepareToYield();
+            }
+            else if ( _currOp ) {
+                return _mps->prepareToYield();
+            }
+            else {
+                // No state needs to be protected, so yielding is fine.
+                return true;
+            }
+        }
+
+        virtual void recoverFromYield() {
+            if ( _takeover ) {
+                _takeover->recoverFromYield();
+                return;
+            }
+            if ( _currOp ) {
+                _mps->recoverFromYield();
+                if ( _currOp->error() || !ok() ) {
+                    // Advance to a non error op if one of the ops errored out.
+                    // Advance to a following $or clause if the $or clause returned all results.
+                    _advance( true );
+                }
+            }
+        }
+
+        virtual string toString() { return "QueryOptimizerCursor"; }
+
+        virtual bool getsetdup(DiskLoc loc) {
+            if ( _takeover ) {
+                if ( getdupInternal( loc ) ) {
+                    return true;
+                }
+                return _takeover->getsetdup( loc );
+            }
+            assertOk();
+            return getsetdupInternal( loc );
+        }
+
+        /** Matcher needs to know if the cursor being forwarded to is multikey. */
+        virtual bool isMultiKey() const {
+            if ( _takeover ) {
+                return _takeover->isMultiKey();
+            }
+            assertOk();
+            return _currOp->cursor()->isMultiKey();
+        }
+
+        virtual bool modifiedKeys() const { return true; }
+
+        /** Initial capped wrapping cases (before takeover) are handled internally by a component ClientCursor. */
+        virtual bool capped() const { return _takeover ? _takeover->capped() : false; }
+
+        virtual long long nscanned() { return _takeover ? 
_takeover->nscanned() : _nscanned; } + + virtual shared_ptr<CoveredIndexMatcher> matcherPtr() const { + if ( _takeover ) { + return _takeover->matcherPtr(); + } + assertOk(); + return _currOp->matcher( _currOp->cursor() ); + } + + virtual CoveredIndexMatcher* matcher() const { + if ( _takeover ) { + return _takeover->matcher(); + } + assertOk(); + return _currOp->matcher( _currOp->cursor() ).get(); + } + + virtual bool currentMatches( MatchDetails *details = 0 ) { + if ( _takeover ) { + return _takeover->currentMatches( details ); + } + assertOk(); + return _currOp->currentMatches( details ); + } + + private: + /** + * Advances the QueryPlanSet::Runner. + * @param force - advance even if the current query op is not valid. The 'force' param should only be specified + * when there are plans left in the runner. + */ + bool _advance( bool force ) { + if ( _takeover ) { + return _takeover->advance(); + } + + if ( !force && !ok() ) { + return false; + } + + DiskLoc prevLoc = _currLoc(); + + _currOp = 0; + shared_ptr<QueryOp> op = _mps->nextOp(); + rethrowOnError( op ); + + // Avoiding dynamic_cast here for performance. Soon we won't need to + // do a cast at all. + QueryOptimizerCursorOp *qocop = (QueryOptimizerCursorOp*)( op.get() ); + + if ( !op->complete() ) { + // The 'qocop' will be valid until we call _mps->nextOp() again. We return 'current' values from this op. + _currOp = qocop; + } + else if ( op->stopRequested() ) { + if ( qocop->cursor() ) { + // Ensure that prepareToTouchEarlierIterate() may be called safely when a BasicCursor takes over. + if ( !prevLoc.isNull() && prevLoc == qocop->currLoc() ) { + qocop->cursor()->advance(); + } + // Clear the Runner and any unnecessary QueryOps and their ClientCursors. + _mps->clearRunner(); + _takeover.reset( new MultiCursor( _mps, + qocop->cursor(), + op->matcher( qocop->cursor() ), + *op, + _nscanned - qocop->cursor()->nscanned() ) ); + } + } + + return ok(); + } + /** Forward an exception when the runner errs out. */ + void rethrowOnError( const shared_ptr< QueryOp > &op ) { + if ( op->error() ) { + throw MsgAssertionException( op->exception() ); + } + } + + void assertOk() const { + massert( 14809, "Invalid access for cursor that is not ok()", !_currLoc().isNull() ); + } + + /** Insert and check for dups before takeover occurs */ + bool getsetdupInternal(const DiskLoc &loc) { + return _dups.getsetdup( loc ); + } + + /** Just check for dups - after takeover occurs */ + bool getdupInternal(const DiskLoc &loc) { + dassert( _takeover ); + return _dups.getdup( loc ); + } + + auto_ptr<MultiPlanScanner> _mps; + shared_ptr<QueryOptimizerCursorOp> _originalOp; + QueryOptimizerCursorOp *_currOp; + shared_ptr<Cursor> _takeover; + long long _nscanned; + // Using a SmallDupSet seems a bit hokey, but I've measured a 5% performance improvement with ~100 document non multi key scans. + SmallDupSet _dups; + }; + + shared_ptr<Cursor> newQueryOptimizerCursor( auto_ptr<MultiPlanScanner> mps, bool requireIndex ) { + try { + return shared_ptr<Cursor>( new QueryOptimizerCursor( mps, requireIndex ) ); + } catch( const AssertionException &e ) { + if ( e.getCode() == OutOfOrderDocumentsAssertionCode ) { + // If no indexes follow the requested sort order, return an + // empty pointer. This is legacy behavior based on bestGuessCursor(). 
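+            // Added commentary: callers must therefore tolerate an empty pointer
+            // from this factory, e.g. (illustrative only):
+            //     shared_ptr<Cursor> c = newQueryOptimizerCursor( mps, requireIndex );
+            //     if ( !c.get() ) { /* no index can provide the requested order */ }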
+ return shared_ptr<Cursor>(); + } + throw; + } + return shared_ptr<Cursor>(); + } + + shared_ptr<Cursor> NamespaceDetailsTransient::getCursor( const char *ns, const BSONObj &query, + const BSONObj &order, bool requireIndex, + bool *simpleEqualityMatch ) { + if ( simpleEqualityMatch ) { + *simpleEqualityMatch = false; + } + if ( query.isEmpty() && order.isEmpty() && !requireIndex ) { + // TODO This will not use a covered index currently. + return theDataFileMgr.findAll( ns ); + } + if ( isSimpleIdQuery( query ) ) { + Database *database = cc().database(); + verify( 15985, database ); + NamespaceDetails *d = database->namespaceIndex.details(ns); + if ( d ) { + int idxNo = d->findIdIndex(); + if ( idxNo >= 0 ) { + IndexDetails& i = d->idx( idxNo ); + BSONObj key = i.getKeyFromQuery( query ); + return shared_ptr<Cursor>( BtreeCursor::make( d, idxNo, i, key, key, true, 1 ) ); + } + } + } + auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false + shared_ptr<Cursor> single = mps->singleCursor(); + if ( single ) { + if ( !( requireIndex && + dynamic_cast<BasicCursor*>( single.get() ) /* May not use an unindexed cursor */ ) ) { + if ( !query.isEmpty() && !single->matcher() ) { + shared_ptr<CoveredIndexMatcher> matcher( new CoveredIndexMatcher( query, single->indexKeyPattern() ) ); + single->setMatcher( matcher ); + } + if ( simpleEqualityMatch ) { + const QueryPlan *qp = mps->singlePlan(); + if ( qp->exactKeyMatch() && !single->matcher()->needRecord() ) { + *simpleEqualityMatch = true; + } + } + return single; + } + } + return newQueryOptimizerCursor( mps, requireIndex ); + } + + /** This interface is available just for testing. */ + shared_ptr<Cursor> newQueryOptimizerCursor( const char *ns, const BSONObj &query, const BSONObj &order, bool requireIndex ) { + auto_ptr<MultiPlanScanner> mps( new MultiPlanScanner( ns, query, order ) ); // mayYield == false + return newQueryOptimizerCursor( mps, requireIndex ); + } + +} // namespace mongo diff --git a/src/mongo/db/queryoptimizercursor.h b/src/mongo/db/queryoptimizercursor.h new file mode 100644 index 00000000000..ee5a1663370 --- /dev/null +++ b/src/mongo/db/queryoptimizercursor.h @@ -0,0 +1,150 @@ +// @file queryoptimizercursor.h + +/** + * Copyright (C) 2011 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +namespace mongo { + + /** Helper class for caching and counting matches during execution of a QueryPlan. */ + class CachedMatchCounter { + public: + /** + * @param aggregateNscanned - shared count of nscanned for this and other plans. + * @param cumulativeCount - starting point for accumulated count over a series of plans. + */ + CachedMatchCounter( long long &aggregateNscanned, int cumulativeCount ) : _aggregateNscanned( aggregateNscanned ), _nscanned(), _cumulativeCount( cumulativeCount ), _count(), _checkDups(), _match( Unknown ), _counted() {} + + /** Set whether dup checking is enabled when counting.
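+ * For example (a sketch): a multikey index scan may visit one document under + * several different keys, so the runner enables dup checking and countMatch() + * then counts each DiskLoc at most once.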
*/ + void setCheckDups( bool checkDups ) { _checkDups = checkDups; } + + /** + * Usual sequence of events: + * 1) resetMatch() - reset stored match value to Unknown. + * 2) setMatch() - set match value to a definite true/false value. + * 3) knowMatch() - check if setMatch() has been called. + * 4) countMatch() - increment count if match is true. + */ + + void resetMatch() { + _match = Unknown; + _counted = false; + } + void setMatch( bool match ) { _match = match ? True : False; } + bool knowMatch() const { return _match != Unknown; } + void countMatch( const DiskLoc &loc ) { + if ( !_counted && _match == True && !getsetdup( loc ) ) { + ++_cumulativeCount; + ++_count; + _counted = true; + } + } + + bool enoughCumulativeMatchesToChooseAPlan() const { + // This is equivalent to the default condition for switching from + // a query to a getMore, which was the historical default match count for + // choosing a plan. + return _cumulativeCount >= 101; + } + bool enoughMatchesToRecordPlan() const { + // Recording after 50 matches is a historical default (101 default limit / 2). + return _count > 50; + } + + int cumulativeCount() const { return _cumulativeCount; } + int count() const { return _count; } + + /** Update local and aggregate nscanned counts. */ + void updateNscanned( long long nscanned ) { + _aggregateNscanned += ( nscanned - _nscanned ); + _nscanned = nscanned; + } + long long nscanned() const { return _nscanned; } + long long &aggregateNscanned() const { return _aggregateNscanned; } + private: + bool getsetdup( const DiskLoc &loc ) { + if ( !_checkDups ) { + return false; + } + pair<set<DiskLoc>::iterator, bool> p = _dups.insert( loc ); + return !p.second; + } + long long &_aggregateNscanned; + long long _nscanned; + int _cumulativeCount; + int _count; + bool _checkDups; + enum MatchState { Unknown, False, True }; + MatchState _match; + bool _counted; + set<DiskLoc> _dups; + }; + + /** Dup tracking class, optimizing one common case with small set and few initial reads. */ + class SmallDupSet { + public: + SmallDupSet() : _accesses() { + _vec.reserve( 250 ); + } + /** @return true if @param 'loc' already added to the set, false if adding to the set in this call. */ + bool getsetdup( const DiskLoc &loc ) { + access(); + return vec() ? getsetdupVec( loc ) : getsetdupSet( loc ); + } + /** @return true when @param loc in the set. */ + bool getdup( const DiskLoc &loc ) { + access(); + return vec() ?
getdupVec( loc ) : getdupSet( loc ); + } + private: + void access() { + ++_accesses; + mayUpgrade(); + } + void mayUpgrade() { + if ( vec() && _accesses > 500 ) { + _set.insert( _vec.begin(), _vec.end() ); + } + } + bool vec() const { + return _set.size() == 0; + } + bool getsetdupVec( const DiskLoc &loc ) { + if ( getdupVec( loc ) ) { + return true; + } + _vec.push_back( loc ); + return false; + } + bool getdupVec( const DiskLoc &loc ) const { + for( vector<DiskLoc>::const_iterator i = _vec.begin(); i != _vec.end(); ++i ) { + if ( *i == loc ) { + return true; + } + } + return false; + } + bool getsetdupSet( const DiskLoc &loc ) { + pair<set<DiskLoc>::iterator, bool> p = _set.insert(loc); + return !p.second; + } + bool getdupSet( const DiskLoc &loc ) { + return _set.count( loc ) > 0; + } + vector<DiskLoc> _vec; + set<DiskLoc> _set; + long long _accesses; + }; +} // namespace mongo diff --git a/src/mongo/db/querypattern.cpp b/src/mongo/db/querypattern.cpp new file mode 100644 index 00000000000..e20e2b6a6ae --- /dev/null +++ b/src/mongo/db/querypattern.cpp @@ -0,0 +1,99 @@ +// @file querypattern.cpp - Query pattern matching for selecting similar plans given similar queries. + +/* Copyright 2011 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "querypattern.h" + +namespace mongo { + + QueryPattern::QueryPattern( const FieldRangeSet &frs, const BSONObj &sort ) { + for( map<string,FieldRange>::const_iterator i = frs.ranges().begin(); i != frs.ranges().end(); ++i ) { + if ( i->second.equality() ) { + _fieldTypes[ i->first ] = QueryPattern::Equality; + } + else if ( i->second.empty() ) { + // This case generally results from an upper and lower bound that are inconsistent for a single key index. 
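+ // For example (hypothetical query): {a: {$gt: 5, $lt: 3}} yields an empty + // range on a single key index and is still classified UpperAndLowerBound, + // so it patterns like any other two-sided range.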
+ _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound; + } + else if ( i->second.nontrivial() ) { + bool upper = i->second.max().type() != MaxKey; + bool lower = i->second.min().type() != MinKey; + if ( upper && lower ) + _fieldTypes[ i->first ] = QueryPattern::UpperAndLowerBound; + else if ( upper ) + _fieldTypes[ i->first ] = QueryPattern::UpperBound; + else if ( lower ) + _fieldTypes[ i->first ] = QueryPattern::LowerBound; + } + } + setSort( sort ); + } + + /** for testing only - speed unimportant */ + bool QueryPattern::operator==( const QueryPattern &other ) const { + bool less = operator<( other ); + bool more = other.operator<( *this ); + assert( !( less && more ) ); + return !( less || more ); + } + + /** for testing only - speed unimportant */ + bool QueryPattern::operator!=( const QueryPattern &other ) const { + return !operator==( other ); + } + + string typeToString( enum QueryPattern::Type t ) { + switch (t) { + case QueryPattern::Equality: + return "Equality"; + case QueryPattern::LowerBound: + return "LowerBound"; + case QueryPattern::UpperBound: + return "UpperBound"; + case QueryPattern::UpperAndLowerBound: + return "UpperAndLowerBound"; + } + return ""; + } + + string QueryPattern::toString() const { + BSONObjBuilder b; + for( map<string,Type>::const_iterator i = _fieldTypes.begin(); i != _fieldTypes.end(); ++i ) { + b << i->first << typeToString( i->second ); + } + return BSON( "query" << b.done() << "sort" << _sort ).toString(); + } + + void QueryPattern::setSort( const BSONObj sort ) { + _sort = normalizeSort( sort ); + } + + BSONObj QueryPattern::normalizeSort( const BSONObj &spec ) { + if ( spec.isEmpty() ) + return spec; + int direction = ( spec.firstElement().number() >= 0 ) ? 1 : -1; + BSONObjIterator i( spec ); + BSONObjBuilder b; + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + b.append( e.fieldName(), direction * ( ( e.number() >= 0 ) ? 1 : -1 ) ); + } + return b.obj(); + } + +} // namespace mongo diff --git a/src/mongo/db/querypattern.h b/src/mongo/db/querypattern.h new file mode 100644 index 00000000000..000c301a0de --- /dev/null +++ b/src/mongo/db/querypattern.h @@ -0,0 +1,78 @@ +// @file querypattern.h - Query pattern matching for selecting similar plans given similar queries. + +/* Copyright 2011 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "jsobj.h" +#include "queryutil.h" + +namespace mongo { + + /** + * Implements query pattern matching, used to determine if a query is + * similar to an earlier query and should use the same plan. + * + * Two queries will generate the same QueryPattern, and therefore match each + * other, if their fields have the same Types and they have the same sort + * spec.
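+ * + * For example (hypothetical queries), {a: 4, b: {$gt: 2}} and + * {a: 9, b: {$gt: 7}}, each with sort {c: 1}, produce the same pattern + * {a: Equality, b: LowerBound} with sort {c: 1}, so a plan cached for one + * may be reused for the other, while {a: {$lt: 4}} produces a different + * pattern.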
+ */ + class QueryPattern { + public: + QueryPattern( const FieldRangeSet &frs, const BSONObj &sort ); + enum Type { + Equality, + LowerBound, + UpperBound, + UpperAndLowerBound + }; + bool operator<( const QueryPattern &other ) const; + /** for testing only */ + bool operator==( const QueryPattern &other ) const; + /** for testing only */ + bool operator!=( const QueryPattern &other ) const; + /** for development / debugging */ + string toString() const; + private: + void setSort( const BSONObj sort ); + static BSONObj normalizeSort( const BSONObj &spec ); + map<string,Type> _fieldTypes; + BSONObj _sort; + }; + + inline bool QueryPattern::operator<( const QueryPattern &other ) const { + map<string,Type>::const_iterator i = _fieldTypes.begin(); + map<string,Type>::const_iterator j = other._fieldTypes.begin(); + while( i != _fieldTypes.end() ) { + if ( j == other._fieldTypes.end() ) + return false; + if ( i->first < j->first ) + return true; + else if ( i->first > j->first ) + return false; + if ( i->second < j->second ) + return true; + else if ( i->second > j->second ) + return false; + ++i; + ++j; + } + if ( j != other._fieldTypes.end() ) + return true; + return _sort.woCompare( other._sort ) < 0; + } + +} // namespace mongo diff --git a/src/mongo/db/queryutil-inl.h b/src/mongo/db/queryutil-inl.h new file mode 100644 index 00000000000..08d3b1fac52 --- /dev/null +++ b/src/mongo/db/queryutil-inl.h @@ -0,0 +1,153 @@ +// @file queryutil-inl.h - Inline definitions for frequently called queryutil.h functions + +/* Copyright 2011 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace mongo { + + inline bool FieldInterval::equality() const { + if ( _cachedEquality == -1 ) { + _cachedEquality = ( _lower._inclusive && _upper._inclusive && _lower._bound.woCompare( _upper._bound, false ) == 0 ); + } + return _cachedEquality != 0; + } + + inline bool FieldRange::equality() const { + return + !empty() && + min().woCompare( max(), false ) == 0 && + maxInclusive() && + minInclusive(); + } + + inline bool FieldRange::inQuery() const { + if ( equality() ) { + return true; + } + for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { + if ( !i->equality() ) { + return false; + } + } + return true; + } + + /** + * TODO Assumes intervals are contiguous and minKey/maxKey will not be + * matched against. + */ + inline bool FieldRange::nontrivial() const { + return + ! 
empty() && + ( _intervals.size() != 1 || + minKey.firstElement().woCompare( min(), false ) != 0 || + maxKey.firstElement().woCompare( max(), false ) != 0 ); + } + + inline const FieldRange &FieldRangeSet::range( const char *fieldName ) const { + map<string,FieldRange>::const_iterator f = _ranges.find( fieldName ); + if ( f == _ranges.end() ) + return trivialRange(); + return f->second; + } + + inline FieldRange &FieldRangeSet::range( const char *fieldName ) { + map<string,FieldRange>::iterator f = _ranges.find( fieldName ); + if ( f == _ranges.end() ) { + _ranges.insert( make_pair( string( fieldName ), trivialRange() ) ); + return _ranges.find( fieldName )->second; + } + return f->second; + } + + inline int FieldRangeSet::nNontrivialRanges() const { + int count = 0; + for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + if ( i->second.nontrivial() ) + ++count; + } + return count; + } + + inline bool FieldRangeSet::matchPossible() const { + for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + if ( i->second.empty() ) { + return false; + } + } + return true; + } + + inline bool FieldRangeSet::matchPossibleForIndex( const BSONObj &keyPattern ) const { + if ( !_singleKey ) { + return matchPossible(); + } + BSONObjIterator i( keyPattern ); + while( i.more() ) { + BSONElement e = i.next(); + if ( e.fieldName() == string( "$natural" ) ) { + return true; + } + if ( range( e.fieldName() ).empty() ) { + return false; + } + } + return true; + } + + inline long long FieldRangeVector::size() { + long long ret = 1; + for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + ret *= i->intervals().size(); + } + return ret; + } + + inline FieldRangeSetPair *OrRangeGenerator::topFrsp() const { + FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet ); + if (_orSets.size()) { + *ret &= _orSets.front(); + } + return ret; + } + + inline FieldRangeSetPair *OrRangeGenerator::topFrspOriginal() const { + FieldRangeSetPair *ret = new FieldRangeSetPair( _baseSet ); + if (_originalOrSets.size()) { + *ret &= _originalOrSets.front(); + } + return ret; + } + + inline bool FieldRangeSetPair::matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const { + assertValidIndexOrNoIndex( d, idxNo ); + if ( !matchPossible() ) { + return false; + } + if ( idxNo < 0 ) { + // multi key matchPossible() is true, so return true. + return true; + } + return frsForIndex( d, idxNo ).matchPossibleForIndex( keyPattern ); + } + + inline void FieldRangeSetPair::assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const { + massert( 14049, "FieldRangeSetPair invalid index specified", idxNo >= -1 ); + if ( idxNo >= 0 ) { + assertValidIndex( d, idxNo ); + } + } + +} // namespace mongo diff --git a/src/mongo/db/queryutil.cpp b/src/mongo/db/queryutil.cpp new file mode 100644 index 00000000000..e6748c4bc2e --- /dev/null +++ b/src/mongo/db/queryutil.cpp @@ -0,0 +1,1551 @@ +// @file queryutil.cpp + +/* Copyright 2009 10gen Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "pch.h" + +#include "btree.h" +#include "matcher.h" +#include "pdfile.h" +#include "queryoptimizer.h" +#include "../util/unittest.h" +#include "dbmessage.h" +#include "indexkey.h" +#include "../util/mongoutils/str.h" + +namespace mongo { + extern BSONObj staticNull; + extern BSONObj staticUndefined; + + /** returns a string that when used as a matcher, would match a superset of regex() + returns "" for complex regular expressions + used to optimize queries in some simple regex cases that start with '^' + + if purePrefix != NULL, sets it to whether the regex can be converted to a range query + */ + string simpleRegex(const char* regex, const char* flags, bool* purePrefix) { + string r = ""; + + if (purePrefix) *purePrefix = false; + + bool multilineOK; + if ( regex[0] == '\\' && regex[1] == 'A') { + multilineOK = true; + regex += 2; + } + else if (regex[0] == '^') { + multilineOK = false; + regex += 1; + } + else { + return r; + } + + bool extended = false; + while (*flags) { + switch (*(flags++)) { + case 'm': // multiline + if (multilineOK) + continue; + else + return r; + case 'x': // extended + extended = true; + break; + default: + return r; // can't use index + } + } + + stringstream ss; + + while(*regex) { + char c = *(regex++); + if ( c == '*' || c == '?' ) { + // These are the only two symbols that make the last char optional + r = ss.str(); + r = r.substr( 0 , r.size() - 1 ); + return r; //breaking here fails with /^a?/ + } + else if (c == '|') { + // whole match so far is optional. Nothing we can do here. + return string(); + } + else if (c == '\\') { + c = *(regex++); + if (c == 'Q'){ + // \Q...\E quotes everything inside + while (*regex) { + c = (*regex++); + if (c == '\\' && (*regex == 'E')){ + regex++; //skip the 'E' + break; // go back to start of outer loop + } + else { + ss << c; // character should match itself + } + } + } + else if ((c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + (c == '\0')) { + // don't know what to do with these + r = ss.str(); + break; + } + else { + // backslash followed by non-alphanumeric represents the following char + ss << c; + } + } + else if (strchr("^$.[()+{", c)) { + // list of "metacharacters" from man pcrepattern + r = ss.str(); + break; + } + else if (extended && c == '#') { + // comment + r = ss.str(); + break; + } + else if (extended && isspace(c)) { + continue; + } + else { + // self-matching char + ss << c; + } + } + + if ( r.empty() && *regex == 0 ) { + r = ss.str(); + if (purePrefix) *purePrefix = !r.empty(); + } + + return r; + } + inline string simpleRegex(const BSONElement& e) { + switch(e.type()) { + case RegEx: + return simpleRegex(e.regex(), e.regexFlags()); + case Object: { + BSONObj o = e.embeddedObject(); + return simpleRegex(o["$regex"].valuestrsafe(), o["$options"].valuestrsafe()); + } + default: assert(false); return ""; //return squashes compiler warning + } + } + + string simpleRegexEnd( string regex ) { + ++regex[ regex.length() - 1 ]; + return regex; + } + + + FieldRange::FieldRange( const BSONElement &e, bool singleKey, bool isNot, bool optimize ) + : _singleKey( singleKey ) { + int op = e.getGtLtOp(); + + // NOTE with $not, we could potentially form a complementary set of intervals.
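+ // For example, using the helpers above (values traced by hand): + // simpleRegex( "^foo", "", NULL ); // == "foo" + // simpleRegexEnd( "foo" ); // == "fop" + // so /^foo/ is answered below with the half-open index range [ "foo", "fop" ).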
+ if ( !isNot && !e.eoo() && e.type() != RegEx && op == BSONObj::opIN ) { + set<BSONElement,element_lt> vals; + vector<FieldRange> regexes; + uassert( 12580 , "invalid query" , e.isABSONObj() ); + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement ie = i.next(); + uassert( 15881, "$elemMatch not allowed within $in", + ie.type() != Object || + ie.embeddedObject().firstElement().getGtLtOp() != BSONObj::opELEM_MATCH ); + if ( ie.type() == RegEx ) { + regexes.push_back( FieldRange( ie, singleKey, false, optimize ) ); + } + else { + // A document array may be indexed by its first element, by undefined + // if it is empty, or as a full array if it is embedded within another + // array. + vals.insert( ie ); + if ( ie.type() == Array ) { + BSONElement temp = ie.embeddedObject().firstElement(); + if ( temp.eoo() ) { + temp = staticUndefined.firstElement(); + } + vals.insert( temp ); + } + } + } + + for( set<BSONElement,element_lt>::const_iterator i = vals.begin(); i != vals.end(); ++i ) + _intervals.push_back( FieldInterval(*i) ); + + for( vector<FieldRange>::const_iterator i = regexes.begin(); i != regexes.end(); ++i ) + *this |= *i; + + return; + } + + // A document array may be indexed by its first element, by undefined + // if it is empty, or as a full array if it is embedded within another + // array. + if ( e.type() == Array && op == BSONObj::Equality ) { + + _intervals.push_back( FieldInterval(e) ); + BSONElement temp = e.embeddedObject().firstElement(); + if ( temp.eoo() ) { + temp = staticUndefined.firstElement(); + } + if ( temp < e ) { + _intervals.insert( _intervals.begin() , temp ); + } + else { + _intervals.push_back( FieldInterval(temp) ); + } + + return; + } + + _intervals.push_back( FieldInterval() ); + FieldInterval &initial = _intervals[ 0 ]; + BSONElement &lower = initial._lower._bound; + bool &lowerInclusive = initial._lower._inclusive; + BSONElement &upper = initial._upper._bound; + bool &upperInclusive = initial._upper._inclusive; + lower = minKey.firstElement(); + lowerInclusive = true; + upper = maxKey.firstElement(); + upperInclusive = true; + + if ( e.eoo() ) + return; + + bool existsSpec = false; + if ( op == BSONObj::opEXISTS ) { + existsSpec = e.trueValue(); + } + + if ( e.type() == RegEx + || (e.type() == Object && !e.embeddedObject()["$regex"].eoo()) + ) { + uassert( 13454, "invalid regular expression operator", op == BSONObj::Equality || op == BSONObj::opREGEX ); + if ( !isNot ) { // no optimization for negated regex - we could consider creating 2 intervals comprising all nonmatching prefixes + const string r = simpleRegex(e); + if ( r.size() ) { + lower = addObj( BSON( "" << r ) ).firstElement(); + upper = addObj( BSON( "" << simpleRegexEnd( r ) ) ).firstElement(); + upperInclusive = false; + } + else { + BSONObjBuilder b1(32), b2(32); + b1.appendMinForType( "" , String ); + lower = addObj( b1.obj() ).firstElement(); + + b2.appendMaxForType( "" , String ); + upper = addObj( b2.obj() ).firstElement(); + upperInclusive = false; //MaxForType String is an empty Object + } + + // regex matches self - regex type > string type + if (e.type() == RegEx) { + BSONElement re = addObj( BSON( "" << e ) ).firstElement(); + _intervals.push_back( FieldInterval(re) ); + } + else { + BSONObj orig = e.embeddedObject(); + BSONObjBuilder b; + b.appendRegex("", orig["$regex"].valuestrsafe(), orig["$options"].valuestrsafe()); + BSONElement re = addObj( b.obj() ).firstElement(); + _intervals.push_back( FieldInterval(re) ); + } + + } + return; + } + if ( isNot ) { 
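+ // For example (hypothetical query): {a: {$not: {$lt: 5}}} maps LT to GTE + // below, so it scans the same bounds as {a: {$gte: 5}}.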
+ switch( op ) { + case BSONObj::Equality: + return; +// op = BSONObj::NE; +// break; + case BSONObj::opALL: + case BSONObj::opMOD: // NOTE for mod and type, we could consider having 1-2 intervals comprising the complementary types (multiple intervals already possible with $in) + case BSONObj::opTYPE: + // no bound calculation + return; + case BSONObj::NE: + op = BSONObj::Equality; + break; + case BSONObj::LT: + op = BSONObj::GTE; + break; + case BSONObj::LTE: + op = BSONObj::GT; + break; + case BSONObj::GT: + op = BSONObj::LTE; + break; + case BSONObj::GTE: + op = BSONObj::LT; + break; + case BSONObj::opEXISTS: + existsSpec = !existsSpec; + break; + default: // otherwise doesn't matter + break; + } + } + switch( op ) { + case BSONObj::Equality: + lower = upper = e; + break; + case BSONObj::NE: { + // this will invalidate the upper/lower references above + _intervals.push_back( FieldInterval() ); + // optimize doesn't make sense for negative ranges + _intervals[ 0 ]._upper._bound = e; + _intervals[ 0 ]._upper._inclusive = false; + _intervals[ 1 ]._lower._bound = e; + _intervals[ 1 ]._lower._inclusive = false; + _intervals[ 1 ]._upper._bound = maxKey.firstElement(); + _intervals[ 1 ]._upper._inclusive = true; + optimize = false; // don't run optimize code below + break; + } + case BSONObj::LT: + upperInclusive = false; + case BSONObj::LTE: + upper = e; + break; + case BSONObj::GT: + lowerInclusive = false; + case BSONObj::GTE: + lower = e; + break; + case BSONObj::opALL: { + uassert( 10370 , "$all requires array", e.type() == Array ); + BSONObjIterator i( e.embeddedObject() ); + bool bound = false; + while ( i.more() ) { + BSONElement x = i.next(); + if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { + // taken care of elsewhere + } + else if ( x.type() != RegEx ) { + lower = upper = x; + bound = true; + break; + } + } + if ( !bound ) { // if no good non regex bound found, try regex bounds + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement x = i.next(); + if ( x.type() != RegEx ) + continue; + string simple = simpleRegex( x.regex(), x.regexFlags() ); + if ( !simple.empty() ) { + lower = addObj( BSON( "" << simple ) ).firstElement(); + upper = addObj( BSON( "" << simpleRegexEnd( simple ) ) ).firstElement(); + break; + } + } + } + break; + } + case BSONObj::opMOD: { + { + BSONObjBuilder b; + b.appendMinForType( "" , NumberDouble ); + lower = addObj( b.obj() ).firstElement(); + } + { + BSONObjBuilder b; + b.appendMaxForType( "" , NumberDouble ); + upper = addObj( b.obj() ).firstElement(); + } + break; + } + case BSONObj::opTYPE: { + BSONType t = (BSONType)e.numberInt(); + { + BSONObjBuilder b; + b.appendMinForType( "" , t ); + lower = addObj( b.obj() ).firstElement(); + } + { + BSONObjBuilder b; + b.appendMaxForType( "" , t ); + upper = addObj( b.obj() ).firstElement(); + } + + break; + } + case BSONObj::opREGEX: + case BSONObj::opOPTIONS: + // do nothing + break; + case BSONObj::opELEM_MATCH: { + log() << "warning: shouldn't get here?" 
<< endl; + break; + } + case BSONObj::opNEAR: + case BSONObj::opWITHIN: + _special = "2d"; + break; + case BSONObj::opEXISTS: { + if ( !existsSpec ) { + lower = upper = staticNull.firstElement(); + } + optimize = false; + break; + } + default: + break; + } + + if ( optimize ) { + if ( lower.type() != MinKey && upper.type() == MaxKey && lower.isSimpleType() ) { // TODO: get rid of isSimpleType + BSONObjBuilder b; + b.appendMaxForType( lower.fieldName() , lower.type() ); + upper = addObj( b.obj() ).firstElement(); + } + else if ( lower.type() == MinKey && upper.type() != MaxKey && upper.isSimpleType() ) { // TODO: get rid of isSimpleType + if( upper.type() == Date ) + lowerInclusive = false; + BSONObjBuilder b; + b.appendMinForType( upper.fieldName() , upper.type() ); + lower = addObj( b.obj() ).firstElement(); + } + } + + } + + void FieldRange::finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ) { + _intervals = newIntervals; + for( vector<BSONObj>::const_iterator i = other._objData.begin(); i != other._objData.end(); ++i ) + _objData.push_back( *i ); + if ( _special.size() == 0 && other._special.size() ) + _special = other._special; + } + + // as called, these functions find the max/min of a bound in the + // opposite direction, so inclusive bounds are considered less + // superlative + FieldBound maxFieldBound( const FieldBound &a, const FieldBound &b ) { + int cmp = a._bound.woCompare( b._bound, false ); + if ( ( cmp == 0 && !b._inclusive ) || cmp < 0 ) + return b; + return a; + } + + FieldBound minFieldBound( const FieldBound &a, const FieldBound &b ) { + int cmp = a._bound.woCompare( b._bound, false ); + if ( ( cmp == 0 && !b._inclusive ) || cmp > 0 ) + return b; + return a; + } + + bool fieldIntervalOverlap( const FieldInterval &one, const FieldInterval &two, FieldInterval &result ) { + result._lower = maxFieldBound( one._lower, two._lower ); + result._upper = minFieldBound( one._upper, two._upper ); + return result.strictValid(); + } + + const FieldRange &FieldRange::operator&=( const FieldRange &other ) { + if ( !_singleKey && nontrivial() ) { + if ( other <= *this ) { + *this = other; + } + return *this; + } + vector<FieldInterval> newIntervals; + vector<FieldInterval>::const_iterator i = _intervals.begin(); + vector<FieldInterval>::const_iterator j = other._intervals.begin(); + while( i != _intervals.end() && j != other._intervals.end() ) { + FieldInterval overlap; + if ( fieldIntervalOverlap( *i, *j, overlap ) ) { + newIntervals.push_back( overlap ); + } + if ( i->_upper == minFieldBound( i->_upper, j->_upper ) ) { + ++i; + } + else { + ++j; + } + } + finishOperation( newIntervals, other ); + return *this; + } + + void handleInterval( const FieldInterval &lower, FieldBound &low, FieldBound &high, vector<FieldInterval> &newIntervals ) { + if ( low._bound.eoo() ) { + low = lower._lower; high = lower._upper; + } + else { + int cmp = high._bound.woCompare( lower._lower._bound, false ); + if ( ( cmp < 0 ) || ( cmp == 0 && !high._inclusive && !lower._lower._inclusive ) ) { + FieldInterval tmp; + tmp._lower = low; + tmp._upper = high; + newIntervals.push_back( tmp ); + low = lower._lower; high = lower._upper; + } + else { + high = lower._upper; + } + } + } + + const FieldRange &FieldRange::operator|=( const FieldRange &other ) { + vector<FieldInterval> newIntervals; + FieldBound low; + FieldBound high; + vector<FieldInterval>::const_iterator i = _intervals.begin(); + vector<FieldInterval>::const_iterator j = other._intervals.begin(); + while( i != 
_intervals.end() && j != other._intervals.end() ) { + int cmp = i->_lower._bound.woCompare( j->_lower._bound, false ); + if ( ( cmp == 0 && i->_lower._inclusive ) || cmp < 0 ) { + handleInterval( *i, low, high, newIntervals ); + ++i; + } + else { + handleInterval( *j, low, high, newIntervals ); + ++j; + } + } + while( i != _intervals.end() ) { + handleInterval( *i, low, high, newIntervals ); + ++i; + } + while( j != other._intervals.end() ) { + handleInterval( *j, low, high, newIntervals ); + ++j; + } + FieldInterval tmp; + tmp._lower = low; + tmp._upper = high; + newIntervals.push_back( tmp ); + finishOperation( newIntervals, other ); + return *this; + } + + const FieldRange &FieldRange::operator-=( const FieldRange &other ) { + vector<FieldInterval> newIntervals; + vector<FieldInterval>::iterator i = _intervals.begin(); + vector<FieldInterval>::const_iterator j = other._intervals.begin(); + while( i != _intervals.end() && j != other._intervals.end() ) { + int cmp = i->_lower._bound.woCompare( j->_lower._bound, false ); + if ( cmp < 0 || + ( cmp == 0 && i->_lower._inclusive && !j->_lower._inclusive ) ) { + int cmp2 = i->_upper._bound.woCompare( j->_lower._bound, false ); + if ( cmp2 < 0 ) { + newIntervals.push_back( *i ); + ++i; + } + else if ( cmp2 == 0 ) { + newIntervals.push_back( *i ); + if ( newIntervals.back()._upper._inclusive && j->_lower._inclusive ) { + newIntervals.back()._upper._inclusive = false; + } + ++i; + } + else { + newIntervals.push_back( *i ); + newIntervals.back()._upper = j->_lower; + newIntervals.back()._upper.flipInclusive(); + int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false ); + if ( cmp3 < 0 || + ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) { + ++i; + } + else { + i->_lower = j->_upper; + i->_lower.flipInclusive(); + ++j; + } + } + } + else { + int cmp2 = i->_lower._bound.woCompare( j->_upper._bound, false ); + if ( cmp2 > 0 || + ( cmp2 == 0 && ( !i->_lower._inclusive || !j->_upper._inclusive ) ) ) { + ++j; + } + else { + int cmp3 = i->_upper._bound.woCompare( j->_upper._bound, false ); + if ( cmp3 < 0 || + ( cmp3 == 0 && ( !i->_upper._inclusive || j->_upper._inclusive ) ) ) { + ++i; + } + else { + i->_lower = j->_upper; + i->_lower.flipInclusive(); + ++j; + } + } + } + } + while( i != _intervals.end() ) { + newIntervals.push_back( *i ); + ++i; + } + finishOperation( newIntervals, other ); + return *this; + } + + // TODO write a proper implementation that doesn't do a full copy + bool FieldRange::operator<=( const FieldRange &other ) const { + FieldRange temp = *this; + temp -= other; + return temp.empty(); + } + + void FieldRange::setExclusiveBounds() { + for( vector<FieldInterval>::iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { + i->_lower._inclusive = false; + i->_upper._inclusive = false; + } + } + + void FieldRange::reverse( FieldRange &ret ) const { + assert( _special.empty() ); + ret._intervals.clear(); + ret._objData = _objData; + for( vector<FieldInterval>::const_reverse_iterator i = _intervals.rbegin(); i != _intervals.rend(); ++i ) { + FieldInterval fi; + fi._lower = i->_upper; + fi._upper = i->_lower; + ret._intervals.push_back( fi ); + } + } + + BSONObj FieldRange::addObj( const BSONObj &o ) { + _objData.push_back( o ); + return o; + } + + string FieldInterval::toString() const { + StringBuilder buf; + buf << ( _lower._inclusive ? "[" : "(" ); + buf << _lower._bound; + buf << " , "; + buf << _upper._bound; + buf << ( _upper._inclusive ? "]" : ")" ); + return buf.str(); + } + + string FieldRange::toString() const { + StringBuilder buf; + buf << "(FieldRange special: " << _special << " singleKey: " << _singleKey << " intervals: "; + for( vector<FieldInterval>::const_iterator i = _intervals.begin(); i != _intervals.end(); ++i ) { + buf << i->toString(); + } + + buf << ")"; + return buf.str(); + } + + string FieldRangeSet::getSpecial() const { + string s = ""; + for ( map<string,FieldRange>::const_iterator i=_ranges.begin(); i!=_ranges.end(); i++ ) { + if ( i->second.getSpecial().size() == 0 ) + continue; + uassert( 13033 , "can't have 2 special fields" , s.size() == 0 ); + s = i->second.getSpecial(); + } + return s; + } + + /** + * Btree scanning for a multidimensional key range will yield a + * multidimensional box. The idea here is that if an 'other' + * multidimensional box contains the current box we don't have to scan + * the current box. If the 'other' box contains the current box in + * all dimensions but one, we can safely subtract the values of 'other' + * along that one dimension from the values for the current box on the + * same dimension. In other situations, subtracting the 'other' + * box from the current box yields a result that is not a box (but + * rather can be expressed as a union of boxes). We don't support + * such splitting currently in calculating index ranges. Note that + * where I have said 'box' above, I actually mean sets of boxes because + * a field range can consist of multiple intervals. + * For example, subtracting {a:[0,10], b:[5,10]} from {a:[0,10], b:[0,10]} + * leaves {a:[0,10], b:[0,5)}, while subtracting {a:[5,10], b:[5,10]} would + * split both dimensions, so in that case nothing is subtracted. + */ + const FieldRangeSet &FieldRangeSet::operator-=( const FieldRangeSet &other ) { + int nUnincluded = 0; + string unincludedKey; + map<string,FieldRange>::iterator i = _ranges.begin(); + map<string,FieldRange>::const_iterator j = other._ranges.begin(); + while( nUnincluded < 2 && i != _ranges.end() && j != other._ranges.end() ) { + int cmp = i->first.compare( j->first ); + if ( cmp == 0 ) { + if ( i->second <= j->second ) { + // nothing + } + else { + ++nUnincluded; + unincludedKey = i->first; + } + ++i; + ++j; + } + else if ( cmp < 0 ) { + ++i; + } + else { + // other has a bound we don't, nothing can be done + return *this; + } + } + if ( j != other._ranges.end() ) { + // other has a bound we don't, nothing can be done + return *this; + } + if ( nUnincluded > 1 ) { + return *this; + } + if ( nUnincluded == 0 ) { + makeEmpty(); + return *this; + } + // nUnincluded == 1 + range( unincludedKey.c_str() ) -= other.range( unincludedKey.c_str() ); + appendQueries( other ); + return *this; + } + + const FieldRangeSet &FieldRangeSet::operator&=( const FieldRangeSet &other ) { + map<string,FieldRange>::iterator i = _ranges.begin(); + map<string,FieldRange>::const_iterator j = other._ranges.begin(); + while( i != _ranges.end() && j != other._ranges.end() ) { + int cmp = i->first.compare( j->first ); + if ( cmp == 0 ) { + // Same field name, so find range intersection. + i->second &= j->second; + ++i; + ++j; + } + else if ( cmp < 0 ) { + // Field present in *this. + ++i; + } + else { + // Field not present in *this, so add it. + range( j->first.c_str() ) = j->second; + ++j; + } + } + while( j != other._ranges.end() ) { + // Field not present in *this, add it.
+ range( j->first.c_str() ) = j->second; + ++j; + } + appendQueries( other ); + return *this; + } + + void FieldRangeSet::appendQueries( const FieldRangeSet &other ) { + for( vector<BSONObj>::const_iterator i = other._queries.begin(); i != other._queries.end(); ++i ) { + _queries.push_back( *i ); + } + } + + void FieldRangeSet::makeEmpty() { + for( map<string,FieldRange>::iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + i->second.makeEmpty(); + } + } + + void FieldRangeSet::processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ) { + BSONElement g = f; + int op2 = g.getGtLtOp(); + if ( op2 == BSONObj::opALL ) { + BSONElement h = g; + uassert( 13050 , "$all requires array", h.type() == Array ); + BSONObjIterator i( h.embeddedObject() ); + if( i.more() ) { + BSONElement x = i.next(); + if ( x.type() == Object && x.embeddedObject().firstElement().getGtLtOp() == BSONObj::opELEM_MATCH ) { + g = x.embeddedObject().firstElement(); + op2 = g.getGtLtOp(); + } + } + } + if ( op2 == BSONObj::opELEM_MATCH ) { + BSONObjIterator k( g.embeddedObjectUserCheck() ); + while ( k.more() ) { + BSONElement h = k.next(); + StringBuilder buf(32); + buf << fieldName << "." << h.fieldName(); + string fullname = buf.str(); + + int op3 = getGtLtOp( h ); + if ( op3 == BSONObj::Equality ) { + range( fullname.c_str() ) &= FieldRange( h , _singleKey , isNot , optimize ); + } + else { + BSONObjIterator l( h.embeddedObject() ); + while ( l.more() ) { + range( fullname.c_str() ) &= FieldRange( l.next() , _singleKey , isNot , optimize ); + } + } + } + } + else { + range( fieldName ) &= FieldRange( f , _singleKey , isNot , optimize ); + } + } + + void FieldRangeSet::processQueryField( const BSONElement &e, bool optimize ) { + if ( e.fieldName()[ 0 ] == '$' ) { + if ( strcmp( e.fieldName(), "$and" ) == 0 ) { + uassert( 14816 , "$and expression must be a nonempty array" , e.type() == Array && e.embeddedObject().nFields() > 0 ); + BSONObjIterator i( e.embeddedObject() ); + while( i.more() ) { + BSONElement e = i.next(); + uassert( 14817 , "$and elements must be objects" , e.type() == Object ); + BSONObjIterator j( e.embeddedObject() ); + while( j.more() ) { + processQueryField( j.next(), optimize ); + } + } + } + + if ( strcmp( e.fieldName(), "$where" ) == 0 ) { + return; + } + + if ( strcmp( e.fieldName(), "$or" ) == 0 ) { + return; + } + + if ( strcmp( e.fieldName(), "$nor" ) == 0 ) { + return; + } + } + + bool equality = ( getGtLtOp( e ) == BSONObj::Equality ); + if ( equality && e.type() == Object ) { + equality = ( strcmp( e.embeddedObject().firstElementFieldName(), "$not" ) != 0 ); + } + + if ( equality || ( e.type() == Object && !e.embeddedObject()[ "$regex" ].eoo() ) ) { + range( e.fieldName() ) &= FieldRange( e , _singleKey , false , optimize ); + } + if ( !equality ) { + BSONObjIterator j( e.embeddedObject() ); + while( j.more() ) { + BSONElement f = j.next(); + if ( strcmp( f.fieldName(), "$not" ) == 0 ) { + switch( f.type() ) { + case Object: { + BSONObjIterator k( f.embeddedObject() ); + while( k.more() ) { + BSONElement g = k.next(); + uassert( 13034, "invalid use of $not", g.getGtLtOp() != BSONObj::Equality ); + processOpElement( e.fieldName(), g, true, optimize ); + } + break; + } + case RegEx: + processOpElement( e.fieldName(), f, true, optimize ); + break; + default: + uassert( 13041, "invalid use of $not", false ); + } + } + else { + processOpElement( e.fieldName(), f, false, optimize ); + } + } + } + } + + FieldRangeSet::FieldRangeSet( const char *ns, const BSONObj 
&query, bool singleKey, bool optimize ) + : _ns( ns ), _queries( 1, query.getOwned() ), _singleKey( singleKey ) { + BSONObjIterator i( _queries[ 0 ] ); + + while( i.more() ) { + processQueryField( i.next(), optimize ); + } + } + + FieldRangeVector::FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction ) + :_indexSpec( indexSpec ), _direction( direction >= 0 ? 1 : -1 ) { + _queries = frs._queries; + BSONObjIterator i( _indexSpec.keyPattern ); + set< string > baseObjectNontrivialPrefixes; + while( i.more() ) { + BSONElement e = i.next(); + const FieldRange *range = &frs.range( e.fieldName() ); + if ( !frs.singleKey() ) { + string prefix = str::before( e.fieldName(), '.' ); + if ( baseObjectNontrivialPrefixes.count( prefix ) > 0 ) { + // A field with the same parent field has already been + // constrained, and with a multikey index we cannot + // constrain this field. + range = &frs.trivialRange(); + } else { + if ( range->nontrivial() ) { + baseObjectNontrivialPrefixes.insert( prefix ); + } + } + } + int number = (int) e.number(); // returns 0.0 if not numeric + bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 1 : -1 ) > 0 ); + if ( forward ) { + _ranges.push_back( *range ); + } + else { + _ranges.push_back( FieldRange( BSONObj().firstElement(), frs.singleKey(), false, true ) ); + range->reverse( _ranges.back() ); + } + assert( !_ranges.back().empty() ); + } + uassert( 13385, "combinatorial limit of $in partitioning of result set exceeded", size() < 1000000 ); + } + + BSONObj FieldRangeVector::startKey() const { + BSONObjBuilder b; + for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + const FieldInterval &fi = i->intervals().front(); + b.appendAs( fi._lower._bound, "" ); + } + return b.obj(); + } + + BSONObj FieldRangeVector::endKey() const { + BSONObjBuilder b; + for( vector<FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + const FieldInterval &fi = i->intervals().back(); + b.appendAs( fi._upper._bound, "" ); + } + return b.obj(); + } + + BSONObj FieldRangeVector::obj() const { + BSONObjBuilder b; + BSONObjIterator k( _indexSpec.keyPattern ); + for( int i = 0; i < (int)_ranges.size(); ++i ) { + BSONArrayBuilder a( b.subarrayStart( k.next().fieldName() ) ); + for( vector<FieldInterval>::const_iterator j = _ranges[ i ].intervals().begin(); + j != _ranges[ i ].intervals().end(); ++j ) { + a << BSONArray( BSON_ARRAY( j->_lower._bound << j->_upper._bound ).clientReadable() ); + } + a.done(); + } + return b.obj(); + } + + FieldRange *FieldRangeSet::__singleKeyTrivialRange = 0; + FieldRange *FieldRangeSet::__multiKeyTrivialRange = 0; + const FieldRange &FieldRangeSet::trivialRange() const { + FieldRange *&ret = _singleKey ?
__singleKeyTrivialRange : __multiKeyTrivialRange; + if ( ret == 0 ) { + ret = new FieldRange( BSONObj().firstElement(), _singleKey, false, true ); + } + return *ret; + } + + BSONObj FieldRangeSet::simplifiedQuery( const BSONObj &_fields ) const { + BSONObj fields = _fields; + if ( fields.isEmpty() ) { + BSONObjBuilder b; + for( map<string,FieldRange>::const_iterator i = _ranges.begin(); i != _ranges.end(); ++i ) { + b.append( i->first, 1 ); + } + fields = b.obj(); + } + BSONObjBuilder b; + BSONObjIterator i( fields ); + while( i.more() ) { + BSONElement e = i.next(); + const char *name = e.fieldName(); + const FieldRange &eRange = range( name ); + assert( !eRange.empty() ); + if ( eRange.equality() ) + b.appendAs( eRange.min(), name ); + else if ( eRange.nontrivial() ) { + BSONObj o; + BSONObjBuilder c; + if ( eRange.min().type() != MinKey ) + c.appendAs( eRange.min(), eRange.minInclusive() ? "$gte" : "$gt" ); + if ( eRange.max().type() != MaxKey ) + c.appendAs( eRange.max(), eRange.maxInclusive() ? "$lte" : "$lt" ); + o = c.obj(); + b.append( name, o ); + } + } + return b.obj(); + } + + QueryPattern FieldRangeSet::pattern( const BSONObj &sort ) const { + return QueryPattern( *this, sort ); + } + + // TODO get rid of this + BoundList FieldRangeSet::indexBounds( const BSONObj &keyPattern, int direction ) const { + typedef vector<pair<shared_ptr<BSONObjBuilder>, shared_ptr<BSONObjBuilder> > > BoundBuilders; + BoundBuilders builders; + builders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) ); + BSONObjIterator i( keyPattern ); + bool ineq = false; // until ineq is true, we are just dealing with equality and $in bounds + while( i.more() ) { + BSONElement e = i.next(); + const FieldRange &fr = range( e.fieldName() ); + int number = (int) e.number(); // returns 0.0 if not numeric + bool forward = ( ( number >= 0 ? 1 : -1 ) * ( direction >= 0 ? 
1 : -1 ) > 0 ); + if ( !ineq ) { + if ( fr.equality() ) { + for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) { + j->first->appendAs( fr.min(), "" ); + j->second->appendAs( fr.min(), "" ); + } + } + else { + if ( !fr.inQuery() ) { + ineq = true; + } + BoundBuilders newBuilders; + const vector<FieldInterval> &intervals = fr.intervals(); + for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) { + BSONObj first = i->first->obj(); + BSONObj second = i->second->obj(); + + const unsigned maxCombinations = 4000000; + if ( forward ) { + for( vector<FieldInterval>::const_iterator j = intervals.begin(); j != intervals.end(); ++j ) { + uassert( 13303, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations ); + newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) ); + newBuilders.back().first->appendElements( first ); + newBuilders.back().second->appendElements( second ); + newBuilders.back().first->appendAs( j->_lower._bound, "" ); + newBuilders.back().second->appendAs( j->_upper._bound, "" ); + } + } + else { + for( vector<FieldInterval>::const_reverse_iterator j = intervals.rbegin(); j != intervals.rend(); ++j ) { + uassert( 13304, "combinatorial limit of $in partitioning of result set exceeded", newBuilders.size() < maxCombinations ); + newBuilders.push_back( make_pair( shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ), shared_ptr<BSONObjBuilder>( new BSONObjBuilder() ) ) ); + newBuilders.back().first->appendElements( first ); + newBuilders.back().second->appendElements( second ); + newBuilders.back().first->appendAs( j->_upper._bound, "" ); + newBuilders.back().second->appendAs( j->_lower._bound, "" ); + } + } + } + builders = newBuilders; + } + } + else { + for( BoundBuilders::const_iterator j = builders.begin(); j != builders.end(); ++j ) { + j->first->appendAs( forward ? fr.min() : fr.max(), "" ); + j->second->appendAs( forward ? 
fr.max() : fr.min(), "" ); + } + } + } + BoundList ret; + for( BoundBuilders::const_iterator i = builders.begin(); i != builders.end(); ++i ) + ret.push_back( make_pair( i->first->obj(), i->second->obj() ) ); + return ret; + } + + FieldRangeSet *FieldRangeSet::subset( const BSONObj &fields ) const { + FieldRangeSet *ret = new FieldRangeSet( _ns, BSONObj(), _singleKey, true ); + BSONObjIterator i( fields ); + while( i.more() ) { + BSONElement e = i.next(); + if ( range( e.fieldName() ).nontrivial() ) { + ret->range( e.fieldName() ) = range( e.fieldName() ); + } + } + ret->_queries = _queries; + return ret; + } + + bool FieldRangeSetPair::noNontrivialRanges() const { + return _singleKey.matchPossible() && _singleKey.nNontrivialRanges() == 0 && + _multiKey.matchPossible() && _multiKey.nNontrivialRanges() == 0; + } + + FieldRangeSetPair &FieldRangeSetPair::operator&=( const FieldRangeSetPair &other ) { + _singleKey &= other._singleKey; + _multiKey &= other._multiKey; + return *this; + } + + FieldRangeSetPair &FieldRangeSetPair::operator-=( const FieldRangeSet &scanned ) { + _singleKey -= scanned; + _multiKey -= scanned; + return *this; + } + + BSONObj FieldRangeSetPair::simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const { + return frsForIndex( d, idxNo ).simplifiedQuery( keyPattern ); + } + + void FieldRangeSetPair::assertValidIndex( const NamespaceDetails *d, int idxNo ) const { + massert( 14048, "FieldRangeSetPair invalid index specified", idxNo >= 0 && idxNo < d->nIndexes ); + } + + const FieldRangeSet &FieldRangeSetPair::frsForIndex( const NamespaceDetails* nsd, int idxNo ) const { + assertValidIndexOrNoIndex( nsd, idxNo ); + if ( idxNo < 0 ) { + // An unindexed cursor cannot have a "single key" constraint. + return _multiKey; + } + return nsd->isMultikey( idxNo ) ? 
_multiKey : _singleKey; + } + + bool FieldRangeVector::matchesElement( const BSONElement &e, int i, bool forward ) const { + bool eq; + int l = matchingLowElement( e, i, forward, eq ); + return ( l % 2 == 0 ); // if we're inside an interval + } + + // binary search for interval containing the specified element + // an even return value indicates that the element is contained within a valid interval + int FieldRangeVector::matchingLowElement( const BSONElement &e, int i, bool forward, bool &lowEquality ) const { + lowEquality = false; + int l = -1; + int h = _ranges[ i ].intervals().size() * 2; + while( l + 1 < h ) { + int m = ( l + h ) / 2; + BSONElement toCmp; + bool toCmpInclusive; + const FieldInterval &interval = _ranges[ i ].intervals()[ m / 2 ]; + if ( m % 2 == 0 ) { + toCmp = interval._lower._bound; + toCmpInclusive = interval._lower._inclusive; + } + else { + toCmp = interval._upper._bound; + toCmpInclusive = interval._upper._inclusive; + } + int cmp = toCmp.woCompare( e, false ); + if ( !forward ) { + cmp = -cmp; + } + if ( cmp < 0 ) { + l = m; + } + else if ( cmp > 0 ) { + h = m; + } + else { + if ( m % 2 == 0 ) { + lowEquality = true; + } + int ret = m; + // if left match and inclusive, all good + // if left match and not inclusive, return right before left bound + // if right match and inclusive, return left bound + // if right match and not inclusive, return right bound + if ( ( m % 2 == 0 && !toCmpInclusive ) || ( m % 2 == 1 && toCmpInclusive ) ) { + --ret; + } + return ret; + } + } + assert( l + 1 == h ); + return l; + } + + bool FieldRangeVector::matchesKey( const BSONObj &key ) const { + BSONObjIterator j( key ); + BSONObjIterator k( _indexSpec.keyPattern ); + for( int l = 0; l < (int)_ranges.size(); ++l ) { + int number = (int) k.next().number(); + bool forward = ( number >= 0 ? 1 : -1 ) * ( _direction >= 0 ? 1 : -1 ) > 0; + if ( !matchesElement( j.next(), l, forward ) ) { + return false; + } + } + return true; + } + + bool FieldRangeVector::matches( const BSONObj &obj ) const { + + bool ok = false; + + // TODO The representation of matching keys could potentially be optimized + // more for the case at hand. (For example, we can potentially consider + // fields individually instead of constructing several bson objects using + // multikey arrays.) But getKeys() canonically defines the key set for a + // given object and for now we are using it as is. + BSONObjSet keys; + _indexSpec.getKeys( obj, keys ); + for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { + if ( matchesKey( *i ) ) { + ok = true; + break; + } + } + + LOG(5) << "FieldRangeVector::matches() returns " << ok << endl; + + return ok; + } + + BSONObj FieldRangeVector::firstMatch( const BSONObj &obj ) const { + // NOTE Only works in forward direction. 
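+ // For example (hypothetical): with index {a:1} and range a:[1,3], a document + // {a:[2,9]} yields index keys {"":2} and {"":9}; matchesKey() accepts {"":2}, + // so matches() above returns true and firstMatch() returns {"":2}.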
+        assert( _direction >= 0 );
+        BSONObjSet keys( BSONObjCmp( _indexSpec.keyPattern ) );
+        _indexSpec.getKeys( obj, keys );
+        for( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) {
+            if ( matchesKey( *i ) ) {
+                return *i;
+            }
+        }
+        return BSONObj();
+    }
+
+    // TODO optimize more
+    int FieldRangeVectorIterator::advance( const BSONObj &curr ) {
+        BSONObjIterator j( curr );
+        BSONObjIterator o( _v._indexSpec.keyPattern );
+        // track first field for which we are not at the end of the valid values,
+        // since we may need to advance from the key prefix ending with this field
+        int latestNonEndpoint = -1;
+        // iterate over fields to determine appropriate advance method
+        for( int i = 0; i < (int)_i.size(); ++i ) {
+            if ( i > 0 && !_v._ranges[ i - 1 ].intervals()[ _i[ i - 1 ] ].equality() ) {
+                // if the last bound was an inequality, we don't know anything about where we are for this field
+                // TODO if possible avoid this in certain cases, e.g. when the value of the previous field in the
+                // previous key is the same as the value of the previous field in the current key
+                setMinus( i );
+            }
+            bool eq = false;
+            BSONElement oo = o.next();
+            bool reverse = ( ( oo.number() < 0 ) ^ ( _v._direction < 0 ) );
+            BSONElement jj = j.next();
+            if ( _i[ i ] == -1 ) { // unknown position for this field, do binary search
+                bool lowEquality;
+                int l = _v.matchingLowElement( jj, i, !reverse, lowEquality );
+                if ( l % 2 == 0 ) { // we are in a valid range for this field
+                    _i[ i ] = l / 2;
+                    int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+                    if ( diff > 1 ) {
+                        latestNonEndpoint = i;
+                    }
+                    else if ( diff == 1 ) {
+                        int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+                        if ( x != 0 ) {
+                            latestNonEndpoint = i;
+                        }
+                    }
+                    continue;
+                }
+                else { // not in a valid range for this field - determine if and how to advance
+                    // check if we're after the last interval for this field
+                    if ( l == (int)_v._ranges[ i ].intervals().size() * 2 - 1 ) {
+                        if ( latestNonEndpoint == -1 ) {
+                            return -2;
+                        }
+                        setZero( latestNonEndpoint + 1 );
+                        // skip to curr / latestNonEndpoint + 1 / superlative
+                        _after = true;
+                        return latestNonEndpoint + 1;
+                    }
+                    _i[ i ] = ( l + 1 ) / 2;
+                    if ( lowEquality ) {
+                        // skip to curr / i + 1 / superlative
+                        _after = true;
+                        return i + 1;
+                    }
+                    // skip to curr / i / nextbounds
+                    _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+                    _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+                    for( int j = i + 1; j < (int)_i.size(); ++j ) {
+                        _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+                        _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+                    }
+                    _after = false;
+                    return i;
+                }
+            }
+            bool first = true;
+            // _i[ i ] != -1, so we have a starting interval for this field
+            // which serves as a lower/equal bound on the first iteration -
+            // we advance from this interval to find a matching interval
+            while( _i[ i ] < (int)_v._ranges[ i ].intervals().size() ) {
+                // compare to current interval's upper bound
+                int x = _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._bound.woCompare( jj, false );
+                if ( reverse ) {
+                    x = -x;
+                }
+                if ( x == 0 && _v._ranges[ i ].intervals()[ _i[ i ] ]._upper._inclusive ) {
+                    eq = true;
+                    break;
+                }
+                // see if we're less than the upper bound
+                if ( x > 0 ) {
+                    if ( i == 0 && first ) {
+                        // the value of the 1st field won't go backward, so don't check the lower bound
+                        // TODO maybe we can check first only?
+                        break;
+                    }
+                    // if it's an equality interval, don't need to compare separately to lower bound
+                    if ( !_v._ranges[ i ].intervals()[ _i[ i ] ].equality() ) {
+                        // compare to current interval's lower bound
+                        x = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound.woCompare( jj, false );
+                        if ( reverse ) {
+                            x = -x;
+                        }
+                    }
+                    // if we're equal to the lower bound but it is not inclusive, advance
+                    if ( ( x == 0 && !_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive ) ) {
+                        setZero( i + 1 );
+                        // skip to curr / i + 1 / superlative
+                        _after = true;
+                        return i + 1;
+                    }
+                    // if we're less than the lower bound, advance
+                    if ( x > 0 ) {
+                        setZero( i + 1 );
+                        // skip to curr / i / nextbounds
+                        _cmp[ i ] = &_v._ranges[ i ].intervals()[ _i[ i ] ]._lower._bound;
+                        _inc[ i ] = _v._ranges[ i ].intervals()[ _i[ i ] ]._lower._inclusive;
+                        for( int j = i + 1; j < (int)_i.size(); ++j ) {
+                            _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+                            _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+                        }
+                        _after = false;
+                        return i;
+                    }
+                    else {
+                        break;
+                    }
+                }
+                // we're above the upper bound, so try the next interval and reset remaining fields
+                ++_i[ i ];
+                setZero( i + 1 );
+                first = false;
+            }
+            int diff = (int)_v._ranges[ i ].intervals().size() - _i[ i ];
+            if ( diff > 1 || ( !eq && diff == 1 ) ) {
+                // check if we're not at the end of valid values for this field
+                latestNonEndpoint = i;
+            }
+            else if ( diff == 0 ) { // check if we're past the last interval for this field
+                if ( latestNonEndpoint == -1 ) {
+                    return -2;
+                }
+                // more values possible, skip...
+                setZero( latestNonEndpoint + 1 );
+                // skip to curr / latestNonEndpoint + 1 / superlative
+                _after = true;
+                return latestNonEndpoint + 1;
+            }
+        }
+        return -1;
+    }
+
+    void FieldRangeVectorIterator::prepDive() {
+        for( int j = 0; j < (int)_i.size(); ++j ) {
+            _cmp[ j ] = &_v._ranges[ j ].intervals().front()._lower._bound;
+            _inc[ j ] = _v._ranges[ j ].intervals().front()._lower._inclusive;
+        }
+    }
+
+    BSONObj FieldRangeVectorIterator::startKey() {
+        BSONObjBuilder b;
+        for( unsigned i = 0; i < _i.size(); ++i ) {
+            const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+            b.appendAs( fi._lower._bound, "" );
+        }
+        return b.obj();
+    }
+
+    // temp
+    BSONObj FieldRangeVectorIterator::endKey() {
+        BSONObjBuilder b;
+        for( unsigned i = 0; i < _i.size(); ++i ) {
+            const FieldInterval &fi = _v._ranges[ i ].intervals()[ _i[ i ] ];
+            b.appendAs( fi._upper._bound, "" );
+        }
+        return b.obj();
+    }
+
+    OrRangeGenerator::OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize )
+        : _baseSet( ns, query, optimize ), _orFound() {
+
+        BSONObjIterator i( _baseSet.originalQuery() );
+
+        while( i.more() ) {
+            BSONElement e = i.next();
+            if ( strcmp( e.fieldName(), "$or" ) == 0 ) {
+                uassert( 13262, "$or requires nonempty array", e.type() == Array && e.embeddedObject().nFields() > 0 );
+                BSONObjIterator j( e.embeddedObject() );
+                while( j.more() ) {
+                    BSONElement f = j.next();
+                    uassert( 13263, "$or array must contain objects", f.type() == Object );
+                    _orSets.push_back( FieldRangeSetPair( ns, f.embeddedObject(), optimize ) );
+                    uassert( 13291, "$or may not contain 'special' query", _orSets.back().getSpecial().empty() );
+                    _originalOrSets.push_back( _orSets.back() );
+                }
+                _orFound = true;
+                continue;
+            }
+        }
+    }
+
+    void OrRangeGenerator::assertMayPopOrClause() {
+        massert( 13274, "no or clause to pop", !orFinished() );
+    }
+
+    void OrRangeGenerator::popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern ) {
+        assertMayPopOrClause();
+        auto_ptr<FieldRangeSet> holder;
+        const FieldRangeSet *toDiff = &_originalOrSets.front().frsForIndex( nsd, idxNo );
+        BSONObj indexSpec = keyPattern;
+        if ( !indexSpec.isEmpty() && toDiff->matchPossibleForIndex( indexSpec ) ) {
+            holder.reset( toDiff->subset( indexSpec ) );
+            toDiff = holder.get();
+        }
+        popOrClause( toDiff, nsd, idxNo, keyPattern );
+    }
+
+    void OrRangeGenerator::popOrClauseSingleKey() {
+        assertMayPopOrClause();
+        FieldRangeSet *toDiff = &_originalOrSets.front()._singleKey;
+        popOrClause( toDiff );
+    }
+
+    /**
+     * Removes the top or clause, which would have been recently scanned, and
+     * removes the field ranges it covers from all subsequent or clauses. As a
+     * side effect, this function may invalidate the return values of topFrsp()
+     * calls made before this function was called.
+     * @param keyPattern - Keys of the index that was used to satisfy the last or
+     * clause. Used to determine the range of keys that were scanned. If
+     * empty we do not constrain the previous clause's ranges using index keys,
+     * which may reduce opportunities for range elimination.
+     */
+    void OrRangeGenerator::popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) {
+        list<FieldRangeSetPair>::iterator i = _orSets.begin();
+        list<FieldRangeSetPair>::iterator j = _originalOrSets.begin();
+        ++i;
+        ++j;
+        while( i != _orSets.end() ) {
+            *i -= *toDiff;
+            // Check if a match is possible at all, and if it is possible for the recently scanned index.
+            if( !i->matchPossible() || ( d && !i->matchPossibleForIndex( d, idxNo, keyPattern ) ) ) {
+                i = _orSets.erase( i );
+                j = _originalOrSets.erase( j );
+            }
+            else {
+                ++i;
+                ++j;
+            }
+        }
+        _oldOrSets.push_front( _orSets.front() );
+        _orSets.pop_front();
+        _originalOrSets.pop_front();
+    }
+
+    struct SimpleRegexUnitTest : UnitTest {
+        void run() {
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^foo");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "foo" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f?oo");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^fz?oo");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f", "");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "\\Af", "");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "^f", "m");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "\\Af", "m");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "f" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "\\Af", "mi");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "" );
+            }
+            {
+                BSONObjBuilder b;
+                b.appendRegex("r", "\\Af \t\vo\n\ro \\ \\# #comment", "mx");
+                BSONObj o = b.done();
+                assert( simpleRegex(o.firstElement()) == "foo #" );
+            }
+            {
+                assert( simpleRegex("^\\Qasdf\\E", "", NULL) == "asdf" );
+                assert( simpleRegex("^\\Qasdf\\E.*", "", NULL) == "asdf" );
+                assert( simpleRegex("^\\Qasdf", "", NULL) == "asdf" ); // PCRE supports this
+                assert( simpleRegex("^\\Qasdf\\\\E", "", NULL) == "asdf\\" );
+                assert( simpleRegex("^\\Qas.*df\\E", "", NULL) == "as.*df" );
+                assert( simpleRegex("^\\Qas\\Q[df\\E", "", NULL) == "as\\Q[df" );
+                assert( simpleRegex("^\\Qas\\E\\\\E\\Q$df\\E", "", NULL) == "as\\E$df" ); // quoted string containing \E
+            }
+
+        }
+    } simple_regex_unittest;
+
+
+    long long applySkipLimit( long long num , const BSONObj& cmd ) {
+        BSONElement s = cmd["skip"];
+        BSONElement l = cmd["limit"];
+
+        if ( s.isNumber() ) {
+            num = num - s.numberLong();
+            if ( num < 0 ) {
+                num = 0;
+            }
+        }
+
+        if ( l.isNumber() ) {
+            long long limit = l.numberLong();
+            if ( limit < num ) {
+                num = limit;
+            }
+        }
+
+        return num;
+    }
+
+
+} // namespace mongo
diff --git a/src/mongo/db/queryutil.h b/src/mongo/db/queryutil.h
new file mode 100644
index 00000000000..aefef27cc8b
--- /dev/null
+++ b/src/mongo/db/queryutil.h
@@ -0,0 +1,443 @@
+// @file queryutil.h - Utility classes representing ranges of valid BSONElement values for a query.
+
+/*    Copyright 2009 10gen Inc.
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+#pragma once
+
+#include "jsobj.h"
+#include "indexkey.h"
+
+namespace mongo {
+
+    /**
+     * One side of an interval of valid BSONElements, specified by a value and a
+     * boolean indicating whether the interval includes the value.
+     */
+    struct FieldBound {
+        BSONElement _bound;
+        bool _inclusive;
+        bool operator==( const FieldBound &other ) const {
+            return _bound.woCompare( other._bound ) == 0 &&
+                   _inclusive == other._inclusive;
+        }
+        void flipInclusive() { _inclusive = !_inclusive; }
+    };
+
+    /** An interval composed of a lower and an upper FieldBound. */
+    struct FieldInterval {
+        FieldInterval() : _cachedEquality( -1 ) {}
+        FieldInterval( const BSONElement& e ) : _cachedEquality( -1 ) {
+            _lower._bound = _upper._bound = e;
+            _lower._inclusive = _upper._inclusive = true;
+        }
+        FieldBound _lower;
+        FieldBound _upper;
+        /** @return true iff at least one element can be contained in the interval. */
+        bool strictValid() const {
+            int cmp = _lower._bound.woCompare( _upper._bound, false );
+            return ( cmp < 0 || ( cmp == 0 && _lower._inclusive && _upper._inclusive ) );
+        }
+        /** @return true iff the interval is an equality constraint. */
+        bool equality() const;
+        mutable int _cachedEquality;
+
+        string toString() const;
+    };
+
+    /**
+     * An ordered list of FieldIntervals expressing constraints on valid
+     * BSONElement values for a field.
+     */
+    class FieldRange {
+    public:
+        FieldRange( const BSONElement &e , bool singleKey , bool isNot=false , bool optimize=true );
+
+        /** @return Range intersection with 'other'. */
+        const FieldRange &operator&=( const FieldRange &other );
+        /** @return Range union with 'other'. */
+        const FieldRange &operator|=( const FieldRange &other );
+        /** @return Range of elements included in 'this' but not 'other'. */
+        const FieldRange &operator-=( const FieldRange &other );
+        /** @return true iff this range is a subset of 'other'. */
+        bool operator<=( const FieldRange &other ) const;
+
+        /**
+         * If there are any valid values for this range, the extreme values can
+         * be extracted.
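+         * For example (an illustrative note, not from the original source):
+         * for the query { a: { $gt: 5, $lte: 10 } } the range holds the single
+         * interval (5, 10], so min() is 5 with minInclusive() false, and
+         * max() is 10 with maxInclusive() true.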
+ */ + + BSONElement min() const { assert( !empty() ); return _intervals[ 0 ]._lower._bound; } + BSONElement max() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._bound; } + bool minInclusive() const { assert( !empty() ); return _intervals[ 0 ]._lower._inclusive; } + bool maxInclusive() const { assert( !empty() ); return _intervals[ _intervals.size() - 1 ]._upper._inclusive; } + + /** @return true iff this range expresses a single equality interval. */ + bool equality() const; + /** @return true if all the intervals for this range are equalities */ + bool inQuery() const; + /** @return true iff this range does not include every BSONElement */ + bool nontrivial() const; + /** @return true iff this range matches no BSONElements. */ + bool empty() const { return _intervals.empty(); } + + /** Empty the range so it matches no BSONElements. */ + void makeEmpty() { _intervals.clear(); } + const vector<FieldInterval> &intervals() const { return _intervals; } + string getSpecial() const { return _special; } + /** Make component intervals noninclusive. */ + void setExclusiveBounds(); + /** + * Constructs a range where all FieldIntervals and FieldBounds are in + * the opposite order of the current range. + * NOTE the resulting intervals might not be strictValid(). + */ + void reverse( FieldRange &ret ) const; + + string toString() const; + private: + BSONObj addObj( const BSONObj &o ); + void finishOperation( const vector<FieldInterval> &newIntervals, const FieldRange &other ); + vector<FieldInterval> _intervals; + // Owns memory for our BSONElements. + vector<BSONObj> _objData; + string _special; + bool _singleKey; + }; + + /** + * A BoundList contains intervals specified by inclusive start + * and end bounds. The intervals should be nonoverlapping and occur in + * the specified direction of traversal. For example, given a simple index {i:1} + * and direction +1, one valid BoundList is: (1, 2); (4, 6). The same BoundList + * would be valid for index {i:-1} with direction -1. + */ + typedef vector<pair<BSONObj,BSONObj> > BoundList; + + class QueryPattern; + + /** + * A set of FieldRanges determined from constraints on the fields of a query, + * that may be used to determine index bounds. + */ + class FieldRangeSet { + public: + friend class OrRangeGenerator; + friend class FieldRangeVector; + FieldRangeSet( const char *ns, const BSONObj &query , bool singleKey , bool optimize=true ); + + /** @return true if there is a nontrivial range for the given field. */ + bool hasRange( const char *fieldName ) const { + map<string, FieldRange>::const_iterator f = _ranges.find( fieldName ); + return f != _ranges.end(); + } + /** @return range for the given field. */ + const FieldRange &range( const char *fieldName ) const; + /** @return range for the given field. */ + FieldRange &range( const char *fieldName ); + /** @return the number of nontrivial ranges. */ + int nNontrivialRanges() const; + /** @return the field ranges comprising this set. */ + const map<string,FieldRange> &ranges() const { return _ranges; } + /** + * @return true if a match could be possible on every field. Generally this + * is not useful information for a single key FieldRangeSet and + * matchPossibleForIndex() should be used instead. + */ + bool matchPossible() const; + /** + * @return true if a match could be possible given the value of _singleKey + * and index key 'keyPattern'. + * @param keyPattern May be {} or {$natural:1} for a non index scan. 
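+         * For example (an illustrative note, not from the original source):
+         * the ranges for { a: { $gt: 5, $lt: 3 } } are empty in the single key
+         * case, so no match is possible; with a multikey index, two different
+         * array elements of "a" could satisfy the two bounds separately, so a
+         * match may still be possible.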
+ */ + bool matchPossibleForIndex( const BSONObj &keyPattern ) const; + + const char *ns() const { return _ns; } + + /** + * @return a simplified query from the extreme values of the nontrivial + * fields. + * @param fields If specified, the fields of the returned object are + * ordered to match those of 'fields'. + */ + BSONObj simplifiedQuery( const BSONObj &fields = BSONObj() ) const; + + QueryPattern pattern( const BSONObj &sort = BSONObj() ) const; + string getSpecial() const; + + /** + * @return a FieldRangeSet approximation of the documents in 'this' but + * not in 'other'. The approximation will be a superset of the documents + * in 'this' but not 'other'. + */ + const FieldRangeSet &operator-=( const FieldRangeSet &other ); + /** @return intersection of 'this' with 'other'. */ + const FieldRangeSet &operator&=( const FieldRangeSet &other ); + + /** + * @return an ordered list of bounds generated using an index key pattern + * and traversal direction. + * + * NOTE This function is deprecated in the query optimizer and only + * currently used by the sharding code. + */ + BoundList indexBounds( const BSONObj &keyPattern, int direction ) const; + + /** + * @return - A new FieldRangeSet based on this FieldRangeSet, but with only + * a subset of the fields. + * @param fields - Only fields which are represented as field names in this object + * will be included in the returned FieldRangeSet. + */ + FieldRangeSet *subset( const BSONObj &fields ) const; + + bool singleKey() const { return _singleKey; } + + BSONObj originalQuery() const { return _queries[ 0 ]; } + private: + void appendQueries( const FieldRangeSet &other ); + void makeEmpty(); + void processQueryField( const BSONElement &e, bool optimize ); + void processOpElement( const char *fieldName, const BSONElement &f, bool isNot, bool optimize ); + static FieldRange *__singleKeyTrivialRange; + static FieldRange *__multiKeyTrivialRange; + const FieldRange &trivialRange() const; + map<string,FieldRange> _ranges; + const char *_ns; + // Owns memory for FieldRange BSONElements. + vector<BSONObj> _queries; + bool _singleKey; + }; + + class NamespaceDetails; + + /** + * A pair of FieldRangeSets, one representing constraints for single key + * indexes and the other representing constraints for multi key indexes and + * unindexed scans. In several member functions the caller is asked to + * supply an index so that the implementation may utilize the proper + * FieldRangeSet and return results that are appropriate with respect to that + * supplied index. + */ + class FieldRangeSetPair { + public: + FieldRangeSetPair( const char *ns, const BSONObj &query, bool optimize=true ) + :_singleKey( ns, query, true, optimize ), _multiKey( ns, query, false, optimize ) {} + + /** + * @return the appropriate single or multi key FieldRangeSet for the specified index. + * @param idxNo -1 for non index scan. + */ + const FieldRangeSet &frsForIndex( const NamespaceDetails* nsd, int idxNo ) const; + + /** @return a field range in the single key FieldRangeSet. */ + const FieldRange &singleKeyRange( const char *fieldName ) const { + return _singleKey.range( fieldName ); + } + /** @return true if the range limits are equivalent to an empty query. */ + bool noNontrivialRanges() const; + /** @return false if a match is impossible regardless of index. */ + bool matchPossible() const { return _multiKey.matchPossible(); } + /** + * @return false if a match is impossible on the specified index. + * @param idxNo -1 for non index scan. 
+ */ + bool matchPossibleForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const; + + const char *ns() const { return _singleKey.ns(); } + + string getSpecial() const { return _singleKey.getSpecial(); } + + /** Intersect with another FieldRangeSetPair. */ + FieldRangeSetPair &operator&=( const FieldRangeSetPair &other ); + /** + * Subtract a FieldRangeSet, generally one expressing a range that has + * already been scanned. + */ + FieldRangeSetPair &operator-=( const FieldRangeSet &scanned ); + + BoundList singleKeyIndexBounds( const BSONObj &keyPattern, int direction ) const { + return _singleKey.indexBounds( keyPattern, direction ); + } + + BSONObj originalQuery() const { return _singleKey.originalQuery(); } + + private: + FieldRangeSetPair( const FieldRangeSet &singleKey, const FieldRangeSet &multiKey ) + :_singleKey( singleKey ), _multiKey( multiKey ) {} + void assertValidIndex( const NamespaceDetails *d, int idxNo ) const; + void assertValidIndexOrNoIndex( const NamespaceDetails *d, int idxNo ) const; + /** matchPossibleForIndex() must be true. */ + BSONObj simplifiedQueryForIndex( NamespaceDetails *d, int idxNo, const BSONObj &keyPattern ) const; + FieldRangeSet _singleKey; + FieldRangeSet _multiKey; + friend class OrRangeGenerator; + friend struct QueryUtilIndexed; + }; + + class IndexSpec; + + /** + * An ordered list of fields and their FieldRanges, corresponding to valid + * index keys for a given index spec. + */ + class FieldRangeVector { + public: + /** + * @param frs The valid ranges for all fields, as defined by the query spec + * @param indexSpec The index spec (key pattern and info) + * @param direction The direction of index traversal + */ + FieldRangeVector( const FieldRangeSet &frs, const IndexSpec &indexSpec, int direction ); + + /** @return the number of index ranges represented by 'this' */ + long long size(); + /** @return starting point for an index traversal. */ + BSONObj startKey() const; + /** @return end point for an index traversal. */ + BSONObj endKey() const; + /** @return a client readable representation of 'this' */ + BSONObj obj() const; + + const IndexSpec& getSpec(){ return _indexSpec; } + + /** + * @return true iff the provided document matches valid ranges on all + * of this FieldRangeVector's fields, which is the case iff this document + * would be returned while scanning the index corresponding to this + * FieldRangeVector. This function is used for $or clause deduping. + */ + bool matches( const BSONObj &obj ) const; + + /** + * @return first key of 'obj' that would be encountered by a forward + * index scan using this FieldRangeVector, BSONObj() if no such key. + */ + BSONObj firstMatch( const BSONObj &obj ) const; + + private: + int matchingLowElement( const BSONElement &e, int i, bool direction, bool &lowEquality ) const; + bool matchesElement( const BSONElement &e, int i, bool direction ) const; + bool matchesKey( const BSONObj &key ) const; + vector<FieldRange> _ranges; + const IndexSpec _indexSpec; + int _direction; + vector<BSONObj> _queries; // make sure mem owned + friend class FieldRangeVectorIterator; + }; + + /** + * Helper class for iterating through an ordered representation of keys + * to find those keys that match a specified FieldRangeVector. 
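+     *
+     * A sketch of the intended driving loop (illustrative only; 'frv',
+     * 'currentIndexKey', 'nextIndexKey' and 'skipTo' are assumed names, not
+     * code from this file):
+     *
+     *     FieldRangeVectorIterator it( frv );
+     *     it.prepDive();
+     *     while( it.ok() ) {
+     *         int r = it.advance( currentIndexKey );
+     *         if ( r == -2 )
+     *             break;                             // iteration complete
+     *         if ( r == -1 ) {
+     *             currentIndexKey = nextIndexKey();  // plain advance
+     *             continue;
+     *         }
+     *         // r >= 0: keep the first r fields of the current key, take the
+     *         // remaining bounds from cmp()/inc(), and if after() is true
+     *         // position just past the resulting key rather than at it
+     *         currentIndexKey = skipTo( r, it.cmp(), it.inc(), it.after() );
+     *     }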
+ */ + class FieldRangeVectorIterator { + public: + FieldRangeVectorIterator( const FieldRangeVector &v ) : _v( v ), _i( _v._ranges.size(), -1 ), _cmp( _v._ranges.size(), 0 ), _inc( _v._ranges.size(), false ), _after() { + } + static BSONObj minObject() { + BSONObjBuilder b; b.appendMinKey( "" ); + return b.obj(); + } + static BSONObj maxObject() { + BSONObjBuilder b; b.appendMaxKey( "" ); + return b.obj(); + } + /** + * @return Suggested advance method, based on current key. + * -2 Iteration is complete, no need to advance. + * -1 Advance to the next key, without skipping. + * >=0 Skip parameter. If @return is r, skip to the key comprised + * of the first r elements of curr followed by the (r+1)th and + * remaining elements of cmp() (with inclusivity specified by + * the (r+1)th and remaining elements of inc()). If after() is + * true, skip past this key not to it. + */ + int advance( const BSONObj &curr ); + const vector<const BSONElement *> &cmp() const { return _cmp; } + const vector<bool> &inc() const { return _inc; } + bool after() const { return _after; } + void prepDive(); + void setZero( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = 0; } + void setMinus( int i ) { for( int j = i; j < (int)_i.size(); ++j ) _i[ j ] = -1; } + bool ok() { return _i[ 0 ] < (int)_v._ranges[ 0 ].intervals().size(); } + BSONObj startKey(); + // temp + BSONObj endKey(); + private: + const FieldRangeVector &_v; + vector<int> _i; + vector<const BSONElement*> _cmp; + vector<bool> _inc; + bool _after; + }; + + /** + * As we iterate through $or clauses this class generates a FieldRangeSetPair + * for the current $or clause, in some cases by excluding ranges that were + * included in a previous clause. + */ + class OrRangeGenerator { + public: + OrRangeGenerator( const char *ns, const BSONObj &query , bool optimize=true ); + + /** + * @return true iff we are done scanning $or clauses. if there's a + * useless or clause, we won't use or index ranges to help with scanning. + */ + bool orFinished() const { return _orFound && _orSets.empty(); } + /** Iterates to the next $or clause by removing the current $or clause. */ + void popOrClause( NamespaceDetails *nsd, int idxNo, const BSONObj &keyPattern ); + void popOrClauseSingleKey(); + /** @return FieldRangeSetPair for the current $or clause. */ + FieldRangeSetPair *topFrsp() const; + /** + * @return original FieldRangeSetPair for the current $or clause. While the + * original bounds are looser, they are composed of fewer ranges and it + * is faster to do operations with them; when they can be used instead of + * more precise bounds, they should. 
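+         *
+         * A typical consumer loop, sketched (illustrative only; 'g', 'nsd',
+         * 'idxNo' and 'keyPattern' stand in for caller state and are not
+         * names from this file):
+         *
+         *     OrRangeGenerator g( ns, query );
+         *     while( !g.orFinished() ) {
+         *         FieldRangeSetPair *frsp = g.topFrsp();
+         *         // ... scan the current clause using frsp's ranges ...
+         *         g.popOrClause( nsd, idxNo, keyPattern );
+         *     }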
+         */
+        FieldRangeSetPair *topFrspOriginal() const;
+
+        string getSpecial() const { return _baseSet.getSpecial(); }
+
+        bool moreOrClauses() const { return !_orSets.empty(); }
+    private:
+        void assertMayPopOrClause();
+        void popOrClause( const FieldRangeSet *toDiff, NamespaceDetails *d = 0, int idxNo = -1, const BSONObj &keyPattern = BSONObj() );
+        FieldRangeSetPair _baseSet;
+        list<FieldRangeSetPair> _orSets;
+        list<FieldRangeSetPair> _originalOrSets;
+        // ensure memory is owned
+        list<FieldRangeSetPair> _oldOrSets;
+        bool _orFound;
+        friend struct QueryUtilIndexed;
+    };
+
+    /** returns a string that when used as a matcher, would match a super set of regex()
+        returns "" for complex regular expressions
+        used to optimize queries in some simple regex cases that start with '^'
+
+        if purePrefix != NULL, sets it to whether the regex can be converted to a range query
+    */
+    string simpleRegex(const char* regex, const char* flags, bool* purePrefix=NULL);
+
+    /** returns the upper bound of a query that matches prefix */
+    string simpleRegexEnd( string prefix );
+
+    long long applySkipLimit( long long num , const BSONObj& cmd );
+
+} // namespace mongo
+
+#include "queryutil-inl.h"
diff --git a/src/mongo/db/record.cpp b/src/mongo/db/record.cpp
new file mode 100644
index 00000000000..17987002efc
--- /dev/null
+++ b/src/mongo/db/record.cpp
@@ -0,0 +1,267 @@
+// record.cpp
+
+#include "pch.h"
+#include "pdfile.h"
+#include "../util/processinfo.h"
+#include "../util/net/listen.h"
+#include "pagefault.h"
+
+namespace mongo {
+
+    namespace ps {
+
+        enum State {
+            In , Out, Unk
+        };
+
+        enum Constants {
+            SliceSize = 65536 ,
+            MaxChain = 20 , // intentionally very low
+            NumSlices = 10 ,
+            RotateTimeSecs = 90
+        };
+
+        int hash( size_t region ) {
+            return
+                abs( ( ( 7 + (int)(region & 0xFFFF) )
+                       * ( 11 + (int)( ( region >> 16 ) & 0xFFFF ) )
+#if defined(_WIN64) || defined(__amd64__)
+                       * ( 13 + (int)( ( region >> 32 ) & 0xFFFF ) )
+                       * ( 17 + (int)( ( region >> 48 ) & 0xFFFF ) )
+#endif
+                       ) % SliceSize );
+        }
+
+
+        /**
+         * simple hash map for region -> status
+         * this constitutes a single slice of time
+         * it does chaining, but very short chains
+         */
+        class Slice {
+
+            struct Entry {
+                size_t region;
+                unsigned long long value;
+            };
+
+        public:
+
+            Slice() {
+                reset();
+            }
+
+            void reset() {
+                memset( _data , 0 , SliceSize * sizeof(Entry) );
+            }
+
+            State get( int regionHash , size_t region , short offset ) {
+                DEV assert( hash( region ) == regionHash );
+
+                Entry * e = _get( regionHash , region , false );
+                if ( ! e )
+                    return Unk;
+
+                return ( e->value & ( ((unsigned long long)1) << offset ) ) ? In : Out;
+            }
+
+            /**
+             * @return true if added, false if full
+             */
+            bool in( int regionHash , size_t region , short offset ) {
+                DEV assert( hash( region ) == regionHash );
+
+                Entry * e = _get( regionHash , region , true );
+                if ( ! e )
+                    return false;
+
+                e->value |= ((unsigned long long)1) << offset;
+                return true;
+            }
+
+        private:
+
+            Entry* _get( int start , size_t region , bool add ) {
+                for ( int i=0; i<MaxChain; i++ ) {
+
+                    int bucket = ( start + i ) % SliceSize;
+
+                    if ( _data[bucket].region == 0 ) {
+                        if ( ! add )
+                            return 0;
+
+                        _data[bucket].region = region;
+                        return &_data[bucket];
+                    }
+
+                    if ( _data[bucket].region == region ) {
+                        return &_data[bucket];
+                    }
+                }
+                return 0;
+            }
+
+            Entry _data[SliceSize];
+        };
+
+
+        /**
+         * this contains many slices of time
+         * the idea is you put mem status in the current time slice
+         * and then after a certain period of time, it rolls off so we check again
+         */
+        class Rolling {
+
+        public:
+            Rolling()
+                : _lock( "ps::Rolling" ){
+                _curSlice = 0;
+                _lastRotate = Listener::getElapsedTimeMillis();
+            }
+
+
+            /**
+             * after this call, we assume the page is in ram
+             * @param doHalf if this is a known good access, we only want to put it in the first half of the slices
+             * @return whether we know the page is in ram
+             */
+            bool access( size_t region , short offset , bool doHalf ) {
+                int regionHash = hash(region);
+
+                SimpleMutex::scoped_lock lk( _lock );
+
+                static int rarely_count = 0;
+                if ( rarely_count++ % 2048 == 0 ) {
+                    long long now = Listener::getElapsedTimeMillis();
+                    RARELY if ( now == 0 ) {
+                        tlog() << "warning Listener::getElapsedTimeMillis returning 0ms" << endl;
+                    }
+
+                    if ( now - _lastRotate > ( 1000 * RotateTimeSecs ) ) {
+                        _rotate();
+                    }
+                }
+
+                for ( int i=0; i<NumSlices / ( doHalf ? 2 : 1 ); i++ ) {
+                    int pos = (_curSlice+i)%NumSlices;
+                    State s = _slices[pos].get( regionHash , region , offset );
+
+                    if ( s == In )
+                        return true;
+
+                    if ( s == Out ) {
+                        _slices[pos].in( regionHash , region , offset );
+                        return false;
+                    }
+                }
+
+                // we weren't in any slice
+                // so add to cur
+                if ( ! _slices[_curSlice].in( regionHash , region , offset ) ) {
+                    _rotate();
+                    _slices[_curSlice].in( regionHash , region , offset );
+                }
+                return false;
+            }
+
+        private:
+
+            void _rotate() {
+                _curSlice = ( _curSlice + 1 ) % NumSlices;
+                _slices[_curSlice].reset();
+                _lastRotate = Listener::getElapsedTimeMillis();
+            }
+
+            int _curSlice;
+            long long _lastRotate;
+            Slice _slices[NumSlices];
+
+            SimpleMutex _lock;
+        } rolling;
+
+    }
+
+    bool Record::MemoryTrackingEnabled = true;
+
+    volatile int __record_touch_dummy = 1; // this is used to make sure the compiler doesn't get too smart on us
+    void Record::touch( bool entireRecord ) {
+        if ( lengthWithHeaders > HeaderSize ) { // this also makes sure lengthWithHeaders is in memory
+            char * addr = data;
+            char * end = data + netLength();
+            for ( ; addr <= end ; addr += 2048 ) {
+                __record_touch_dummy += addr[0];
+
+                break; // TODO: remove this, pending SERVER-3711
+
+                // note if this is a touch of a deletedrecord, we don't want to touch more than the first part. we may simply
+                // be updating the linked list and a deletedrecord could be gigantic. a similar but less extreme circumstance
+                // exists for any record if we are just updating its header, say on a remove(); some sort of hints might be
+                // useful.
+
+                if ( ! entireRecord )
+                    break;
+            }
+        }
+    }
+
+    const bool blockSupported = ProcessInfo::blockCheckSupported();
+
+    bool Record::likelyInPhysicalMemory() {
+        if ( ! MemoryTrackingEnabled )
+            return true;
+
+        const size_t page = (size_t)data >> 12;
+        const size_t region = page >> 6;
+        const size_t offset = page & 0x3f;
+
+        if ( ps::rolling.access( region , offset , false ) )
+            return true;
+
+        if ( !
blockSupported ) { + // this means we don't fallback to system call + // and assume things aren't in memory + // possible we yield too much - but better than not yielding through a fault + return false; + } + + return ProcessInfo::blockInMemory( data ); + } + + + Record* Record::accessed() { + const size_t page = (size_t)data >> 12; + const size_t region = page >> 6; + const size_t offset = page & 0x3f; + ps::rolling.access( region , offset , true ); + return this; + } + + Record* DiskLoc::rec() const { + Record *r = DataFileMgr::getRecord(*this); +#if defined(_PAGEFAULTEXCEPTION) + DEV ONCE { + log() << "_DEBUG info _PAGEFAULTEXCEPTION is ON -- experimental at this time" << endl; + } + bool fault = !r->likelyInPhysicalMemory(); + DEV if( rand() % 100 == 0 ) + fault = true; + if( fault && + !cc()._hasWrittenThisPass && + cc()._pageFaultRetryableSection ) + { + if( cc()._pageFaultRetryableSection->_laps > 100 ) { + log() << "info pagefaultexception _laps > 100" << endl; + } + else { + throw PageFaultException(r); + } + } +#else + DEV ONCE { + log() << "_DEBUG info _PAGEFAULTEXCEPTION is off" << endl; + } +#endif + return r; + } + +} diff --git a/src/mongo/db/repl.cpp b/src/mongo/db/repl.cpp new file mode 100644 index 00000000000..25ecb6b455f --- /dev/null +++ b/src/mongo/db/repl.cpp @@ -0,0 +1,1516 @@ +// repl.cpp + +/* TODO + PAIRING + _ on a syncexception, don't allow going back to master state? +*/ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* Collections we use: + + local.sources - indicates what sources we pull from as a "slave", and the last update of each + local.oplog.$main - our op log as "master" + local.dbinfo.<dbname> - no longer used??? + local.pair.startup - [deprecated] can contain a special value indicating for a pair that we have the master copy. + used when replacing other half of the pair which has permanently failed. + local.pair.sync - [deprecated] { initialsynccomplete: 1 } +*/ + +#include "pch.h" +#include "jsobj.h" +#include "../util/goodies.h" +#include "repl.h" +#include "../util/net/message.h" +#include "../util/background.h" +#include "../client/dbclient.h" +#include "../client/connpool.h" +#include "pdfile.h" +#include "ops/query.h" +#include "db.h" +#include "commands.h" +#include "security.h" +#include "cmdline.h" +#include "repl_block.h" +#include "repl/rs.h" +#include "replutil.h" +#include "repl/connections.h" +#include "ops/update.h" + +namespace mongo { + + // our config from command line etc. + ReplSettings replSettings; + + /* if 1 sync() is running */ + volatile int syncing = 0; + static volatile int relinquishSyncingSome = 0; + + /* "dead" means something really bad happened like replication falling completely out of sync. 
+       when non-null, we are dead and the string is informational
+    */
+    const char *replAllDead = 0;
+
+    time_t lastForcedResync = 0;
+
+} // namespace mongo
+
+namespace mongo {
+
+    /* output by the web console */
+    const char *replInfo = "";
+    struct ReplInfo {
+        ReplInfo(const char *msg) {
+            replInfo = msg;
+        }
+        ~ReplInfo() {
+            replInfo = "?";
+        }
+    };
+
+    /* operator requested resynchronization of replication (on the slave). { resync : 1 } */
+    class CmdResync : public Command {
+    public:
+        virtual bool slaveOk() const {
+            return true;
+        }
+        virtual bool adminOnly() const {
+            return true;
+        }
+        virtual bool logTheOp() { return false; }
+        virtual LockType locktype() const { return WRITE; }
+        void help(stringstream& h) const { h << "resync (from scratch) an out of date replica slave.\nhttp://www.mongodb.org/display/DOCS/Master+Slave"; }
+        CmdResync() : Command("resync") { }
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+            if( cmdLine.usingReplSets() ) {
+                errmsg = "resync command not currently supported with replica sets. See RS102 info in the MongoDB documentation";
+                result.append("info", "http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member");
+                return false;
+            }
+
+            if ( cmdObj.getBoolField( "force" ) ) {
+                if ( !waitForSyncToFinish( errmsg ) )
+                    return false;
+                replAllDead = "resync forced";
+            }
+            if ( !replAllDead ) {
+                errmsg = "not dead, no need to resync";
+                return false;
+            }
+            if ( !waitForSyncToFinish( errmsg ) )
+                return false;
+
+            ReplSource::forceResyncDead( "client" );
+            result.append( "info", "triggered resync for all sources" );
+            return true;
+        }
+        bool waitForSyncToFinish( string &errmsg ) const {
+            // Wait for slave thread to finish syncing, so sources will be
+            // reloaded with new saved state on next pass.
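+            // The loop below polls the global 'syncing' flag for up to 30
+            // seconds, setting relinquishSyncingSome and briefly releasing the
+            // db lock (via dbtemprelease) on each pass so the slave thread has
+            // a chance to finish and clear the flag.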
+ Timer t; + while ( 1 ) { + if ( syncing == 0 || t.millis() > 30000 ) + break; + { + dbtemprelease t; + relinquishSyncingSome = 1; + sleepmillis(1); + } + } + if ( syncing ) { + errmsg = "timeout waiting for sync() to finish"; + return false; + } + return true; + } + } cmdResync; + + bool anyReplEnabled() { + return replSettings.slave || replSettings.master || theReplSet; + } + + bool replAuthenticate(DBClientBase *conn); + + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level ) { + + if ( replSet ) { + if( theReplSet == 0 ) { + result.append("ismaster", false); + result.append("secondary", false); + result.append("info", ReplSet::startupStatusMsg.get()); + result.append( "isreplicaset" , true ); + return; + } + + theReplSet->fillIsMaster(result); + return; + } + + if ( replAllDead ) { + result.append("ismaster", 0); + string s = string("dead: ") + replAllDead; + result.append("info", s); + } + else { + result.appendBool("ismaster", _isMaster() ); + } + + if ( level && replSet ) { + result.append( "info" , "is replica set" ); + } + else if ( level ) { + BSONObjBuilder sources( result.subarrayStart( "sources" ) ); + + readlock lk( "local.sources" ); + Client::Context ctx( "local.sources", dbpath, authed ); + shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); + int n = 0; + while ( c->ok() ) { + BSONObj s = c->current(); + + BSONObjBuilder bb; + bb.append( s["host"] ); + string sourcename = s["source"].valuestr(); + if ( sourcename != "main" ) + bb.append( s["source"] ); + + { + BSONElement e = s["syncedTo"]; + BSONObjBuilder t( bb.subobjStart( "syncedTo" ) ); + t.appendDate( "time" , e.timestampTime() ); + t.append( "inc" , e.timestampInc() ); + t.done(); + } + + if ( level > 1 ) { + dbtemprelease unlock; + // note: there is no so-style timeout on this connection; perhaps we should have one. + ScopedDbConnection conn( s["host"].valuestr() ); + DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() ); + if ( cliConn && replAuthenticate( cliConn ) ) { + BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << 1 ) ) ); + BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename , Query().sort( BSON( "$natural" << -1 ) ) ); + bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); + bb.appendDate( "masterLast" , last["ts"].timestampTime() ); + double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); + bb.append( "lagSeconds" , lag / 1000 ); + } + conn.done(); + } + + sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() ); + c->advance(); + } + + sources.done(); + } + } + + class CmdIsMaster : public Command { + public: + virtual bool requiresAuth() { return false; } + virtual bool slaveOk() const { + return true; + } + virtual void help( stringstream &help ) const { + help << "Check if this server is primary for a replica pair/set; also if it is --master or --slave in simple master/slave setups.\n"; + help << "{ isMaster : 1 }"; + } + virtual LockType locktype() const { return NONE; } + CmdIsMaster() : Command("isMaster", true, "ismaster") { } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { + /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not + authenticated. + we allow unauthenticated ismaster but we aren't as verbose informationally if + one is not authenticated for admin db to be safe. 
+ */ + bool authed = cc().getAuthenticationInfo()->isAuthorizedReads("admin"); + appendReplicationInfo( result , authed ); + + result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); + return true; + } + } cmdismaster; + + ReplSource::ReplSource() { + nClonedThisPass = 0; + } + + ReplSource::ReplSource(BSONObj o) : nClonedThisPass(0) { + only = o.getStringField("only"); + hostName = o.getStringField("host"); + _sourceName = o.getStringField("source"); + uassert( 10118 , "'host' field not set in sources collection object", !hostName.empty() ); + uassert( 10119 , "only source='main' allowed for now with replication", sourceName() == "main" ); + BSONElement e = o.getField("syncedTo"); + if ( !e.eoo() ) { + uassert( 10120 , "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp ); + OpTime tmp( e.date() ); + syncedTo = tmp; + } + + BSONObj dbsObj = o.getObjectField("dbsNextPass"); + if ( !dbsObj.isEmpty() ) { + BSONObjIterator i(dbsObj); + while ( 1 ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + addDbNextPass.insert( e.fieldName() ); + } + } + + dbsObj = o.getObjectField("incompleteCloneDbs"); + if ( !dbsObj.isEmpty() ) { + BSONObjIterator i(dbsObj); + while ( 1 ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + incompleteCloneDbs.insert( e.fieldName() ); + } + } + } + + /* Turn our C++ Source object into a BSONObj */ + BSONObj ReplSource::jsobj() { + BSONObjBuilder b; + b.append("host", hostName); + b.append("source", sourceName()); + if ( !only.empty() ) + b.append("only", only); + if ( !syncedTo.isNull() ) + b.appendTimestamp("syncedTo", syncedTo.asDate()); + + BSONObjBuilder dbsNextPassBuilder; + int n = 0; + for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) { + n++; + dbsNextPassBuilder.appendBool(*i, 1); + } + if ( n ) + b.append("dbsNextPass", dbsNextPassBuilder.done()); + + BSONObjBuilder incompleteCloneDbsBuilder; + n = 0; + for ( set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++ ) { + n++; + incompleteCloneDbsBuilder.appendBool(*i, 1); + } + if ( n ) + b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done()); + + return b.obj(); + } + + void ReplSource::save() { + BSONObjBuilder b; + assert( !hostName.empty() ); + b.append("host", hostName); + // todo: finish allowing multiple source configs. + // this line doesn't work right when source is null, if that is allowed as it is now: + //b.append("source", _sourceName); + BSONObj pattern = b.done(); + + BSONObj o = jsobj(); + log( 1 ) << "Saving repl source: " << o << endl; + + { + OpDebug debug; + Client::Context ctx("local.sources"); + UpdateResult res = updateObjects("local.sources", o, pattern, true/*upsert for pair feature*/, false,false,debug); + assert( ! res.mod ); + assert( res.num == 1 ); + } + } + + static void addSourceToList(ReplSource::SourceVector &v, ReplSource& s, ReplSource::SourceVector &old) { + if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync. + for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) { + if ( s == **i ) { + v.push_back(*i); + old.erase(i); + return; + } + i++; + } + } + + v.push_back( shared_ptr< ReplSource >( new ReplSource( s ) ) ); + } + + /* we reuse our existing objects so that we can keep our existing connection + and cursor in effect. 
+ */ + void ReplSource::loadAll(SourceVector &v) { + Client::Context ctx("local.sources"); + SourceVector old = v; + v.clear(); + + if ( !cmdLine.source.empty() ) { + // --source <host> specified. + // check that no items are in sources other than that + // add if missing + shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); + int n = 0; + while ( c->ok() ) { + n++; + ReplSource tmp(c->current()); + if ( tmp.hostName != cmdLine.source ) { + log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl; + log() << "repl: for instructions on changing this slave's source, see:" << endl; + log() << "http://dochub.mongodb.org/core/masterslave" << endl; + log() << "repl: terminating mongod after 30 seconds" << endl; + sleepsecs(30); + dbexit( EXIT_REPLICATION_ERROR ); + } + if ( tmp.only != cmdLine.only ) { + log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl; + log() << "terminating after 30 seconds" << endl; + sleepsecs(30); + dbexit( EXIT_REPLICATION_ERROR ); + } + c->advance(); + } + uassert( 10002 , "local.sources collection corrupt?", n<2 ); + if ( n == 0 ) { + // source missing. add. + ReplSource s; + s.hostName = cmdLine.source; + s.only = cmdLine.only; + s.save(); + } + } + else { + try { + massert( 10384 , "--only requires use of --source", cmdLine.only.empty()); + } + catch ( ... ) { + dbexit( EXIT_BADOPTIONS ); + } + } + + shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj()); + while ( c->ok() ) { + ReplSource tmp(c->current()); + if ( tmp.syncedTo.isNull() ) { + DBDirectClient c; + if ( c.exists( "local.oplog.$main" ) ) { + BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) ); + if ( !op.isEmpty() ) { + tmp.syncedTo = op[ "ts" ].date(); + } + } + } + addSourceToList(v, tmp, old); + c->advance(); + } + } + + BSONObj opTimeQuery = fromjson("{\"getoptime\":1}"); + + bool ReplSource::throttledForceResyncDead( const char *requester ) { + if ( time( 0 ) - lastForcedResync > 600 ) { + forceResyncDead( requester ); + lastForcedResync = time( 0 ); + return true; + } + return false; + } + + void ReplSource::forceResyncDead( const char *requester ) { + if ( !replAllDead ) + return; + SourceVector sources; + ReplSource::loadAll(sources); + for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) { + log() << requester << " forcing resync from " << (*i)->hostName << endl; + (*i)->forceResync( requester ); + } + replAllDead = 0; + } + + void ReplSource::forceResync( const char *requester ) { + BSONObj info; + { + dbtemprelease t; + if (!oplogReader.connect(hostName)) { + msgassertedNoTrace( 14051 , "unable to connect to resync"); + } + /* todo use getDatabaseNames() method here */ + bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); + massert( 10385 , "Unable to get database list", ok ); + } + BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + string name = e.embeddedObject().getField( "name" ).valuestr(); + if ( !e.embeddedObject().getBoolField( "empty" ) ) { + if ( name != "local" ) { + if ( only.empty() || only == name ) { + resyncDrop( name.c_str(), requester ); + } + } + } + } + syncedTo = OpTime(); + addDbNextPass.clear(); + save(); + } + + string ReplSource::resyncDrop( const char *db, const char *requester ) { + log() << "resync: dropping database " << 
db << endl;
+        Client::Context ctx(db);
+        dropDatabase(db);
+        return db;
+    }
+
+    /* grab initial copy of a database from the master */
+    void ReplSource::resync(string db) {
+        string dummyNs = resyncDrop( db.c_str(), "internal" );
+        Client::Context ctx( dummyNs );
+        {
+            log() << "resync: cloning database " << db << " to get an initial copy" << endl;
+            ReplInfo r("resync: cloning a database");
+            string errmsg;
+            int errCode = 0;
+            bool ok = cloneFrom(hostName.c_str(), errmsg, cc().database()->name, false, /*slaveOk*/ true, /*replauth*/ true, /*snapshot*/false, /*mayYield*/true, /*mayBeInterrupted*/false, &errCode);
+            if ( !ok ) {
+                if ( errCode == DatabaseDifferCaseCode ) {
+                    resyncDrop( db.c_str(), "internal" );
+                    log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." << endl;
+                    return;
+                }
+                else {
+                    problem() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl;
+                    throw SyncException();
+                }
+            }
+        }
+
+        log() << "resync: done with initial clone for db: " << db << endl;
+
+        return;
+    }
+
+    DatabaseIgnorer ___databaseIgnorer;
+
+    void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) {
+        if ( futureOplogTime > _ignores[ db ] ) {
+            _ignores[ db ] = futureOplogTime;
+        }
+    }
+
+    bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime &currentOplogTime ) {
+        if ( _ignores[ db ].isNull() ) {
+            return false;
+        }
+        if ( _ignores[ db ] >= currentOplogTime ) {
+            return true;
+        } else {
+            // The ignore state has expired, so clear it.
+            _ignores.erase( db );
+            return false;
+        }
+    }
+
+    bool ReplSource::handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db ) {
+        if ( dbHolder()._isLoaded( ns, dbpath ) ) {
+            // Database is already present.
+            return true;
+        }
+        BSONElement ts = op.getField( "ts" );
+        if ( ( ts.type() == Date || ts.type() == Timestamp ) && ___databaseIgnorer.ignoreAt( db, ts.date() ) ) {
+            // Database is ignored due to a previous indication that it is
+            // missing from master after optime "ts".
+            return false;
+        }
+        if ( Database::duplicateUncasedName( false, db, dbpath ).empty() ) {
+            // No duplicate database names are present.
+            return true;
+        }
+
+        OpTime lastTime;
+        bool dbOk = false;
+        {
+            dbtemprelease release;
+
+            // We always log an operation after executing it (never before), so
+            // a database list will always be valid as of an oplog entry generated
+            // before it was retrieved.
+
+            BSONObj last = oplogReader.findOne( this->ns().c_str(), Query().sort( BSON( "$natural" << -1 ) ) );
+            if ( !last.isEmpty() ) {
+                BSONElement ts = last.getField( "ts" );
+                massert( 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp );
+                lastTime = OpTime( ts.date() );
+            }
+
+            BSONObj info;
+            bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info );
+            massert( 14033, "Unable to get database list", ok );
+            BSONObjIterator i( info.getField( "databases" ).embeddedObject() );
+            while( i.more() ) {
+                BSONElement e = i.next();
+
+                const char * name = e.embeddedObject().getField( "name" ).valuestr();
+                if ( strcasecmp( name, db ) != 0 )
+                    continue;
+
+                if ( strcmp( name, db ) == 0 ) {
+                    // The db exists on master, still need to check that no conflicts exist there.
+                    dbOk = true;
+                    continue;
+                }
+
+                // The master has a db name that conflicts with the requested name.
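+                // (e.g. the requested name is "foo" while the master has "Foo":
+                // strcasecmp matched above but strcmp did not)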
+                dbOk = false;
+                break;
+            }
+        }
+
+        if ( !dbOk ) {
+            ___databaseIgnorer.doIgnoreUntilAfter( db, lastTime );
+            incompleteCloneDbs.erase(db);
+            addDbNextPass.erase(db);
+            return false;
+        }
+
+        // Check for duplicates again, since we released the lock above.
+        set< string > duplicates;
+        Database::duplicateUncasedName( false, db, dbpath, &duplicates );
+
+        // The database is present on the master and no conflicting databases
+        // are present on the master. Drop any local conflicts.
+        for( set< string >::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i ) {
+            ___databaseIgnorer.doIgnoreUntilAfter( *i, lastTime );
+            incompleteCloneDbs.erase(*i);
+            addDbNextPass.erase(*i);
+            Client::Context ctx(*i);
+            dropDatabase(*i);
+        }
+
+        massert( 14034, "Duplicate database names present after attempting to delete duplicates",
+                 Database::duplicateUncasedName( false, db, dbpath ).empty() );
+        return true;
+    }
+
+    void ReplSource::applyOperation(const BSONObj& op) {
+        try {
+            bool failedUpdate = applyOperation_inlock( op );
+            if (failedUpdate) {
+                Sync sync(hostName);
+                if (sync.shouldRetry(op)) {
+                    uassert(15914, "Failure retrying initial sync update", !applyOperation_inlock(op));
+                }
+            }
+        }
+        catch ( UserException& e ) {
+            log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;
+        }
+        catch ( DBException& e ) {
+            log() << "sync: caught db exception " << e << " while applying op: " << op << endl;
+        }
+
+    }
+
+    /* local.oplog.$main is of the form:
+         { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> }
+         ...
+       see logOp() comments.
+
+       @param alreadyLocked caller already put us in write lock if true
+    */
+    void ReplSource::sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked) {
+        if( logLevel >= 6 ) // op.toString() is expensive so doing this check explicitly
+            log(6) << "processing op: " << op << endl;
+
+        if( op.getStringField("op")[0] == 'n' )
+            return;
+
+        char clientName[MaxDatabaseNameLen];
+        const char *ns = op.getStringField("ns");
+        nsToDatabase(ns, clientName);
+
+        if ( *ns == '.' ) {
+            problem() << "skipping bad op in oplog: " << op.toString() << endl;
+            return;
+        }
+        else if ( *ns == 0 ) {
+            /*if( op.getStringField("op")[0] != 'n' )*/ {
+                problem() << "halting replication, bad op in oplog:\n  " << op.toString() << endl;
+                replAllDead = "bad object in oplog";
+                throw SyncException();
+            }
+            //ns = "local.system.x";
+            //nsToDatabase(ns, clientName);
+        }
+
+        if ( !only.empty() && only != clientName )
+            return;
+
+        if( cmdLine.pretouch && !alreadyLocked/*doesn't make sense if in write lock already*/ ) {
+            if( cmdLine.pretouch > 1 ) {
+                /* note: this is bad - should be put in ReplSource. but this is first test... */
+                static int countdown;
+                assert( countdown >= 0 );
+                if( countdown > 0 ) {
+                    countdown--; // was pretouched on a prev pass
+                }
+                else {
+                    const int m = 4;
+                    if( tp.get() == 0 ) {
+                        int nthr = min(8, cmdLine.pretouch);
+                        nthr = max(nthr, 1);
+                        tp.reset( new ThreadPool(nthr) );
+                    }
+                    vector<BSONObj> v;
+                    oplogReader.peek(v, cmdLine.pretouch);
+                    unsigned a = 0;
+                    while( 1 ) {
+                        if( a >= v.size() ) break;
+                        unsigned b = a + m - 1; // v[a..b]
+                        if( b >= v.size() ) b = v.size() - 1;
+                        tp->schedule(pretouchN, v, a, b);
+                        DEV cout << "pretouch task: " << a << ".." << b << endl;
+                        a += m;
+                    }
+                    // we do one too...
+                    pretouchOperation(op);
+                    tp->join();
+                    countdown = v.size();
+                }
+            }
+            else {
+                pretouchOperation(op);
+            }
+        }
+
+        scoped_ptr<writelock> lk( alreadyLocked ? 0 : new writelock() );
+
+        if ( replAllDead ) {
+            // hmmm why is this check here and not at top of this function? does it get set between top and here?
+            log() << "replAllDead, throwing SyncException: " << replAllDead << endl;
+            throw SyncException();
+        }
+
+        if ( !handleDuplicateDbName( op, ns, clientName ) ) {
+            return;
+        }
+
+        Client::Context ctx( ns );
+        ctx.getClient()->curop()->reset();
+
+        bool empty = ctx.db()->isEmpty();
+        bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0;
+
+        if( logLevel >= 6 )
+            log(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl;
+
+        // always apply admin commands
+        // this is a bit hacky -- the semantics of replication/commands aren't well specified
+        if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) {
+            applyOperation( op );
+            return;
+        }
+
+        if ( ctx.justCreated() || empty || incompleteClone ) {
+            // we must add to incomplete list now that setClient has been called
+            incompleteCloneDbs.insert( clientName );
+            if ( nClonedThisPass ) {
+                /* we only clone one database per pass, even if a lot needs to be done. This helps us
+                   avoid overflowing the master's transaction log by doing too much work before going
+                   back to read more transactions. (Imagine a scenario of slave startup where we try to
+                   clone 100 databases in one pass.)
+                */
+                addDbNextPass.insert( clientName );
+            }
+            else {
+                if ( incompleteClone ) {
+                    log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." << endl;
+                }
+                save();
+                Client::Context ctx(ns);
+                nClonedThisPass++;
+                resync(ctx.db()->name);
+                addDbNextPass.erase(clientName);
+                incompleteCloneDbs.erase( clientName );
+            }
+            save();
+        }
+        else {
+            applyOperation( op );
+            addDbNextPass.erase( clientName );
+        }
+    }
+
+    void ReplSource::syncToTailOfRemoteLog() {
+        string _ns = ns();
+        BSONObjBuilder b;
+        if ( !only.empty() ) {
+            b.appendRegex("ns", string("^") + only);
+        }
+        BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) );
+        if ( !last.isEmpty() ) {
+            BSONElement ts = last.getField( "ts" );
+            massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp );
+            syncedTo = OpTime( ts.date() );
+        }
+    }
+
+    extern unsigned replApplyBatchSize;
+
+    /* slave: pull some data from the master's oplog
+       note: not yet in db mutex at this point.
+       @return -1 error
+               0 ok, don't sleep
+               1 ok, sleep
+    */
+    int ReplSource::sync_pullOpLog(int& nApplied) {
+        int okResultCode = 1;
+        string ns = string("local.oplog.$") + sourceName();
+        log(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n';
+
+        bool tailing = true;
+        oplogReader.tailCheck();
+
+        bool initial = syncedTo.isNull();
+
+        if ( !oplogReader.haveCursor() || initial ) {
+            if ( initial ) {
+                // Important to grab last oplog timestamp before listing databases.
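+                // (A database created on the master after this timestamp may
+                // be missing from the listing below, but its operations will
+                // appear in the oplog after syncedTo, so it is still picked up
+                // later.)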
+ syncToTailOfRemoteLog(); + BSONObj info; + bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); + massert( 10389 , "Unable to get database list", ok ); + BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); + while( i.moreWithEOO() ) { + BSONElement e = i.next(); + if ( e.eoo() ) + break; + string name = e.embeddedObject().getField( "name" ).valuestr(); + if ( !e.embeddedObject().getBoolField( "empty" ) ) { + if ( name != "local" ) { + if ( only.empty() || only == name ) { + log( 2 ) << "adding to 'addDbNextPass': " << name << endl; + addDbNextPass.insert( name ); + } + } + } + } + dblock lk; + save(); + } + + BSONObjBuilder q; + q.appendDate("$gte", syncedTo.asDate()); + BSONObjBuilder query; + query.append("ts", q.done()); + if ( !only.empty() ) { + // note we may here skip a LOT of data table scanning, a lot of work for the master. + query.appendRegex("ns", string("^") + only); // maybe append "\\." here? + } + BSONObj queryObj = query.done(); + // e.g. queryObj = { ts: { $gte: syncedTo } } + + oplogReader.tailingQuery(ns.c_str(), queryObj); + tailing = false; + } + else { + log(2) << "repl: tailing=true\n"; + } + + if( !oplogReader.haveCursor() ) { + problem() << "repl: dbclient::query returns null (conn closed?)" << endl; + oplogReader.resetConnection(); + return -1; + } + + // show any deferred database creates from a previous pass + { + set<string>::iterator i = addDbNextPass.begin(); + if ( i != addDbNextPass.end() ) { + BSONObjBuilder b; + b.append("ns", *i + '.'); + b.append("op", "db"); + BSONObj op = b.done(); + sync_pullOpLog_applyOperation(op, false); + } + } + + if ( !oplogReader.more() ) { + if ( tailing ) { + log(2) << "repl: tailing & no new activity\n"; + if( oplogReader.awaitCapable() ) + okResultCode = 0; // don't sleep + + } + else { + log() << "repl: " << ns << " oplog is empty\n"; + } + { + dblock lk; + save(); + } + return okResultCode; + } + + OpTime nextOpTime; + { + BSONObj op = oplogReader.next(); + BSONElement ts = op.getField("ts"); + if ( ts.type() != Date && ts.type() != Timestamp ) { + string err = op.getStringField("$err"); + if ( !err.empty() ) { + // 13051 is "tailable cursor requested on non capped collection" + if (op.getIntField("code") == 13051) { + problem() << "trying to slave off of a non-master" << '\n'; + massert( 13344 , "trying to slave off of a non-master", false ); + } + else { + problem() << "repl: $err reading remote oplog: " + err << '\n'; + massert( 10390 , "got $err reading remote oplog", false ); + } + } + else { + problem() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; + massert( 10391 , "repl: bad object read from remote oplog", false); + } + } + + nextOpTime = OpTime( ts.date() ); + log(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; + if ( initial ) { + log(1) << "repl: initial run\n"; + } + if( tailing ) { + if( !( syncedTo < nextOpTime ) ) { + log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; + log() << "repl syncTo: " << syncedTo.toStringLong() << endl; + log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; + assert(false); + } + oplogReader.putBack( op ); // op will be processed in the loop below + nextOpTime = OpTime(); // will reread the op below + } + else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error + Nullstream& l = log(); + l << "repl: nextOpTime " << nextOpTime.toStringLong() << ' '; + if ( nextOpTime < syncedTo ) + l << "<??"; + else + l << ">"; + + l << " 
syncedTo " << syncedTo.toStringLong() << '\n'; + log() << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n"; + log() << "repl: tailing: " << tailing << '\n'; + log() << "repl: data too stale, halting replication" << endl; + replInfo = replAllDead = "data too stale halted replication"; + assert( syncedTo < nextOpTime ); + throw SyncException(); + } + else { + /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. */ + } + } + + // apply operations + { + int n = 0; + time_t saveLast = time(0); + while ( 1 ) { + + bool moreInitialSyncsPending = !addDbNextPass.empty() && n; // we need "&& n" to assure we actually process at least one op to get a sync point recorded in the first place. + + if ( moreInitialSyncsPending || !oplogReader.more() ) { + dblock lk; + + // NOTE aaron 2011-03-29 This block may be unnecessary, but I'm leaving it in place to avoid changing timing behavior. + { + dbtemprelease t; + if ( !moreInitialSyncsPending && oplogReader.more() ) { + continue; + } + // otherwise, break out of loop so we can set to completed or clone more dbs + } + + if( oplogReader.awaitCapable() && tailing ) + okResultCode = 0; // don't sleep + syncedTo = nextOpTime; + save(); // note how far we are synced up to now + log() << "repl: applied " << n << " operations" << endl; + nApplied = n; + log() << "repl: end sync_pullOpLog syncedTo: " << syncedTo.toStringLong() << endl; + break; + } + else { + } + + OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { + // periodically note our progress, in case we are doing a lot of work and crash + dblock lk; + syncedTo = nextOpTime; + // can't update local log ts since there are pending operations from our peer + save(); + log() << "repl: checkpoint applied " << n << " operations" << endl; + log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; + saveLast = time(0); + n = 0; + } + + BSONObj op = oplogReader.next(); + + unsigned b = replApplyBatchSize; + bool justOne = b == 1; + scoped_ptr<writelock> lk( justOne ? 
0 : new writelock() ); + while( 1 ) { + + BSONElement ts = op.getField("ts"); + if( !( ts.type() == Date || ts.type() == Timestamp ) ) { + log() << "sync error: problem querying remote oplog record" << endl; + log() << "op: " << op.toString() << endl; + log() << "halting replication" << endl; + replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; + throw SyncException(); + } + OpTime last = nextOpTime; + nextOpTime = OpTime( ts.date() ); + if ( !( last < nextOpTime ) ) { + log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; + log() << " last: " << last.toStringLong() << endl; + log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; + log() << " halting replication" << endl; + replInfo = replAllDead = "sync error last >= nextOpTime"; + uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); + } + if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { + assert( justOne ); + oplogReader.putBack( op ); + _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; + dblock lk; + if ( n > 0 ) { + syncedTo = last; + save(); + } + log() << "repl: applied " << n << " operations" << endl; + log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; + log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; + return okResultCode; + } + + sync_pullOpLog_applyOperation(op, !justOne); + n++; + + if( --b == 0 ) + break; + // if we get to here, we are doing multiple applications in a single write lock acquisition + if( !oplogReader.moreInCurrentBatch() ) { + // break if no more in batch so we release lock while reading from the master + break; + } + op = oplogReader.next(); + + getDur().commitIfNeeded(); + } + } + } + + return okResultCode; + } + + BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); + + bool replAuthenticate(DBClientBase *conn) { + if( noauth ) { + return true; + } + if( ! cc().isAdmin() ) { + log() << "replauthenticate: requires admin permissions, failing\n"; + return false; + } + + string u; + string p; + if (internalSecurity.pwd.length() > 0) { + u = internalSecurity.user; + p = internalSecurity.pwd; + } + else { + BSONObj user; + { + dblock lk; + Client::Context ctxt("local."); + if( !Helpers::findOne("local.system.users", userReplQuery, user) || + // try the first user in local + !Helpers::getSingleton("local.system.users", user) ) { + log() << "replauthenticate: no user in local.system.users to use for authentication\n"; + return false; + } + } + u = user.getStringField("user"); + p = user.getStringField("pwd"); + massert( 10392 , "bad user object? [1]", !u.empty()); + massert( 10393 , "bad user object? [2]", !p.empty()); + } + + string err; + if( !conn->auth("local", u.c_str(), p.c_str(), err, false) ) { + log() << "replauthenticate: can't authenticate to master server, user:" << u << endl; + return false; + } + return true; + } + + bool replHandshake(DBClientConnection *conn) { + + string myname = getHostName(); + + BSONObj me; + { + + dblock l; + // local.me is an identifier for a server for getLastError w:2+ + if ( ! Helpers::getSingleton( "local.me" , me ) || + ! 
me.hasField("host") || + me["host"].String() != myname ) { + + // clean out local.me + Helpers::emptyCollection("local.me"); + + // repopulate + BSONObjBuilder b; + b.appendOID( "_id" , 0 , true ); + b.append( "host", myname ); + me = b.obj(); + Helpers::putSingleton( "local.me" , me ); + } + } + + BSONObjBuilder cmd; + cmd.appendAs( me["_id"] , "handshake" ); + if (theReplSet) { + cmd.append("member", theReplSet->selfId()); + } + + BSONObj res; + bool ok = conn->runCommand( "admin" , cmd.obj() , res ); + // ignoring for now on purpose for older versions + log(ok) << "replHandshake res not: " << ok << " res: " << res << endl; + return true; + } + + bool OplogReader::commonConnect(const string& hostName) { + if( conn() == 0 ) { + _conn = shared_ptr<DBClientConnection>(new DBClientConnection( false, 0, 0 /* tcp timeout */)); + string errmsg; + ReplInfo r("trying to connect to sync source"); + if ( !_conn->connect(hostName.c_str(), errmsg) || + (!noauth && !replAuthenticate(_conn.get())) ) { + resetConnection(); + log() << "repl: " << errmsg << endl; + return false; + } + } + return true; + } + + bool OplogReader::connect(string hostName) { + if (conn() != 0) { + return true; + } + + if (commonConnect(hostName)) { + return replHandshake(_conn.get()); + } + return false; + } + + bool OplogReader::connect(const BSONObj& rid, const int from, const string& to) { + if (conn() != 0) { + return true; + } + if (commonConnect(to)) { + log() << "handshake between " << from << " and " << to << endl; + return passthroughHandshake(rid, from); + } + return false; + } + + bool OplogReader::passthroughHandshake(const BSONObj& rid, const int f) { + BSONObjBuilder cmd; + cmd.appendAs( rid["_id"], "handshake" ); + cmd.append( "member" , f ); + + BSONObj res; + return conn()->runCommand( "admin" , cmd.obj() , res ); + } + + /* note: not yet in mutex at this point. + returns >= 0 if ok. return -1 if you want to reconnect. + return value of zero indicates no sleep necessary before next call + */ + int ReplSource::sync(int& nApplied) { + _sleepAdviceTime = 0; + ReplInfo r("sync"); + if ( !cmdLine.quiet ) { + Nullstream& l = log(); + l << "repl: syncing from "; + if( sourceName() != "main" ) { + l << "source:" << sourceName() << ' '; + } + l << "host:" << hostName << endl; + } + nClonedThisPass = 0; + + // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName. + if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) { + log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl; + sleepsecs(5); + return -1; + } + + if ( !oplogReader.connect(hostName) ) { + log(4) << "repl: can't connect to sync source" << endl; + return -1; + } + + /* + // get current mtime at the server. 
+ BSONObj o = conn->findOne("admin.$cmd", opTimeQuery); + BSONElement e = o.getField("optime"); + if( e.eoo() ) { + log() << "repl: failed to get cur optime from master" << endl; + log() << " " << o.toString() << endl; + return false; + } + uassert( 10124 , e.type() == Date ); + OpTime serverCurTime; + serverCurTime.asDate() = e.date(); + */ + return sync_pullOpLog(nApplied); + } + + /* --------------------------------------------------------------*/ + + /* + TODO: + _ source has autoptr to the cursor + _ reuse that cursor when we can + */ + + /* returns: # of seconds to sleep before next pass + 0 = no sleep recommended + 1 = special sentinel indicating adaptive sleep recommended + */ + int _replMain(ReplSource::SourceVector& sources, int& nApplied) { + { + ReplInfo r("replMain load sources"); + dblock lk; + ReplSource::loadAll(sources); + replSettings.fastsync = false; // only need this param for initial reset + } + + if ( sources.empty() ) { + /* replication is not configured yet (for --slave) in local.sources. Poll for the config + every 20 seconds. + */ + log() << "no source given, add a master to local.sources to start replication" << endl; + return 20; + } + + int sleepAdvice = 1; + for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) { + ReplSource *s = i->get(); + int res = -1; + try { + res = s->sync(nApplied); + bool moreToSync = s->haveMoreDbsToSync(); + if( res < 0 ) { + sleepAdvice = 3; + } + else if( moreToSync ) { + sleepAdvice = 0; + } + else if ( s->sleepAdvice() ) { + sleepAdvice = s->sleepAdvice(); + } + else + sleepAdvice = res; + } + catch ( const SyncException& ) { + log() << "caught SyncException" << endl; + return 10; + } + catch ( AssertionException& e ) { + if ( e.severe() ) { + log() << "replMain AssertionException " << e.what() << endl; + return 60; + } + else { + log() << "repl: AssertionException " << e.what() << '\n'; + } + replInfo = "replMain caught AssertionException"; + } + catch ( const DBException& e ) { + log() << "repl: DBException " << e.what() << endl; + replInfo = "replMain caught DBException"; + } + catch ( const std::exception &e ) { + log() << "repl: std::exception " << e.what() << endl; + replInfo = "replMain caught std::exception"; + } + catch ( ... ) { + log() << "unexpected exception during replication. replication will halt" << endl; + replAllDead = "caught unexpected exception during replication"; + } + if ( res < 0 ) + s->oplogReader.resetConnection(); + } + return sleepAdvice; + } + + void replMain() { + ReplSource::SourceVector sources; + while ( 1 ) { + int s = 0; + { + dblock lk; + if ( replAllDead ) { + // throttledForceResyncDead can throw + if ( !replSettings.autoresync || !ReplSource::throttledForceResyncDead( "auto" ) ) { + log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" << endl; + break; + } + } + assert( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this. + syncing++; + } + try { + int nApplied = 0; + s = _replMain(sources, nApplied); + if( s == 1 ) { + if( nApplied == 0 ) s = 2; + else if( nApplied > 100 ) { + // sleep very little - just enough that we aren't truly hammering master + sleepmillis(75); + s = 0; + } + } + } + catch (...) 
{ + out() << "caught exception in _replMain" << endl; + s = 4; + } + { + dblock lk; + assert( syncing == 1 ); + syncing--; + } + + if( relinquishSyncingSome ) { + relinquishSyncingSome = 0; + s = 1; // sleep before going back in to syncing=1 + } + + if ( s ) { + stringstream ss; + ss << "repl: sleep " << s << " sec before next pass"; + string msg = ss.str(); + if ( ! cmdLine.quiet ) + log() << msg << endl; + ReplInfo r(msg.c_str()); + sleepsecs(s); + } + } + } + + static void replMasterThread() { + sleepsecs(4); + Client::initThread("replmaster"); + int toSleep = 10; + while( 1 ) { + + sleepsecs( toSleep ); + /* write a keep-alive like entry to the log. this will make things like + printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date + even when things are idle. + */ + { + writelocktry lk("",1); + if ( lk.got() ) { + toSleep = 10; + + replLocalAuth(); + + try { + logKeepalive(); + } + catch(...) { + log() << "caught exception in replMasterThread()" << endl; + } + } + else { + log(5) << "couldn't logKeepalive" << endl; + toSleep = 1; + } + } + } + } + + void replSlaveThread() { + sleepsecs(1); + Client::initThread("replslave"); + cc().iAmSyncThread(); + + { + dblock lk; + replLocalAuth(); + } + + while ( 1 ) { + try { + replMain(); + sleepsecs(5); + } + catch ( AssertionException& ) { + ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry"); + problem() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl; + sleepsecs(300); + } + catch ( DBException& e ) { + problem() << "exception in replSlaveThread(): " << e.what() + << ", sleeping 5 minutes before retry" << endl; + sleepsecs(300); + } + catch ( ... ) { + problem() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl; + sleepsecs(300); + } + } + } + + void tempThread() { + while ( 1 ) { + out() << d.dbMutex.info().isLocked() << endl; + sleepmillis(100); + } + } + + void newRepl(); + void oldRepl(); + void startReplSets(ReplSetCmdline*); + void startReplication() { + /* if we are going to be a replica set, we aren't doing other forms of replication. */ + if( !cmdLine._replSet.empty() ) { + if( replSettings.slave || replSettings.master ) { + log() << "***" << endl; + log() << "ERROR: can't use --slave or --master replication options with --replSet" << endl; + log() << "***" << endl; + } + newRepl(); + + replSet = true; + ReplSetCmdline *replSetCmdline = new ReplSetCmdline(cmdLine._replSet); + boost::thread t( boost::bind( &startReplSets, replSetCmdline) ); + + return; + } + + oldRepl(); + + /* this was just to see if anything locks for longer than it should -- we need to be careful + not to be locked when trying to connect() or query() the other side. 
+ */ + //boost::thread tempt(tempThread); + + if( !replSettings.slave && !replSettings.master ) + return; + + { + dblock lk; + replLocalAuth(); + } + + if ( replSettings.slave ) { + assert( replSettings.slave == SimpleSlave ); + log(1) << "slave=true" << endl; + boost::thread repl_thread(replSlaveThread); + } + + if ( replSettings.master ) { + log(1) << "master=true" << endl; + replSettings.master = true; + createOplog(); + boost::thread t(replMasterThread); + } + + while( replSettings.fastsync ) // don't allow writes until we've set up from log + sleepmillis( 50 ); + } + + void testPretouch() { + int nthr = min(8, 8); + nthr = max(nthr, 1); + int m = 8 / nthr; + ThreadPool tp(nthr); + vector<BSONObj> v; + + BSONObj x = BSON( "ns" << "test.foo" << "o" << BSON( "_id" << 1 ) << "op" << "i" ); + + v.push_back(x); + v.push_back(x); + v.push_back(x); + + unsigned a = 0; + while( 1 ) { + if( a >= v.size() ) break; + unsigned b = a + m - 1; // v[a..b] + if( b >= v.size() ) b = v.size() - 1; + tp.schedule(pretouchN, v, a, b); + DEV cout << "pretouch task: " << a << ".." << b << endl; + a += m; + } + tp.join(); + } + + class ReplApplyBatchSizeValidator : public ParameterValidator { + public: + ReplApplyBatchSizeValidator() : ParameterValidator( "replApplyBatchSize" ) {} + + virtual bool isValid( BSONElement e , string& errmsg ) const { + int b = e.numberInt(); + if( b < 1 || b > 1024 ) { + errmsg = "replApplyBatchSize has to be >= 1 and <= 1024"; + return false; + } + + if ( replSettings.slavedelay != 0 && b > 1 ) { + errmsg = "can't use a batch size > 1 with slavedelay"; + return false; + } + if ( ! replSettings.slave ) { + errmsg = "can't set replApplyBatchSize on a non-slave machine"; + return false; + } + + return true; + } + } replApplyBatchSizeValidator; + +} // namespace mongo diff --git a/src/mongo/db/repl.h b/src/mongo/db/repl.h new file mode 100644 index 00000000000..83242d0a4ce --- /dev/null +++ b/src/mongo/db/repl.h @@ -0,0 +1,199 @@ +// repl.h - replication + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* replication data overview + + at the slave: + local.sources { host: ..., source: ..., only: ..., syncedTo: ..., localLogTs: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } } + + at the master: + local.oplog.$<source> +*/ + +#pragma once + +#include "pdfile.h" +#include "db.h" +#include "dbhelpers.h" +#include "../client/dbclient.h" +#include "../util/optime.h" +#include "oplog.h" +#include "../util/concurrency/thread_pool.h" +#include "oplogreader.h" +#include "cloner.h" + +namespace mongo { + + /* replication slave? (possibly with slave) + --slave cmd line setting -> SimpleSlave + */ + typedef enum { NotSlave=0, SimpleSlave } SlaveTypes; + + class ReplSettings { + public: + SlaveTypes slave; + + /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. 
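+ (set at startup when --master is given; see startReplication() in repl.cpp, which also creates the oplog.)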
*/ + bool master; + + bool fastsync; + + bool autoresync; + + int slavedelay; + + set<string> discoveredSeeds; + mutex discoveredSeeds_mx; + + BSONObj reconfig; + + ReplSettings() + : slave(NotSlave), + master(false), + fastsync(), + autoresync(false), + slavedelay(), + discoveredSeeds(), + discoveredSeeds_mx("ReplSettings::discoveredSeeds") { + } + + }; + + extern ReplSettings replSettings; + + /* A replication exception */ + class SyncException : public DBException { + public: + SyncException() : DBException( "sync exception" , 10001 ) {} + }; + + /* A Source is a source from which we can pull (replicate) data. + stored in collection local.sources. + + Can be a group of things to replicate for several databases. + + { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } } + + 'source' defaults to 'main'; support for multiple source names is + not done (always use main for now). + */ + class ReplSource { + shared_ptr<ThreadPool> tp; + + void resync(string db); + + /** @param alreadyLocked caller already put us in write lock if true */ + void sync_pullOpLog_applyOperation(BSONObj& op, bool alreadyLocked); + + /* pull some operations from the master's oplog, and apply them. + calls sync_pullOpLog_applyOperation + */ + int sync_pullOpLog(int& nApplied); + + /* we only clone one database per pass, even if a lot need to be done. This helps us + avoid overflowing the master's transaction log by doing too much work before going + back to read more transactions. (Imagine a scenario of slave startup where we try to + clone 100 databases in one pass.) + */ + set<string> addDbNextPass; + + set<string> incompleteCloneDbs; + + ReplSource(); + + // returns the dummy ns used to do the drop + string resyncDrop( const char *db, const char *requester ); + // call without the db mutex + void syncToTailOfRemoteLog(); + string ns() const { return string( "local.oplog.$" ) + sourceName(); } + unsigned _sleepAdviceTime; + + /** + * If 'db' is a new database and its name would conflict with that of + * an existing database, synchronize these database names with the + * master. + * @return true iff an op with the specified ns may be applied. + */ + bool handleDuplicateDbName( const BSONObj &op, const char *ns, const char *db ); + + public: + OplogReader oplogReader; + + void applyOperation(const BSONObj& op); + string hostName; // ip addr or hostname plus optionally, ":<port>" + string _sourceName; // a logical source name. + string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; } + string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating. + + /* the last time point we have already synced up to (in the remote/master's oplog). */ + OpTime syncedTo; + + int nClonedThisPass; + + typedef vector< shared_ptr< ReplSource > > SourceVector; + static void loadAll(SourceVector&); + explicit ReplSource(BSONObj); + + /* -1 = error */ + int sync(int& nApplied); + + void save(); // write ourself to local.sources + + // make a jsobj from our member fields of the form + // { host: ..., source: ..., syncedTo: ... 
} + BSONObj jsobj(); + + bool operator==(const ReplSource&r) const { + return hostName == r.hostName && sourceName() == r.sourceName(); + } + string toString() const { return sourceName() + "@" + hostName; } + + bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); } + int sleepAdvice() const { + if ( !_sleepAdviceTime ) + return 0; + int wait = _sleepAdviceTime - unsigned( time( 0 ) ); + return wait > 0 ? wait : 0; + } + + static bool throttledForceResyncDead( const char *requester ); + static void forceResyncDead( const char *requester ); + void forceResync( const char *requester ); + }; + + bool anyReplEnabled(); + void appendReplicationInfo( BSONObjBuilder& result , bool authed , int level = 0 ); + + /** + * Helper class used to set and query an ignore state for a named database. + * The ignore state will expire after a specified OpTime. + */ + class DatabaseIgnorer { + public: + /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */ + void doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ); + /** + * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore + * limit, the ignore state will be cleared. + */ + bool ignoreAt( const string &db, const OpTime &currentOplogTime ); + private: + map< string, OpTime > _ignores; + }; + +} // namespace mongo diff --git a/src/mongo/db/repl/connections.h b/src/mongo/db/repl/connections.h new file mode 100644 index 00000000000..3e08f80b047 --- /dev/null +++ b/src/mongo/db/repl/connections.h @@ -0,0 +1,128 @@ +// @file + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <map> +#include "../../client/dbclient.h" +#include "../security_common.h" + +namespace mongo { + + /** here we keep a single connection (with reconnect) for a set of hosts, + one each, and allow one user at a time per host. if in use already for that + host, we block. so this is an easy way to keep a 1-deep pool of connections + that many threads can share. + + thread-safe. + + Example: + { + ScopedConn c("foo.acme.com:9999"); + c.runCommand(...); + } + + throws exception on connect error (but fine to try again later with a new + scopedconn object for same host). + */ + class ScopedConn { + public: + /** throws assertions if connect failure etc. */ + ScopedConn(string hostport); + ~ScopedConn() { + // connLock releases... + } + void reconnect() { + conn()->port().shutdown(); + connect(); + } + + /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic. + So what we do here is wrap known safe methods and not allow cursor-style queries at all. This makes + ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed. 
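+ (e.g. findOne() below is safe because it reads its single result eagerly, so the shared + connection is never left with a half-consumed cursor for the next user of this host.)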
+ */ + bool runCommand(const string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) { + return conn()->runCommand(dbname, cmd, info, options); + } + unsigned long long count(const string &ns) { + return conn()->count(ns); + } + BSONObj findOne(const string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) { + return conn()->findOne(ns, q, fieldsToReturn, queryOptions); + } + + private: + auto_ptr<scoped_lock> connLock; + static mongo::mutex mapMutex; + struct X { + mongo::mutex z; + DBClientConnection cc; + bool connected; + X() : z("X"), cc(/*reconnect*/ true, 0, /*timeout*/ 10.0), connected(false) { + cc._logLevel = 2; + } + } *x; + typedef map<string,ScopedConn::X*> M; + static M& _map; + DBClientConnection* conn() { return &x->cc; } + const string _hostport; + + // we should already be locked... + bool connect() { + string err; + if (!x->cc.connect(_hostport, err)) { + log() << "couldn't connect to " << _hostport << ": " << err << rsLog; + return false; + } + x->connected = true; + + // if we cannot authenticate against a member, then either its key file + // or our key file has to change. if our key file has to change, we'll + // be rebooting. if their file has to change, they'll be rebooted so the + // connection created above will go dead, reconnect, and reauth. + if (!noauth && !x->cc.auth("local", internalSecurity.user, internalSecurity.pwd, err, false)) { + log() << "could not authenticate against " << _hostport << ", " << err << rsLog; + return false; + } + + return true; + } + }; + + inline ScopedConn::ScopedConn(string hostport) : _hostport(hostport) { + bool first = false; + { + scoped_lock lk(mapMutex); + x = _map[_hostport]; + if( x == 0 ) { + x = _map[_hostport] = new X(); + first = true; + connLock.reset( new scoped_lock(x->z) ); + } + } + + // Keep trying to connect if we're not yet connected + if( !first && x->connected ) { + connLock.reset( new scoped_lock(x->z) ); + return; + } + + connect(); + } + +} diff --git a/src/mongo/db/repl/consensus.cpp b/src/mongo/db/repl/consensus.cpp new file mode 100644 index 00000000000..3995373f5ef --- /dev/null +++ b/src/mongo/db/repl/consensus.cpp @@ -0,0 +1,449 @@ +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "../commands.h" +#include "rs.h" +#include "multicmd.h" + +namespace mongo { + + class CmdReplSetFresh : public ReplSetCommand { + public: + CmdReplSetFresh() : ReplSetCommand("replSetFresh") { } + private: + + bool shouldVeto(const BSONObj& cmdObj, string& errmsg) { + unsigned id = cmdObj["id"].Int(); + const Member* primary = theReplSet->box.getPrimary(); + const Member* hopeful = theReplSet->findById(id); + const Member *highestPriority = theReplSet->getMostElectable(); + + if( !hopeful ) { + errmsg = str::stream() << "replSet couldn't find member with id " << id; + return true; + } + else if( theReplSet->isPrimary() && theReplSet->lastOpTimeWritten >= hopeful->hbinfo().opTime ) { + // hbinfo is not updated, so we have to check the primary's last optime separately + errmsg = str::stream() << "I am already primary, " << hopeful->fullName() << + " can try again once I've stepped down"; + return true; + } + else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) { + // other members might be aware of more up-to-date nodes + errmsg = str::stream() << hopeful->fullName() << " is trying to elect itself but " << + primary->fullName() << " is already primary and more up-to-date"; + return true; + } + else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) { + errmsg = str::stream() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName(); + return true; + } + + // don't veto older versions + if (cmdObj["id"].eoo()) { + // they won't be looking for the veto field + return false; + } + + if ( !theReplSet->isElectable(id) || + (highestPriority && highestPriority->config().priority > hopeful->config().priority)) { + return true; + } + + return false; + } + + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( !check(errmsg, result) ) + return false; + + if( cmdObj["set"].String() != theReplSet->name() ) { + errmsg = "wrong repl set name"; + return false; + } + string who = cmdObj["who"].String(); + int cfgver = cmdObj["cfgver"].Int(); + OpTime opTime(cmdObj["opTime"].Date()); + + bool weAreFresher = false; + if( theReplSet->config().version > cfgver ) { + log() << "replSet member " << who << " is not yet aware its cfg version " << cfgver << " is stale" << rsLog; + result.append("info", "config version stale"); + weAreFresher = true; + } + // check not only our own optime, but any other member we can reach + else if( opTime < theReplSet->lastOpTimeWritten || + opTime < theReplSet->lastOtherOpTime()) { + weAreFresher = true; + } + result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate()); + result.append("fresher", weAreFresher); + result.append("veto", shouldVeto(cmdObj, errmsg)); + + return true; + } + } cmdReplSetFresh; + + class CmdReplSetElect : public ReplSetCommand { + public: + CmdReplSetElect() : ReplSetCommand("replSetElect") { } + private: + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( !check(errmsg, result) ) + return false; + theReplSet->elect.electCmdReceived(cmdObj, &result); + return true; + } + } cmdReplSetElect; + + int Consensus::totalVotes() const { + static int complain = 0; + int vTot = rs._self->config().votes; + for( Member *m = rs.head(); m; m=m->next() ) + vTot += m->config().votes; + if( vTot % 2 == 0 && vTot && complain++ == 0 ) + log() << "replSet " /*buildbot! 
warning */ "total number of votes is even - add arbiter or give one member an extra vote" << rsLog; + return vTot; + } + + bool Consensus::aMajoritySeemsToBeUp() const { + int vUp = rs._self->config().votes; + for( Member *m = rs.head(); m; m=m->next() ) + vUp += m->hbinfo().up() ? m->config().votes : 0; + return vUp * 2 > totalVotes(); + } + + bool Consensus::shouldRelinquish() const { + int vUp = rs._self->config().votes; + const long long T = rs.config().ho.heartbeatTimeoutMillis * rs.config().ho.heartbeatConnRetries; + for( Member *m = rs.head(); m; m=m->next() ) { + long long dt = m->hbinfo().timeDown(); + if( dt < T ) + vUp += m->config().votes; + } + + // the manager will handle calling stepdown if another node should be + // primary due to priority + + return !( vUp * 2 > totalVotes() ); + } + + static const int VETO = -10000; + + const time_t LeaseTime = 30; + + SimpleMutex Consensus::lyMutex("ly"); + + unsigned Consensus::yea(unsigned memberId) { /* throws VoteException */ + SimpleMutex::scoped_lock lk(lyMutex); + LastYea &L = this->ly.ref(lk); + time_t now = time(0); + if( L.when + LeaseTime >= now && L.who != memberId ) { + LOG(1) << "replSet not voting yea for " << memberId << + " voted for " << L.who << ' ' << now-L.when << " secs ago" << rsLog; + throw VoteException(); + } + L.when = now; + L.who = memberId; + return rs._self->config().votes; + } + + /* we vote for ourself at start of election. once it fails, we can cancel the lease we had in + place instead of leaving it for a long time. + */ + void Consensus::electionFailed(unsigned meid) { + SimpleMutex::scoped_lock lk(lyMutex); + LastYea &L = ly.ref(lk); + DEV assert( L.who == meid ); // this may not always always hold, so be aware, but adding for now as a quick sanity test + if( L.who == meid ) + L.when = 0; + } + + /* todo: threading **************** !!!!!!!!!!!!!!!! */ + void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) { + BSONObjBuilder& b = *_b; + DEV log() << "replSet received elect msg " << cmd.toString() << rsLog; + else LOG(2) << "replSet received elect msg " << cmd.toString() << rsLog; + string set = cmd["set"].String(); + unsigned whoid = cmd["whoid"].Int(); + int cfgver = cmd["cfgver"].Int(); + OID round = cmd["round"].OID(); + int myver = rs.config().version; + + const Member* primary = rs.box.getPrimary(); + const Member* hopeful = rs.findById(whoid); + const Member* highestPriority = rs.getMostElectable(); + + int vote = 0; + if( set != rs.name() ) { + log() << "replSet error received an elect request for '" << set << "' but our set name is '" << rs.name() << "'" << rsLog; + } + else if( myver < cfgver ) { + // we are stale. don't vote + } + else if( myver > cfgver ) { + // they are stale! 
+ log() << "replSet electCmdReceived info got stale version # during election" << rsLog; + vote = -10000; + } + else if( !hopeful ) { + log() << "replSet electCmdReceived couldn't find member with id " << whoid << rsLog; + vote = -10000; + } + else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) { + // hbinfo is not updated, so we have to check the primary's last optime separately + log() << "I am already primary, " << hopeful->fullName() + << " can try again once I've stepped down" << rsLog; + vote = -10000; + } + else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) { + // other members might be aware of more up-to-date nodes + log() << hopeful->fullName() << " is trying to elect itself but " << + primary->fullName() << " is already primary and more up-to-date" << rsLog; + vote = -10000; + } + else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) { + log() << hopeful->fullName() << " has lower priority than " << highestPriority->fullName(); + vote = -10000; + } + else { + try { + vote = yea(whoid); + dassert( hopeful->id() == whoid ); + rs.relinquish(); + log() << "replSet info voting yea for " << hopeful->fullName() << " (" << whoid << ')' << rsLog; + } + catch(VoteException&) { + log() << "replSet voting no for " << hopeful->fullName() << " already voted for another" << rsLog; + } + } + + b.append("vote", vote); + b.append("round", round); + } + + void ReplSetImpl::_getTargets(list<Target>& L, int& configVersion) { + configVersion = config().version; + for( Member *m = head(); m; m=m->next() ) + if( m->hbinfo().maybeUp() ) + L.push_back( Target(m->fullName()) ); + } + + /* config version is returned as it is ok to use this unlocked. BUT, if unlocked, you would need + to check later that the config didn't change. */ + void ReplSetImpl::getTargets(list<Target>& L, int& configVersion) { + if( lockedByMe() ) { + _getTargets(L, configVersion); + return; + } + lock lk(this); + _getTargets(L, configVersion); + } + + /* Do we have the newest data of them all? + @param allUp - set to true if all members are up. Only set if true returned. + @return true if we are freshest. Note we may tie. + */ + bool Consensus::weAreFreshest(bool& allUp, int& nTies) { + const OpTime ord = theReplSet->lastOpTimeWritten; + nTies = 0; + assert( !ord.isNull() ); + BSONObj cmd = BSON( + "replSetFresh" << 1 << + "set" << rs.name() << + "opTime" << Date_t(ord.asDate()) << + "who" << rs._self->fullName() << + "cfgver" << rs._cfg->version << + "id" << rs._self->id()); + list<Target> L; + int ver; + /* the following queries arbiters, even though they are never fresh. wonder if that makes sense. + it doesn't, but it could, if they "know" what freshness it one day. so consider removing + arbiters from getTargets() here. although getTargets is used elsewhere for elections; there + arbiters are certainly targets - so a "includeArbs" bool would be necessary if we want to make + not fetching them herein happen. 
+ */ + rs.getTargets(L, ver); + multiCommand(cmd, L); + int nok = 0; + allUp = true; + for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) { + if( i->ok ) { + nok++; + if( i->result["fresher"].trueValue() ) { + log() << "not electing self, we are not freshest" << rsLog; + return false; + } + OpTime remoteOrd( i->result["opTime"].Date() ); + if( remoteOrd == ord ) + nTies++; + assert( remoteOrd <= ord ); + + if( i->result["veto"].trueValue() ) { + BSONElement msg = i->result["errmsg"]; + if (!msg.eoo()) { + log() << "not electing self, " << i->toHost << " would veto with '" << + msg.String() << "'" << rsLog; + } + else { + log() << "not electing self, " << i->toHost << " would veto" << rsLog; + } + return false; + } + } + else { + DEV log() << "replSet freshest returns " << i->result.toString() << rsLog; + allUp = false; + } + } + LOG(1) << "replSet dev we are freshest of up nodes, nok:" << nok << " nTies:" << nTies << rsLog; + assert( ord <= theReplSet->lastOpTimeWritten ); // <= as this may change while we are working... + return true; + } + + extern time_t started; + + void Consensus::multiCommand(BSONObj cmd, list<Target>& L) { + assert( !rs.lockedByMe() ); + mongo::multiCommand(cmd, L); + } + + void Consensus::_electSelf() { + if( time(0) < steppedDown ) + return; + + { + const OpTime ord = theReplSet->lastOpTimeWritten; + if( ord == 0 ) { + log() << "replSet info not trying to elect self, do not yet have a complete set of data from any point in time" << rsLog; + return; + } + } + + bool allUp; + int nTies; + if( !weAreFreshest(allUp, nTies) ) { + return; + } + + rs.sethbmsg("",9); + + if( !allUp && time(0) - started < 60 * 5 ) { + /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data + if we don't have to -- we'd rather be offline and wait a little longer instead + todo: make this configurable. + */ + rs.sethbmsg("not electing self, not all members up and we have been up less than 5 minutes"); + return; + } + + Member& me = *rs._self; + + if( nTies ) { + /* tie? we then randomly sleep to try to not collide on our voting. */ + /* todo: smarter. 
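+ for now, each tied candidate sleeps a random 50-1049ms and retries via RetryAfterSleepException + (member 0, or a node that slept on the previous attempt, skips the sleep), so repeated collisions + quickly become unlikely.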
*/ + if( me.id() == 0 || sleptLast ) { + // would be fine for one node not to sleep + // todo: biggest / highest priority nodes should be the ones that get to not sleep + } + else { + assert( !rs.lockedByMe() ); // bad to go to sleep locked + unsigned ms = ((unsigned) rand()) % 1000 + 50; + DEV log() << "replSet tie " << nTies << " sleeping a little " << ms << "ms" << rsLog; + sleptLast = true; + sleepmillis(ms); + throw RetryAfterSleepException(); + } + } + sleptLast = false; + + time_t start = time(0); + unsigned meid = me.id(); + int tally = yea( meid ); + bool success = false; + try { + log() << "replSet info electSelf " << meid << rsLog; + + BSONObj electCmd = BSON( + "replSetElect" << 1 << + "set" << rs.name() << + "who" << me.fullName() << + "whoid" << me.hbinfo().id() << + "cfgver" << rs._cfg->version << + "round" << OID::gen() /* this is just for diagnostics */ + ); + + int configVersion; + list<Target> L; + rs.getTargets(L, configVersion); + multiCommand(electCmd, L); + + { + for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) { + DEV log() << "replSet elect res: " << i->result.toString() << rsLog; + if( i->ok ) { + int v = i->result["vote"].Int(); + tally += v; + } + } + if( tally*2 <= totalVotes() ) { + log() << "replSet couldn't elect self, only received " << tally << " votes" << rsLog; + } + else if( time(0) - start > 30 ) { + // defensive; should never happen as we have timeouts on connection and operation for our conn + log() << "replSet too much time passed during our election, ignoring result" << rsLog; + } + else if( configVersion != rs.config().version ) { + log() << "replSet config version changed during our election, ignoring result" << rsLog; + } + else { + /* succeeded. */ + log(1) << "replSet election succeeded, assuming primary role" << rsLog; + success = true; + rs.assumePrimary(); + } + } + } + catch( std::exception& ) { + if( !success ) electionFailed(meid); + throw; + } + if( !success ) electionFailed(meid); + } + + void Consensus::electSelf() { + assert( !rs.lockedByMe() ); + assert( !rs.myConfig().arbiterOnly ); + assert( rs.myConfig().slaveDelay == 0 ); + try { + _electSelf(); + } + catch(RetryAfterSleepException&) { + throw; + } + catch(VoteException& ) { + log() << "replSet not trying to elect self as responded yea to someone else recently" << rsLog; + } + catch(DBException& e) { + log() << "replSet warning caught unexpected exception in electSelf() " << e.toString() << rsLog; + } + catch(...) { + log() << "replSet warning caught unexpected exception in electSelf()" << rsLog; + } + } + +} diff --git a/src/mongo/db/repl/health.cpp b/src/mongo/db/repl/health.cpp new file mode 100644 index 00000000000..0b7ed87eac3 --- /dev/null +++ b/src/mongo/db/repl/health.cpp @@ -0,0 +1,449 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "rs.h" +#include "health.h" +#include "../../util/background.h" +#include "../../client/dbclient.h" +#include "../../client/connpool.h" +#include "../commands.h" +#include "../../util/concurrency/value.h" +#include "../../util/concurrency/task.h" +#include "../../util/mongoutils/html.h" +#include "../../util/goodies.h" +#include "../../util/ramlog.h" +#include "../helpers/dblogger.h" +#include "connections.h" +#include "../../util/unittest.h" +#include "../dbhelpers.h" + +namespace mongo { + /* decls for connections.h */ + ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M()); + mutex ScopedConn::mapMutex("ScopedConn::mapMutex"); +} + +namespace mongo { + + using namespace mongoutils::html; + using namespace bson; + + static RamLog * _rsLog = new RamLog( "rs" ); + Tee *rsLog = _rsLog; + extern bool replSetBlind; // for testing + + string ago(time_t t) { + if( t == 0 ) return ""; + + time_t x = time(0) - t; + stringstream s; + if( x < 180 ) { + s << x << " sec"; + if( x != 1 ) s << 's'; + } + else if( x < 3600 ) { + s.precision(2); + s << x / 60.0 << " mins"; + } + else { + s.precision(2); + s << x / 3600.0 << " hrs"; + } + return s.str(); + } + + void Member::summarizeMember(stringstream& s) const { + s << tr(); + { + stringstream u; + u << "http://" << h().host() << ':' << (h().port() + 1000) << "/_replSet"; + s << td( a(u.str(), "", fullName()) ); + } + s << td( id() ); + double h = hbinfo().health; + bool ok = h > 0; + s << td(red(str::stream() << h,h == 0)); + s << td(ago(hbinfo().upSince)); + bool never = false; + { + string h; + time_t hb = hbinfo().lastHeartbeat; + if( hb == 0 ) { + h = "never"; + never = true; + } + else h = ago(hb) + " ago"; + s << td(h); + } + s << td(config().votes); + s << td(config().priority); + { + string stateText = state().toString(); + if( _config.hidden ) + stateText += " (hidden)"; + if( ok || stateText.empty() ) + s << td(stateText); // text blank if we've never connected + else + s << td( grey(str::stream() << "(was " << state().toString() << ')', true) ); + } + s << td( grey(hbinfo().lastHeartbeatMsg,!ok) ); + stringstream q; + q << "/_replSetOplog?_id=" << id(); + s << td( a(q.str(), "", never ? "?" : hbinfo().opTime.toString()) ); + if( hbinfo().skew > INT_MIN ) { + s << td( grey(str::stream() << hbinfo().skew,!ok) ); + } + else + s << td(""); + s << _tr(); + } + + string ReplSetImpl::stateAsHtml(MemberState s) { + if( s.s == MemberState::RS_STARTUP ) return a("", "serving still starting up, or still trying to initiate the set", "STARTUP"); + if( s.s == MemberState::RS_PRIMARY ) return a("", "this server thinks it is primary", "PRIMARY"); + if( s.s == MemberState::RS_SECONDARY ) return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY"); + if( s.s == MemberState::RS_RECOVERING ) return a("", "recovering/resyncing; after recovery usually auto-transitions to secondary", "RECOVERING"); + if( s.s == MemberState::RS_FATAL ) return a("", "something bad has occurred and server is not completely offline with regard to the replica set. 
fatal error.", "FATAL"); + if( s.s == MemberState::RS_STARTUP2 ) return a("", "loaded config, still determining who is primary", "STARTUP2"); + if( s.s == MemberState::RS_ARBITER ) return a("", "this server is an arbiter only", "ARBITER"); + if( s.s == MemberState::RS_DOWN ) return a("", "member is down, slow, or unreachable", "DOWN"); + if( s.s == MemberState::RS_ROLLBACK ) return a("", "rolling back operations to get in sync", "ROLLBACK"); + return ""; + } + + extern time_t started; + + // oplogdiags in web ui + static void say(stringstream&ss, const bo& op) { + ss << "<tr>"; + + set<string> skip; + be e = op["ts"]; + if( e.type() == Date || e.type() == Timestamp ) { + OpTime ot = e._opTime(); + ss << td( time_t_to_String_short( ot.getSecs() ) ); + ss << td( ot.toString() ); + skip.insert("ts"); + } + else ss << td("?") << td("?"); + + e = op["h"]; + if( e.type() == NumberLong ) { + ss << "<td>" << hex << e.Long() << "</td>\n"; + skip.insert("h"); + } + else + ss << td("?"); + + ss << td(op["op"].valuestrsafe()); + ss << td(op["ns"].valuestrsafe()); + skip.insert("op"); + skip.insert("ns"); + + ss << "<td>"; + for( bo::iterator i(op); i.more(); ) { + be e = i.next(); + if( skip.count(e.fieldName()) ) continue; + ss << e.toString() << ' '; + } + ss << "</td></tr>\n"; + } + + void ReplSetImpl::_getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { + const Member *m = findById(server_id); + if( m == 0 ) { + ss << "Error : can't find a member with id: " << server_id << '\n'; + return; + } + + ss << p("Server : " + m->fullName() + "<br>ns : " + rsoplog ); + + //const bo fields = BSON( "o" << false << "o2" << false ); + const bo fields; + + /** todo fix we might want an so timeout here */ + DBClientConnection conn(false, 0, /*timeout*/ 20); + { + string errmsg; + if( !conn.connect(m->fullName(), errmsg) ) { + ss << "couldn't connect to " << m->fullName() << ' ' << errmsg; + return; + } + } + + auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",1), 20, 0, &fields); + if( c.get() == 0 ) { + ss << "couldn't query " << rsoplog; + return; + } + static const char *h[] = {"ts","optime", "h","op","ns","rest",0}; + + ss << "<style type=\"text/css\" media=\"screen\">" + "table { font-size:75% }\n" + // "th { background-color:#bbb; color:#000 }\n" + // "td,th { padding:.25em }\n" + "</style>\n"; + + ss << table(h, true); + //ss << "<pre>\n"; + int n = 0; + OpTime otFirst; + OpTime otLast; + OpTime otEnd; + while( c->more() ) { + bo o = c->next(); + otLast = o["ts"]._opTime(); + if( otFirst.isNull() ) + otFirst = otLast; + say(ss, o); + n++; + } + if( n == 0 ) { + ss << rsoplog << " is empty\n"; + } + else { + auto_ptr<DBClientCursor> c = conn.query(rsoplog, Query().sort("$natural",-1), 20, 0, &fields); + if( c.get() == 0 ) { + ss << "couldn't query [2] " << rsoplog; + return; + } + string x; + bo o = c->next(); + otEnd = o["ts"]._opTime(); + while( 1 ) { + stringstream z; + if( o["ts"]._opTime() == otLast ) + break; + say(z, o); + x = z.str() + x; + if( !c->more() ) + break; + o = c->next(); + } + if( !x.empty() ) { + ss << "<tr><td>...</td><td>...</td><td>...</td><td>...</td><td>...</td></tr>\n" << x; + //ss << "\n...\n\n" << x; + } + } + ss << _table(); + ss << p(time_t_to_String_short(time(0)) + " current time"); + + if( !otEnd.isNull() ) { + ss << "<p>Log length in time: "; + unsigned d = otEnd.getSecs() - otFirst.getSecs(); + double h = d / 3600.0; + ss.precision(3); + if( h < 72 ) + ss << h << " hours"; + else + ss << h / 24.0 << " days"; + ss << "</p>\n"; + } + } 
+ + void ReplSetImpl::_summarizeAsHtml(stringstream& s) const { + s << table(0, false); + s << tr("Set name:", _name); + s << tr("Majority up:", elect.aMajoritySeemsToBeUp()?"yes":"no" ); + s << _table(); + + const char *h[] = {"Member", + "<a title=\"member id in the replset config\">id</a>", + "Up", + "<a title=\"length of time we have been continuously connected to the other member with no reconnects (for self, shows uptime)\">cctime</a>", + "<a title=\"when this server last received a heartbeat response - includes error code responses\">Last heartbeat</a>", + "Votes", "Priority", "State", "Messages", + "<a title=\"how up to date this server is. this value is polled every few seconds, so actual lag is typically much lower than the value shown here.\">optime</a>", + "<a title=\"Clock skew in seconds relative to this server. Informational; server clock variances will make the diagnostics hard to read, but otherwise are benign.\">skew</a>", + 0 + }; + s << table(h); + + /* this is to sort the member rows by their ordinal _id, so they show up in the same + order on all the different web ui's; that is less confusing for the operator. */ + map<int,string> mp; + + string myMinValid; + try { + readlocktry lk("local.replset.minvalid", 300); + if( lk.got() ) { + BSONObj mv; + if( Helpers::getSingleton("local.replset.minvalid", mv) ) { + myMinValid = "minvalid:" + mv["ts"]._opTime().toString(); + } + } + else myMinValid = "."; + } + catch(...) { + myMinValid = "exception fetching minvalid"; + } + + const Member *_self = this->_self; + assert(_self); + { + stringstream s; + /* self row */ + s << tr() << td(_self->fullName() + " (me)") << + td(_self->id()) << + td("1") << //up + td(ago(started)) << + td("") << // last heartbeat + td(ToString(_self->config().votes)) << + td(ToString(_self->config().priority)) << + td( stateAsHtml(box.getState()) + (_self->config().hidden?" 
(hidden)":"") ); + s << td( _hbmsg ); + stringstream q; + q << "/_replSetOplog?_id=" << _self->id(); + s << td( a(q.str(), myMinValid, theReplSet->lastOpTimeWritten.toString()) ); + s << td(""); // skew + s << _tr(); + mp[_self->hbinfo().id()] = s.str(); + } + Member *m = head(); + while( m ) { + stringstream s; + m->summarizeMember(s); + mp[m->hbinfo().id()] = s.str(); + m = m->next(); + } + + for( map<int,string>::const_iterator i = mp.begin(); i != mp.end(); i++ ) + s << i->second; + s << _table(); + } + + + void fillRsLog(stringstream& s) { + _rsLog->toHTML( s ); + } + + const Member* ReplSetImpl::findById(unsigned id) const { + if( _self && id == _self->id() ) return _self; + + for( Member *m = head(); m; m = m->next() ) + if( m->id() == id ) + return m; + return 0; + } + + const OpTime ReplSetImpl::lastOtherOpTime() const { + OpTime closest(0,0); + + for( Member *m = _members.head(); m; m=m->next() ) { + if (!m->hbinfo().up()) { + continue; + } + + if (m->hbinfo().opTime > closest) { + closest = m->hbinfo().opTime; + } + } + + return closest; + } + + void ReplSetImpl::_summarizeStatus(BSONObjBuilder& b) const { + vector<BSONObj> v; + + const Member *_self = this->_self; + assert( _self ); + + MemberState myState = box.getState(); + + // add self + { + BSONObjBuilder bb; + bb.append("_id", (int) _self->id()); + bb.append("name", _self->fullName()); + bb.append("health", 1.0); + bb.append("state", (int)myState.s); + bb.append("stateStr", myState.toString()); + bb.append("uptime", (unsigned)(time(0) - cmdLine.started)); + if (!_self->config().arbiterOnly) { + bb.appendTimestamp("optime", lastOpTimeWritten.asDate()); + bb.appendDate("optimeDate", lastOpTimeWritten.getSecs() * 1000LL); + } + + int maintenance = _maintenanceMode; + if (maintenance) { + bb.append("maintenanceMode", maintenance); + } + + if (theReplSet) { + string s = theReplSet->hbmsg(); + if( !s.empty() ) + bb.append("errmsg", s); + } + bb.append("self", true); + v.push_back(bb.obj()); + } + + Member *m =_members.head(); + while( m ) { + BSONObjBuilder bb; + bb.append("_id", (int) m->id()); + bb.append("name", m->fullName()); + double h = m->hbinfo().health; + bb.append("health", h); + bb.append("state", (int) m->state().s); + if( h == 0 ) { + // if we can't connect the state info is from the past and could be confusing to show + bb.append("stateStr", "(not reachable/healthy)"); + } + else { + bb.append("stateStr", m->state().toString()); + } + bb.append("uptime", (unsigned) (m->hbinfo().upSince ? (time(0)-m->hbinfo().upSince) : 0)); + if (!m->config().arbiterOnly) { + bb.appendTimestamp("optime", m->hbinfo().opTime.asDate()); + bb.appendDate("optimeDate", m->hbinfo().opTime.getSecs() * 1000LL); + } + bb.appendTimeT("lastHeartbeat", m->hbinfo().lastHeartbeat); + bb.append("pingMs", m->hbinfo().ping); + string s = m->lhb(); + if( !s.empty() ) + bb.append("errmsg", s); + + if (m->hbinfo().authIssue) { + bb.append("authenticated", false); + } + + v.push_back(bb.obj()); + m = m->next(); + } + sort(v.begin(), v.end()); + b.append("set", name()); + b.appendTimeT("date", time(0)); + b.append("myState", myState.s); + const Member *syncTarget = _currentSyncTarget; + if (syncTarget && myState != MemberState::RS_PRIMARY) { + b.append("syncingTo", syncTarget->fullName()); + } + b.append("members", v); + if( replSetBlind ) + b.append("blind",true); // to avoid confusion if set...normally never set except for testing. 
+ } + + static struct Test : public UnitTest { + void run() { + HealthOptions a,b; + assert( a == b ); + assert( a.isDefault() ); + } + } test; + +} diff --git a/src/mongo/db/repl/health.h b/src/mongo/db/repl/health.h new file mode 100644 index 00000000000..55cca93a27e --- /dev/null +++ b/src/mongo/db/repl/health.h @@ -0,0 +1,50 @@ +// replset.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +namespace mongo { + + /* throws */ + bool requestHeartbeat(string setname, string fromHost, string memberFullName, BSONObj& result, int myConfigVersion, int& theirConfigVersion, bool checkEmpty = false); + + struct HealthOptions { + HealthOptions() : + heartbeatSleepMillis(2000), + heartbeatTimeoutMillis( 10000 ), + heartbeatConnRetries(2) + { } + + bool isDefault() const { return *this == HealthOptions(); } + + // see http://www.mongodb.org/display/DOCS/Replica+Set+Internals + unsigned heartbeatSleepMillis; + unsigned heartbeatTimeoutMillis; + unsigned heartbeatConnRetries; + + void check() { + uassert(13112, "bad replset heartbeat option", heartbeatSleepMillis >= 10); + uassert(13113, "bad replset heartbeat option", heartbeatTimeoutMillis >= 10); + } + + bool operator==(const HealthOptions& r) const { + return heartbeatSleepMillis==r.heartbeatSleepMillis && heartbeatTimeoutMillis==r.heartbeatTimeoutMillis && heartbeatConnRetries==r.heartbeatConnRetries; + } + }; + +} diff --git a/src/mongo/db/repl/heartbeat.cpp b/src/mongo/db/repl/heartbeat.cpp new file mode 100644 index 00000000000..331812af85a --- /dev/null +++ b/src/mongo/db/repl/heartbeat.cpp @@ -0,0 +1,382 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "pch.h" +#include "rs.h" +#include "health.h" +#include "../../util/background.h" +#include "../../client/dbclient.h" +#include "../commands.h" +#include "../../util/concurrency/value.h" +#include "../../util/concurrency/task.h" +#include "../../util/concurrency/msg.h" +#include "../../util/mongoutils/html.h" +#include "../../util/goodies.h" +#include "../../util/ramlog.h" +#include "../helpers/dblogger.h" +#include "connections.h" +#include "../../util/unittest.h" +#include "../instance.h" +#include "../repl.h" + +namespace mongo { + + using namespace bson; + + extern bool replSetBlind; + extern ReplSettings replSettings; + + unsigned int HeartbeatInfo::numPings; + + long long HeartbeatInfo::timeDown() const { + if( up() ) return 0; + if( downSince == 0 ) + return 0; // still waiting on first heartbeat + return jsTime() - downSince; + } + + /* { replSetHeartbeat : <setname> } */ + class CmdReplSetHeartbeat : public ReplSetCommand { + public: + CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( replSetBlind ) { + if (theReplSet) { + errmsg = str::stream() << theReplSet->selfFullName() << " is blind"; + } + return false; + } + + /* we don't call ReplSetCommand::check() here because heartbeat + checks many things that are pre-initialization. */ + if( !replSet ) { + errmsg = "not running with --replSet"; + return false; + } + + if (!checkAuth(errmsg, result)) { + return false; + } + + /* we want to keep heartbeat connections open when relinquishing primary. tag them here. */ + { + AbstractMessagingPort *mp = cc().port(); + if( mp ) + mp->tag |= 1; + } + + if( cmdObj["pv"].Int() != 1 ) { + errmsg = "incompatible replset protocol version"; + return false; + } + { + string s = string(cmdObj.getStringField("replSetHeartbeat")); + if( cmdLine.ourSetName() != s ) { + errmsg = "repl set names do not match"; + log() << "replSet set names do not match, our cmdline: " << cmdLine._replSet << rsLog; + log() << "replSet s: " << s << rsLog; + result.append("mismatch", true); + return false; + } + } + + result.append("rs", true); + if( cmdObj["checkEmpty"].trueValue() ) { + result.append("hasData", replHasDatabases()); + } + if( theReplSet == 0 ) { + string from( cmdObj.getStringField("from") ); + if( !from.empty() ) { + scoped_lock lck( replSettings.discoveredSeeds_mx ); + replSettings.discoveredSeeds.insert(from); + } + result.append("hbmsg", "still initializing"); + return true; + } + + if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) { + errmsg = "repl set names do not match (2)"; + result.append("mismatch", true); + return false; + } + result.append("set", theReplSet->name()); + result.append("state", theReplSet->state().s); + result.append("e", theReplSet->iAmElectable()); + result.append("hbmsg", theReplSet->hbmsg()); + result.append("time", (long long) time(0)); + result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate()); + int v = theReplSet->config().version; + result.append("v", v); + if( v > cmdObj["v"].Int() ) + result << "config" << theReplSet->config().asBson(); + + return true; + } + } cmdReplSetHeartbeat; + + bool requestHeartbeat(string setName, string from, string memberFullName, BSONObj& result, + int myCfgVersion, int& theirCfgVersion, bool checkEmpty) { + if( replSetBlind ) { + return false; + } + + BSONObj cmd = BSON( "replSetHeartbeat" << setName << + "v" << myCfgVersion << + "pv" << 1 << + "checkEmpty" << 
checkEmpty <<
+                            "from" << from );
+
+        // generally not a great idea to do outbound waiting calls in a
+        // write lock. heartbeats can be slow (multisecond to respond), so
+        // generally we don't want to be locked, at least not without
+        // thinking carefully about it first.
+        uassert(15900, "can't heartbeat: too much lock",
+                !d.dbMutex.isWriteLocked() || theReplSet == 0 || !theReplSet->lockedByMe() );
+
+        ScopedConn conn(memberFullName);
+        return conn.runCommand("admin", cmd, result, 0);
+    }
+
+    /**
+     * Poll every other set member to check its status.
+     *
+     * A detail about local machines and authentication: suppose we have 2
+     * members, A and B, on the same machine using different keyFiles. A is
+     * primary. If we're just starting the set, there are no admin users, so A
+     * and B can access each other because it's local access.
+     *
+     * Then we add a user to A. B cannot sync this user from A, because as soon
+     * as we add an admin user, A requires auth. However, A can still
+     * heartbeat B, because B *doesn't* have an admin user. So A can reach B
+     * but B cannot reach A.
+     *
+     * Once B is restarted with the correct keyFile, everything should work as
+     * expected.
+     */
+    class ReplSetHealthPollTask : public task::Task {
+    private:
+        HostAndPort h;
+        HeartbeatInfo m;
+        int tries;
+        const int threshold;
+    public:
+        ReplSetHealthPollTask(const HostAndPort& hh, const HeartbeatInfo& mm)
+            : h(hh), m(mm), tries(0), threshold(15) { }
+
+        string name() const { return "rsHealthPoll"; }
+        void doWork() {
+            if ( !theReplSet ) {
+                LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
+                return;
+            }
+
+            HeartbeatInfo mem = m;
+            HeartbeatInfo old = mem;
+            try {
+                BSONObj info;
+                int theirConfigVersion = -10000;
+
+                bool ok = _requestHeartbeat(mem, info, theirConfigVersion);
+
+                // weight new ping with old pings
+                // on the first ping, just use the ping value
+                if (old.ping != 0) {
+                    mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
+                }
+
+                if( ok ) {
+                    up(info, mem);
+                }
+                else if (!info["errmsg"].eoo() &&
+                         info["errmsg"].str() == "need to login") {
+                    authIssue(mem);
+                }
+                else {
+                    down(mem, info.getStringField("errmsg"));
+                }
+            }
+            catch(DBException& e) {
+                down(mem, e.what());
+            }
+            catch(...) {
+                down(mem, "replSet unexpected exception in ReplSetHealthPollTask");
+            }
+            m = mem;
+
+            theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );
+
+            static time_t last = 0;
+            time_t now = time(0);
+            bool changed = mem.changed(old);
+            if( changed ) {
+                if( old.hbstate != mem.hbstate )
+                    log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
+            }
+            if( changed || now-last>4 ) {
+                last = now;
+                theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+            }
+        }
+
+    private:
+        bool _requestHeartbeat(HeartbeatInfo& mem, BSONObj& info, int& theirConfigVersion) {
+            if (tries++ % threshold == (threshold - 1)) {
+                ScopedConn conn(h.toString());
+                conn.reconnect();
+            }
+
+            Timer timer;
+            time_t before = curTimeMicros64() / 1000000;
+
+            bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(),
+                                       h.toString(), info, theReplSet->config().version, theirConfigVersion);
+
+            mem.ping = (unsigned int)timer.millis();
+
+            // we set this on any response - we don't get this far if
+            // we couldn't connect, because an exception is thrown
+            time_t after = mem.lastHeartbeat = before + (mem.ping / 1000);
+
+            if ( info["time"].isNumber() ) {
+                long long t = info["time"].numberLong();
+                if( t > after )
+                    mem.skew = (int) (t - after);
+                else if( t < before )
+                    mem.skew = (int) (t - before); // negative
+            }
+            else {
+                // it won't be there if remote hasn't initialized yet
+                if( info.hasElement("time") )
+                    warning() << "heartbeat.time isn't a number: " << info << endl;
+                mem.skew = INT_MIN;
+            }
+
+            {
+                be state = info["state"];
+                if( state.ok() )
+                    mem.hbstate = MemberState(state.Int());
+            }
+
+            return ok;
+        }
+
+        void authIssue(HeartbeatInfo& mem) {
+            mem.authIssue = true;
+            mem.hbstate = MemberState::RS_UNKNOWN;
+
+            // set health to 0 so that this doesn't count towards majority
+            mem.health = 0.0;
+            theReplSet->rmFromElectable(mem.id());
+        }
+
+        void down(HeartbeatInfo& mem, string msg) {
+            mem.authIssue = false;
+            mem.health = 0.0;
+            mem.ping = 0;
+            if( mem.upSince || mem.downSince == 0 ) {
+                mem.upSince = 0;
+                mem.downSince = jsTime();
+                mem.hbstate = MemberState::RS_DOWN;
+                log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
+            }
+            mem.lastHeartbeatMsg = msg;
+            theReplSet->rmFromElectable(mem.id());
+        }
+
+        void up(const BSONObj& info, HeartbeatInfo& mem) {
+            HeartbeatInfo::numPings++;
+            mem.authIssue = false;
+
+            if( mem.upSince == 0 ) {
+                log() << "replSet member " << h.toString() << " is up" << rsLog;
+                mem.upSince = mem.lastHeartbeat;
+            }
+            mem.health = 1.0;
+            mem.lastHeartbeatMsg = info["hbmsg"].String();
+            if( info.hasElement("opTime") )
+                mem.opTime = info["opTime"].Date();
+
+            // see if this member is in the electable set
+            if( info["e"].eoo() ) {
+                // for backwards compatibility
+                const Member *member = theReplSet->findById(mem.id());
+                if (member && member->config().potentiallyHot()) {
+                    theReplSet->addToElectable(mem.id());
+                }
+                else {
+                    theReplSet->rmFromElectable(mem.id());
+                }
+            }
+            // add this server to the electable set if it is within 10
+            // seconds of the latest optime we know of
+            else if( info["e"].trueValue() &&
+                     mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
+                unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
+                if (lastOp > 0 && mem.opTime >= lastOp - 10) {
+                    theReplSet->addToElectable(mem.id());
+                }
+            }
+            else {
+                theReplSet->rmFromElectable(mem.id());
+            }
+
+            be cfg = info["config"];
+            if( cfg.ok() ) {
+                // received a
new config
+                boost::function<void()> f =
+                    boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
+                theReplSet->mgr->send(f);
+            }
+        }
+    };
+
+    void ReplSetImpl::endOldHealthTasks() {
+        unsigned sz = healthTasks.size();
+        for( set<ReplSetHealthPollTask*>::iterator i = healthTasks.begin(); i != healthTasks.end(); i++ )
+            (*i)->halt();
+        healthTasks.clear();
+        if( sz )
+            DEV log() << "replSet debug: cleared old tasks " << sz << endl;
+    }
+
+    void ReplSetImpl::startHealthTaskFor(Member *m) {
+        ReplSetHealthPollTask *task = new ReplSetHealthPollTask(m->h(), m->hbinfo());
+        healthTasks.insert(task);
+        task::repeat(task, 2000);
+    }
+
+    void startSyncThread();
+
+    /** called during repl set startup. caller expects it to return fairly quickly.
+        note ReplSet object is only created once we get a config - so this won't run
+        until the initiation.
+    */
+    void ReplSetImpl::startThreads() {
+        task::fork(mgr);
+        mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
+
+        boost::thread t(startSyncThread);
+
+        task::fork(ghost);
+
+        // member heartbeats are started in ReplSetImpl::initFromConfig
+    }
+
+}
+
+/* todo:
+   stop bg job and delete on removefromset
+*/
diff --git a/src/mongo/db/repl/manager.cpp b/src/mongo/db/repl/manager.cpp
new file mode 100644
index 00000000000..91648a1b506
--- /dev/null
+++ b/src/mongo/db/repl/manager.cpp
@@ -0,0 +1,274 @@
+/* @file manager.cpp
+*/
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "rs.h"
+#include "connections.h"
+#include "../client.h"
+
+namespace mongo {
+
+    enum {
+        NOPRIMARY = -2,
+        SELFPRIMARY = -1
+    };
+
+    /* check members OTHER THAN US to see if they think they are primary */
+    const Member * Manager::findOtherPrimary(bool& two) {
+        two = false;
+        Member *m = rs->head();
+        Member *p = 0;
+        while( m ) {
+            DEV assert( m != rs->_self );
+            if( m->state().primary() && m->hbinfo().up() ) {
+                if( p ) {
+                    two = true;
+                    return 0;
+                }
+                p = m;
+            }
+            m = m->next();
+        }
+        if( p )
+            noteARemoteIsPrimary(p);
+        return p;
+    }
+
+    Manager::Manager(ReplSetImpl *_rs) :
+        task::Server("rsMgr"), rs(_rs), busyWithElectSelf(false), _primary(NOPRIMARY) {
+    }
+
+    Manager::~Manager() {
+        /* we don't destroy the replset object we sit in; however, the destructor could have thrown on init.
+           the log message below is just a reminder to come back one day and review this code more, and to
+           make it cleaner.
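+           note: rs->mgr is nulled just below, so nothing further can be queued to
+           this task::Server once the manager is gone.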
+ */ + log() << "info: ~Manager called" << rsLog; + rs->mgr = 0; + } + + void Manager::starting() { + Client::initThread("rsMgr"); + replLocalAuth(); + } + + void Manager::noteARemoteIsPrimary(const Member *m) { + if( rs->box.getPrimary() == m ) + return; + rs->_self->lhb() = ""; + if( rs->iAmArbiterOnly() ) { + rs->box.set(MemberState::RS_ARBITER, m); + } + else { + rs->box.noteRemoteIsPrimary(m); + } + } + + void Manager::checkElectableSet() { + unsigned otherOp = rs->lastOtherOpTime().getSecs(); + + // make sure the electable set is up-to-date + if (rs->elect.aMajoritySeemsToBeUp() && + rs->iAmPotentiallyHot() && + (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) { + theReplSet->addToElectable(rs->selfId()); + } + else { + theReplSet->rmFromElectable(rs->selfId()); + } + + // check if we should ask the primary (possibly ourselves) to step down + const Member *highestPriority = theReplSet->getMostElectable(); + const Member *primary = rs->box.getPrimary(); + + if (primary && highestPriority && + highestPriority->config().priority > primary->config().priority) { + log() << "stepping down " << primary->fullName() << endl; + + if (primary->h().isSelf()) { + // replSetStepDown tries to acquire the same lock + // msgCheckNewState takes, so we can't call replSetStepDown on + // ourselves. + rs->relinquish(); + } + else { + BSONObj cmd = BSON( "replSetStepDown" << 1 ); + ScopedConn conn(primary->fullName()); + BSONObj result; + if (!conn.runCommand("admin", cmd, result, 0)) { + log() << "stepping down " << primary->fullName() + << " failed: " << result << endl; + } + } + } + } + + void Manager::checkAuth() { + int down = 0, authIssue = 0, total = 0; + + for( Member *m = rs->head(); m; m=m->next() ) { + total++; + + // all authIssue servers will also be not up + if (!m->hbinfo().up()) { + down++; + if (m->hbinfo().authIssue) { + authIssue++; + } + } + } + + // if all nodes are down or failed auth AND at least one failed + // auth, go into recovering. If all nodes are down, stay a + // secondary. + if (authIssue > 0 && down == total) { + log() << "replset error could not reach/authenticate against any members" << endl; + + if (rs->box.getPrimary() == rs->_self) { + log() << "auth problems, relinquishing primary" << rsLog; + rs->relinquish(); + } + + rs->blockSync(true); + } + else { + rs->blockSync(false); + } + } + + /** called as the health threads get new results */ + void Manager::msgCheckNewState() { + { + theReplSet->assertValid(); + rs->assertValid(); + + RSBase::lock lk(rs); + + if( busyWithElectSelf ) return; + + checkElectableSet(); + checkAuth(); + + const Member *p = rs->box.getPrimary(); + if( p && p != rs->_self ) { + if( !p->hbinfo().up() || + !p->hbinfo().hbstate.primary() ) { + p = 0; + rs->box.setOtherPrimary(0); + } + } + + const Member *p2; + { + bool two; + p2 = findOtherPrimary(two); + if( two ) { + /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */ + log() << "replSet info two primaries (transiently)" << rsLog; + return; + } + } + + if( p2 ) { + /* someone else thinks they are primary. */ + if( p == p2 ) { + // we thought the same; all set. + return; + } + if( p == 0 ) { + noteARemoteIsPrimary(p2); + return; + } + // todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + if( p != rs->_self ) { + // switch primary from oldremotep->newremotep2 + noteARemoteIsPrimary(p2); + return; + } + /* we thought we were primary, yet now someone else thinks they are. 
*/ + if( !rs->elect.aMajoritySeemsToBeUp() ) { + /* we can't see a majority. so the other node is probably the right choice. */ + noteARemoteIsPrimary(p2); + return; + } + /* ignore for now, keep thinking we are master. + this could just be timing (we poll every couple seconds) or could indicate + a problem? if it happens consistently for a duration of time we should + alert the sysadmin. + */ + return; + } + + /* didn't find anyone who wants to be primary */ + + if( p ) { + /* we are already primary */ + + if( p != rs->_self ) { + rs->sethbmsg("error p != rs->self in checkNewState"); + log() << "replSet " << p->fullName() << rsLog; + log() << "replSet " << rs->_self->fullName() << rsLog; + return; + } + + if( rs->elect.shouldRelinquish() ) { + log() << "can't see a majority of the set, relinquishing primary" << rsLog; + rs->relinquish(); + } + + return; + } + + if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary + OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself" << endl; + return; + } + + /* no one seems to be primary. shall we try to elect ourself? */ + if( !rs->elect.aMajoritySeemsToBeUp() ) { + static time_t last; + static int n; + int ll = 0; + if( ++n > 5 ) ll++; + if( last + 60 > time(0 ) ) ll++; + log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog; + last = time(0); + return; + } + + if( !rs->iAmElectable() ) { + return; + } + + busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one. + } + try { + rs->elect.electSelf(); + } + catch(RetryAfterSleepException&) { + /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */ + requeue(); + } + catch(...) { + log() << "replSet error unexpected assertion in rs manager" << rsLog; + } + busyWithElectSelf = false; + } + +} diff --git a/src/mongo/db/repl/multicmd.h b/src/mongo/db/repl/multicmd.h new file mode 100644 index 00000000000..2d70c551f64 --- /dev/null +++ b/src/mongo/db/repl/multicmd.h @@ -0,0 +1,75 @@ +// @file multicmd.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../../util/background.h" +#include "connections.h" + +namespace mongo { + + struct Target { + Target(string hostport) : toHost(hostport), ok(false) { } + //Target() : ok(false) { } + const string toHost; + bool ok; + BSONObj result; + }; + + /** send a command to several servers in parallel. waits for all to complete before + returning. 
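+
+       usage sketch (illustrative only, not part of the original patch; the host
+       names and the command broadcast are hypothetical):
+
+           list<Target> targets;
+           targets.push_back(Target("h1.example.net:27017"));
+           targets.push_back(Target("h2.example.net:27017"));
+           multiCommand(BSON("ping" << 1), targets);    // blocks until every job completes
+           for( list<Target>::iterator i = targets.begin(); i != targets.end(); i++ )
+               if( i->ok )
+                   log() << i->toHost << " -> " << i->result.toString() << rsLog;
+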
+ + in: Target::toHost + out: Target::result and Target::ok + */ + void multiCommand(BSONObj cmd, list<Target>& L); + + class _MultiCommandJob : public BackgroundJob { + public: + BSONObj& cmd; + Target& d; + _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { } + + private: + string name() const { return "MultiCommandJob"; } + void run() { + try { + ScopedConn c(d.toHost); + d.ok = c.runCommand("admin", cmd, d.result); + } + catch(DBException&) { + DEV log() << "dev caught dbexception on multiCommand " << d.toHost << rsLog; + } + } + }; + + inline void multiCommand(BSONObj cmd, list<Target>& L) { + list< shared_ptr<BackgroundJob> > jobs; + + for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) { + Target& d = *i; + _MultiCommandJob *j = new _MultiCommandJob(cmd, d); + jobs.push_back( shared_ptr<BackgroundJob>(j) ); + j->go(); + } + + for( list< shared_ptr<BackgroundJob> >::iterator i = jobs.begin(); i != jobs.end(); i++ ) { + (*i)->wait(); + } + } +} diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp new file mode 100644 index 00000000000..84f16e53466 --- /dev/null +++ b/src/mongo/db/repl/replset_commands.cpp @@ -0,0 +1,404 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../cmdline.h" +#include "../commands.h" +#include "../repl.h" +#include "health.h" +#include "rs.h" +#include "rs_config.h" +#include "../dbwebserver.h" +#include "../../util/mongoutils/html.h" +#include "../../client/dbclient.h" +#include "../repl_block.h" + +using namespace bson; + +namespace mongo { + + void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial); + + /* commands in other files: + replSetHeartbeat - health.cpp + replSetInitiate - rs_mod.cpp + */ + + bool replSetBlind = false; + unsigned replSetForceInitialSyncFailure = 0; + + class CmdReplSetTest : public ReplSetCommand { + public: + virtual void help( stringstream &help ) const { + help << "Just for regression tests.\n"; + } + CmdReplSetTest() : ReplSetCommand("replSetTest") { } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog; + + if (!checkAuth(errmsg, result)) { + return false; + } + + if( cmdObj.hasElement("forceInitialSyncFailure") ) { + replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number(); + return true; + } + + if( !check(errmsg, result) ) + return false; + + if( cmdObj.hasElement("blind") ) { + replSetBlind = cmdObj.getBoolField("blind"); + return true; + } + + if (cmdObj.hasElement("sethbmsg")) { + replset::sethbmsg(cmdObj["sethbmsg"].String()); + return true; + } + + return false; + } + } cmdReplSetTest; + + /** get rollback id. used to check if a rollback happened during some interval of time. 
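+
+       usage sketch (illustrative, not part of the original patch): sample the rbid
+       before a long-running operation against another server and compare afterwards;
+       any change means that server rolled back in the interim:
+
+           int before = getRBID(conn);     // conn: a DBClientConnection* to the remote
+           doLongRunningSyncWork();        // hypothetical work during which a rollback may occur
+           bool rolledBack = getRBID(conn) != before;
+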
+ as consumed, the rollback id is not in any particular order, it simply changes on each rollback. + @see incRBID() + */ + class CmdReplSetGetRBID : public ReplSetCommand { + public: + /* todo: ideally this should only change on rollbacks NOT on mongod restarts also. fix... */ + int rbid; + virtual void help( stringstream &help ) const { + help << "internal"; + } + CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") { + // this is ok but micros or combo with some rand() and/or 64 bits might be better -- + // imagine a restart and a clock correction simultaneously (very unlikely but possible...) + rbid = (int) curTimeMillis64(); + } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( !check(errmsg, result) ) + return false; + result.append("rbid",rbid); + return true; + } + } cmdReplSetRBID; + + /** we increment the rollback id on every rollback event. */ + void incRBID() { + cmdReplSetRBID.rbid++; + } + + /** helper to get rollback id from another server. */ + int getRBID(DBClientConnection *c) { + bo info; + c->simpleCommand("admin", &info, "replSetGetRBID"); + return info["rbid"].numberInt(); + } + + class CmdReplSetGetStatus : public ReplSetCommand { + public: + virtual void help( stringstream &help ) const { + help << "Report status of a replica set from the POV of this server\n"; + help << "{ replSetGetStatus : 1 }"; + help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; + } + CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( cmdObj["forShell"].trueValue() ) + lastError.disableForCommand(); + + if( !check(errmsg, result) ) + return false; + theReplSet->summarizeStatus(result); + return true; + } + } cmdReplSetGetStatus; + + class CmdReplSetReconfig : public ReplSetCommand { + RWLock mutex; /* we don't need rw but we wanted try capability. :-( */ + public: + virtual void help( stringstream &help ) const { + help << "Adjust configuration of a replica set\n"; + help << "{ replSetReconfig : config_object }"; + help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; + } + CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { } + virtual bool run(const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) { + try { + rwlock_try_write lk(mutex); + return _run(a,b,e,errmsg,c,d); + } + catch(rwlock_try_write::exception&) { } + errmsg = "a replSetReconfig is already in progress"; + return false; + } + private: + bool _run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if ( !checkAuth(errmsg, result) ) { + return false; + } + + if( cmdObj["replSetReconfig"].type() != Object ) { + errmsg = "no configuration specified"; + return false; + } + + bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue(); + if( force && !theReplSet ) { + replSettings.reconfig = cmdObj["replSetReconfig"].Obj().getOwned(); + result.append("msg", "will try this config momentarily, try running rs.conf() again in a few seconds"); + return true; + } + + if ( !check(errmsg, result) ) { + return false; + } + + if( !force && !theReplSet->box.getState().primary() ) { + errmsg = "replSetReconfig command must be sent to the current replica set primary."; + return false; + } + + { + // just make sure we can get a write lock before doing anything else. we'll reacquire one + // later. 
of course it could be stuck then, but this check lowers the risk if weird things + // are up - we probably don't want a change to apply 30 minutes after the initial attempt. + time_t t = time(0); + writelock lk(""); + if( time(0)-t > 20 ) { + errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?"; + return false; + } + } + + try { + ReplSetConfig newConfig(cmdObj["replSetReconfig"].Obj(), force); + + log() << "replSet replSetReconfig config object parses ok, " << newConfig.members.size() << " members specified" << rsLog; + + if( !ReplSetConfig::legalChange(theReplSet->getConfig(), newConfig, errmsg) ) { + return false; + } + + checkMembersUpForConfigChange(newConfig, result, false); + + log() << "replSet replSetReconfig [2]" << rsLog; + + theReplSet->haveNewConfig(newConfig, true); + ReplSet::startupStatusMsg.set("replSetReconfig'd"); + } + catch( DBException& e ) { + log() << "replSet replSetReconfig exception: " << e.what() << rsLog; + throw; + } + catch( string& se ) { + log() << "replSet reconfig exception: " << se << rsLog; + errmsg = se; + return false; + } + + resetSlaveCache(); + return true; + } + } cmdReplSetReconfig; + + class CmdReplSetFreeze : public ReplSetCommand { + public: + virtual void help( stringstream &help ) const { + help << "{ replSetFreeze : <seconds> }"; + help << "'freeze' state of member to the extent we can do that. What this really means is that\n"; + help << "this node will not attempt to become primary until the time period specified expires.\n"; + help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n"; + help << "A process restart unfreezes the member also.\n"; + help << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; + } + + CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + if( !check(errmsg, result) ) + return false; + int secs = (int) cmdObj.firstElement().numberInt(); + if( theReplSet->freeze(secs) ) { + if( secs == 0 ) + result.append("info","unfreezing"); + } + if( secs == 1 ) + result.append("warning", "you really want to freeze for only 1 second?"); + return true; + } + } cmdReplSetFreeze; + + class CmdReplSetStepDown: public ReplSetCommand { + public: + virtual void help( stringstream &help ) const { + help << "{ replSetStepDown : <seconds> }\n"; + help << "Step down as primary. 
Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n";
+            help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n";
+            help << "http://www.mongodb.org/display/DOCS/Replica+Set+Commands";
+        }
+
+        CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { }
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+            if( !check(errmsg, result) )
+                return false;
+            if( !theReplSet->box.getState().primary() ) {
+                errmsg = "not primary so can't step down";
+                return false;
+            }
+
+            bool force = cmdObj.hasField("force") && cmdObj["force"].trueValue();
+
+            // only step down if there is another node synced to within 10
+            // seconds of this node
+            if (!force) {
+                long long int lastOp = (long long int)theReplSet->lastOpTimeWritten.getSecs();
+                long long int closest = (long long int)theReplSet->lastOtherOpTime().getSecs();
+
+                long long int diff = lastOp - closest;
+                result.append("closest", closest);
+                result.append("difference", diff);
+
+                if (diff < 0) {
+                    // not our problem, but we'll wait until things settle down
+                    errmsg = "someone is ahead of the primary?";
+                    return false;
+                }
+
+                if (diff > 10) {
+                    errmsg = "no secondaries within 10 seconds of my optime";
+                    return false;
+                }
+            }
+
+            int secs = (int) cmdObj.firstElement().numberInt();
+            if( secs == 0 )
+                secs = 60;
+            return theReplSet->stepDown(secs);
+        }
+    } cmdReplSetStepDown;
+
+    class CmdReplSetMaintenance: public ReplSetCommand {
+    public:
+        virtual void help( stringstream &help ) const {
+            help << "{ replSetMaintenance : bool }\n";
+            help << "Enable or disable maintenance mode.";
+        }
+
+        CmdReplSetMaintenance() : ReplSetCommand("replSetMaintenance") { }
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+            if( !check(errmsg, result) )
+                return false;
+            if( theReplSet->box.getState().primary() ) {
+                errmsg = "primaries can't modify maintenance mode";
+                return false;
+            }
+
+            theReplSet->setMaintenanceMode(cmdObj["replSetMaintenance"].trueValue());
+            return true;
+        }
+    } cmdReplSetMaintenance;
+
+    using namespace bson;
+    using namespace mongoutils::html;
+    extern void fillRsLog(stringstream&);
+
+    class ReplSetHandler : public DbWebHandler {
+    public:
+        ReplSetHandler() : DbWebHandler( "_replSet" , 1 , true ) {}
+
+        virtual bool handles( const string& url ) const {
+            return startsWith( url , "/_replSet" );
+        }
+
+        virtual void handle( const char *rq, string url, BSONObj params,
+                             string& responseMsg, int& responseCode,
+                             vector<string>& headers, const SockAddr &from ) {
+
+            if( url == "/_replSetOplog" ) {
+                responseMsg = _replSetOplog(params);
+            }
+            else
+                responseMsg = _replSet();
+            responseCode = 200;
+        }
+
+        string _replSetOplog(bo parms) {
+            int _id = (int) str::toUnsigned( parms["_id"].String() );
+
+            stringstream s;
+            string t = "Replication oplog";
+            s << start(t);
+            s << p(t);
+
+            if( theReplSet == 0 ) {
+                if( cmdLine._replSet.empty() )
+                    s << p("Not using --replSet");
+                else {
+                    s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+                           + ".<br>" + ReplSet::startupStatusMsg.get());
+                }
+            }
+            else {
+                try {
+                    theReplSet->getOplogDiagsAsHtml(_id, s);
+                }
+                catch(std::exception& e) {
+                    s << "error querying oplog: " << e.what() << '\n';
+                }
+            }
+
+            s << _end();
+            return s.str();
+        }
+
+        /* /_replSet show replica set status in html format
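+           e.g. (illustrative): GET /_replSet renders the summary page below, while
+           GET /_replSetOplog?_id=3 renders oplog diagnostics for the member whose
+           _id is 3 via _replSetOplog() above.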
+        */
+        string _replSet() {
+            stringstream s;
+            s << start("Replica Set Status " + prettyHostName());
+            s << p( a("/", "back", "Home") + " | " +
+                    a("/local/system.replset/?html=1", "", "View Replset Config") + " | " +
+                    a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " +
+                    a("http://www.mongodb.org/display/DOCS/Replica+Sets", "", "Docs")
+                  );
+
+            if( theReplSet == 0 ) {
+                if( cmdLine._replSet.empty() )
+                    s << p("Not using --replSet");
+                else {
+                    s << p("Still starting up, or else set is not yet " + a("http://www.mongodb.org/display/DOCS/Replica+Set+Configuration#InitialSetup", "", "initiated")
+                           + ".<br>" + ReplSet::startupStatusMsg.get());
+                }
+            }
+            else {
+                try {
+                    theReplSet->summarizeAsHtml(s);
+                }
+                catch(...) { s << "error summarizing replset status\n"; }
+            }
+            s << p("Recent replset log activity:");
+            fillRsLog(s);
+            s << _end();
+            return s.str();
+        }
+
+    } replSetHandler;
+
+}
diff --git a/src/mongo/db/repl/rs.cpp b/src/mongo/db/repl/rs.cpp
new file mode 100644
index 00000000000..fff5d72bcc0
--- /dev/null
+++ b/src/mongo/db/repl/rs.cpp
@@ -0,0 +1,778 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../../util/net/sock.h"
+#include "../client.h"
+#include "../../client/dbclient.h"
+#include "../dbhelpers.h"
+#include "../../s/d_logic.h"
+#include "rs.h"
+#include "connections.h"
+#include "../repl.h"
+#include "../instance.h"
+
+using namespace std;
+
+namespace mongo {
+
+    using namespace bson;
+
+    bool replSet = false;
+    ReplSet *theReplSet = 0;
+
+    bool isCurrentlyAReplSetPrimary() {
+        return theReplSet && theReplSet->isPrimary();
+    }
+
+    void replset::sethbmsg(const string& s, const int level) {
+        if (theReplSet) {
+            theReplSet->sethbmsg(s, level);
+        }
+    }
+
+    void ReplSetImpl::sethbmsg(string s, int logLevel) {
+        static time_t lastLogged;
+        _hbmsgTime = time(0);
+
+        if( s == _hbmsg ) {
+            // unchanged
+            if( _hbmsgTime - lastLogged < 60 )
+                return;
+        }
+
+        unsigned sz = s.size();
+        if( sz >= 256 )
+            memcpy(_hbmsg, s.c_str(), 255);
+        else {
+            _hbmsg[sz] = 0;
+            memcpy(_hbmsg, s.c_str(), sz);
+        }
+        if( !s.empty() ) {
+            lastLogged = _hbmsgTime;
+            log(logLevel) << "replSet " << s << rsLog;
+        }
+    }
+
+    void ReplSetImpl::assumePrimary() {
+        LOG(2) << "replSet assuming primary" << endl;
+        assert( iAmPotentiallyHot() );
+        writelock lk("admin."); // so we are synchronized with _logOp()
+
+        // Make sure that new OpTimes are higher than existing ones even with clock skew
+        DBDirectClient c;
+        BSONObj lastOp = c.findOne( "local.oplog.rs", Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk );
+        if ( !lastOp.isEmpty() ) {
+            OpTime::setLast( lastOp[ "ts" ].date() );
+        }
+
+        changeState(MemberState::RS_PRIMARY);
+    }
+
+    void ReplSetImpl::changeState(MemberState s) { box.change(s, _self); }
+
+    void ReplSetImpl::setMaintenanceMode(const bool inc) {
+        lock lk(this);
+
+        if (inc) {
+            log() << "replSet going into maintenance mode
(" << _maintenanceMode << " other tasks)" << rsLog; + + _maintenanceMode++; + changeState(MemberState::RS_RECOVERING); + } + else { + _maintenanceMode--; + // no need to change state, syncTail will try to go live as a secondary soon + + log() << "leaving maintenance mode (" << _maintenanceMode << " other tasks)" << rsLog; + } + } + + Member* ReplSetImpl::getMostElectable() { + lock lk(this); + + Member *max = 0; + + for (set<unsigned>::iterator it = _electableSet.begin(); it != _electableSet.end(); it++) { + const Member *temp = findById(*it); + if (!temp) { + log() << "couldn't find member: " << *it << endl; + _electableSet.erase(*it); + continue; + } + if (!max || max->config().priority < temp->config().priority) { + max = (Member*)temp; + } + } + + return max; + } + + const bool closeOnRelinquish = true; + + void ReplSetImpl::relinquish() { + LOG(2) << "replSet attempting to relinquish" << endl; + if( box.getState().primary() ) { + { + writelock lk("admin."); // so we are synchronized with _logOp() + + log() << "replSet relinquishing primary state" << rsLog; + changeState(MemberState::RS_SECONDARY); + } + + if( closeOnRelinquish ) { + /* close sockets that were talking to us so they don't blithly send many writes that will fail + with "not master" (of course client could check result code, but in case they are not) + */ + log() << "replSet closing client sockets after reqlinquishing primary" << rsLog; + MessagingPort::closeAllSockets(1); + } + + // now that all connections were closed, strip this mongod from all sharding details + // if and when it gets promoted to a primary again, only then it should reload the sharding state + // the rationale here is that this mongod won't bring stale state when it regains primaryhood + shardingState.resetShardingState(); + + } + else if( box.getState().startup2() ) { + // ? add comment + changeState(MemberState::RS_RECOVERING); + } + } + + /* look freshly for who is primary - includes relinquishing ourself. */ + void ReplSetImpl::forgetPrimary() { + if( box.getState().primary() ) + relinquish(); + else { + box.setOtherPrimary(0); + } + } + + // for the replSetStepDown command + bool ReplSetImpl::_stepDown(int secs) { + lock lk(this); + if( box.getState().primary() ) { + elect.steppedDown = time(0) + secs; + log() << "replSet info stepping down as primary secs=" << secs << rsLog; + relinquish(); + return true; + } + return false; + } + + bool ReplSetImpl::_freeze(int secs) { + lock lk(this); + /* note if we are primary we remain primary but won't try to elect ourself again until + this time period expires. 
+ */ + if( secs == 0 ) { + elect.steppedDown = 0; + log() << "replSet info 'unfreezing'" << rsLog; + } + else { + if( !box.getState().primary() ) { + elect.steppedDown = time(0) + secs; + log() << "replSet info 'freezing' for " << secs << " seconds" << rsLog; + } + else { + log() << "replSet info received freeze command but we are primary" << rsLog; + } + } + return true; + } + + void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) { + for( Member *m = _members.head(); m; m=m->next() ) { + if( m->id() == h.id() ) { + m->_hbinfo = h; + return; + } + } + } + + list<HostAndPort> ReplSetImpl::memberHostnames() const { + list<HostAndPort> L; + L.push_back(_self->h()); + for( Member *m = _members.head(); m; m = m->next() ) + L.push_back(m->h()); + return L; + } + + void ReplSetImpl::_fillIsMasterHost(const Member *m, vector<string>& hosts, vector<string>& passives, vector<string>& arbiters) { + assert( m ); + if( m->config().hidden ) + return; + + if( m->potentiallyHot() ) { + hosts.push_back(m->h().toString()); + } + else if( !m->config().arbiterOnly ) { + if( m->config().slaveDelay ) { + /* hmmm - we don't list these as they are stale. */ + } + else { + passives.push_back(m->h().toString()); + } + } + else { + arbiters.push_back(m->h().toString()); + } + } + + void ReplSetImpl::_fillIsMaster(BSONObjBuilder& b) { + lock lk(this); + + const StateBox::SP sp = box.get(); + bool isp = sp.state.primary(); + b.append("setName", name()); + b.append("ismaster", isp); + b.append("secondary", sp.state.secondary()); + { + vector<string> hosts, passives, arbiters; + _fillIsMasterHost(_self, hosts, passives, arbiters); + + for( Member *m = _members.head(); m; m = m->next() ) { + assert( m ); + _fillIsMasterHost(m, hosts, passives, arbiters); + } + + if( hosts.size() > 0 ) { + b.append("hosts", hosts); + } + if( passives.size() > 0 ) { + b.append("passives", passives); + } + if( arbiters.size() > 0 ) { + b.append("arbiters", arbiters); + } + } + + if( !isp ) { + const Member *m = sp.primary; + if( m ) + b.append("primary", m->h().toString()); + } + else { + b.append("primary", _self->fullName()); + } + + if( myConfig().arbiterOnly ) + b.append("arbiterOnly", true); + if( myConfig().priority == 0 && !myConfig().arbiterOnly) + b.append("passive", true); + if( myConfig().slaveDelay ) + b.append("slaveDelay", myConfig().slaveDelay); + if( myConfig().hidden ) + b.append("hidden", true); + if( !myConfig().buildIndexes ) + b.append("buildIndexes", false); + if( !myConfig().tags.empty() ) { + BSONObjBuilder a; + for( map<string,string>::const_iterator i = myConfig().tags.begin(); i != myConfig().tags.end(); i++ ) + a.append((*i).first, (*i).second); + b.append("tags", a.done()); + } + b.append("me", myConfig().h.toString()); + } + + /** @param cfgString <setname>/<seedhost1>,<seedhost2> */ + + void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ) { + const char *p = cfgString.c_str(); + const char *slash = strchr(p, '/'); + if( slash ) + setname = string(p, slash-p); + else + setname = p; + uassert(13093, "bad --replSet config string format is: <setname>[/<seedhost1>,<seedhost2>,...]", !setname.empty()); + + if( slash == 0 ) + return; + + p = slash + 1; + while( 1 ) { + const char *comma = strchr(p, ','); + if( comma == 0 ) comma = strchr(p,0); + if( p == comma ) + break; + { + HostAndPort m; + try { + m = HostAndPort( string(p, comma-p) ); + } + catch(...) 
{ + uassert(13114, "bad --replSet seed hostname", false); + } + uassert(13096, "bad --replSet command line config string - dups?", seedSet.count(m) == 0 ); + seedSet.insert(m); + //uassert(13101, "can't use localhost in replset host list", !m.isLocalHost()); + if( m.isSelf() ) { + log(1) << "replSet ignoring seed " << m.toString() << " (=self)" << rsLog; + } + else + seeds.push_back(m); + if( *comma == 0 ) + break; + p = comma + 1; + } + } + } + + ReplSetImpl::ReplSetImpl(ReplSetCmdline& replSetCmdline) : elect(this), + _currentSyncTarget(0), + _blockSync(false), + _hbmsgTime(0), + _self(0), + _maintenanceMode(0), + mgr( new Manager(this) ), + ghost( new GhostSync(this) ) { + + _cfg = 0; + memset(_hbmsg, 0, sizeof(_hbmsg)); + strcpy( _hbmsg , "initial startup" ); + lastH = 0; + changeState(MemberState::RS_STARTUP); + + _seeds = &replSetCmdline.seeds; + + LOG(1) << "replSet beginning startup..." << rsLog; + + loadConfig(); + + unsigned sss = replSetCmdline.seedSet.size(); + for( Member *m = head(); m; m = m->next() ) { + replSetCmdline.seedSet.erase(m->h()); + } + for( set<HostAndPort>::iterator i = replSetCmdline.seedSet.begin(); i != replSetCmdline.seedSet.end(); i++ ) { + if( i->isSelf() ) { + if( sss == 1 ) { + LOG(1) << "replSet warning self is listed in the seed list and there are no other seeds listed did you intend that?" << rsLog; + } + } + else { + log() << "replSet warning command line seed " << i->toString() << " is not present in the current repl set config" << rsLog; + } + } + } + + void newReplUp(); + + void ReplSetImpl::loadLastOpTimeWritten(bool quiet) { + readlock lk(rsoplog); + BSONObj o; + if( Helpers::getLast(rsoplog, o) ) { + lastH = o["h"].numberLong(); + lastOpTimeWritten = o["ts"]._opTime(); + uassert(13290, "bad replSet oplog entry?", quiet || !lastOpTimeWritten.isNull()); + } + } + + /* call after constructing to start - returns fairly quickly after launching its threads */ + void ReplSetImpl::_go() { + try { + loadLastOpTimeWritten(); + } + catch(std::exception& e) { + log() << "replSet error fatal couldn't query the local " << rsoplog << " collection. Terminating mongod after 30 seconds." << rsLog; + log() << e.what() << rsLog; + sleepsecs(30); + dbexit( EXIT_REPLICATION_ERROR ); + return; + } + + changeState(MemberState::RS_STARTUP2); + startThreads(); + newReplUp(); // oplog.cpp + } + + ReplSetImpl::StartupStatus ReplSetImpl::startupStatus = PRESTART; + DiagStr ReplSetImpl::startupStatusMsg; + + extern BSONObj *getLastErrorDefault; + + void ReplSetImpl::setSelfTo(Member *m) { + // already locked in initFromConfig + _self = m; + _id = m->id(); + _config = m->config(); + if( m ) _buildIndexes = m->config().buildIndexes; + else _buildIndexes = true; + } + + /** @param reconf true if this is a reconfiguration and not an initial load of the configuration. + @return true if ok; throws if config really bad; false if config doesn't include self + */ + bool ReplSetImpl::initFromConfig(ReplSetConfig& c, bool reconf) { + /* NOTE: haveNewConfig() writes the new config to disk before we get here. So + we cannot error out at this point, except fatally. Check errors earlier. + */ + lock lk(this); + + if( getLastErrorDefault || !c.getLastErrorDefaults.isEmpty() ) { + // see comment in dbcommands.cpp for getlasterrordefault + getLastErrorDefault = new BSONObj( c.getLastErrorDefaults ); + } + + list<ReplSetConfig::MemberCfg*> newOnes; + // additive short-cuts the new config setup. If we are just adding a + // node/nodes and nothing else is changing, this is additive. 
If it's
+        // not a reconfig, we're not adding anything
+        bool additive = reconf;
+        {
+            unsigned nfound = 0;
+            int me = 0;
+            for( vector<ReplSetConfig::MemberCfg>::iterator i = c.members.begin(); i != c.members.end(); i++ ) {
+
+                ReplSetConfig::MemberCfg& m = *i;
+                if( m.h.isSelf() ) {
+                    me++;
+                }
+
+                if( reconf ) {
+                    if (m.h.isSelf() && (!_self || (int)_self->id() != m._id)) {
+                        log() << "self doesn't match: " << m._id << rsLog;
+                        assert(false);
+                    }
+
+                    const Member *old = findById(m._id);
+                    if( old ) {
+                        nfound++;
+                        assert( (int) old->id() == m._id );
+                        if( old->config() != m ) {
+                            additive = false;
+                        }
+                    }
+                    else {
+                        newOnes.push_back(&m);
+                    }
+                }
+            }
+            if( me == 0 ) {
+                _members.orphanAll();
+
+                // sending hbs must continue to pick up new config, so we leave
+                // hb threads alone
+
+                // close sockets to force clients to re-evaluate this member
+                MessagingPort::closeAllSockets(0);
+
+                // stop sync thread
+                box.set(MemberState::RS_STARTUP, 0);
+
+                // go into holding pattern
+                log() << "replSet error self not present in the repl set configuration:" << rsLog;
+                log() << c.toString() << rsLog;
+                return false;
+            }
+            uassert( 13302, "replSet error self appears twice in the repl set configuration", me<=1 );
+
+            // if we found different members than the original config, reload everything
+            if( reconf && config().members.size() != nfound )
+                additive = false;
+        }
+
+        _cfg = new ReplSetConfig(c);
+        assert( _cfg->ok() );
+        assert( _name.empty() || _name == _cfg->_id );
+        _name = _cfg->_id;
+        assert( !_name.empty() );
+
+        // this is a shortcut for simple changes
+        if( additive ) {
+            log() << "replSet info : additive change to configuration" << rsLog;
+            for( list<ReplSetConfig::MemberCfg*>::const_iterator i = newOnes.begin(); i != newOnes.end(); i++ ) {
+                ReplSetConfig::MemberCfg *m = *i;
+                Member *mi = new Member(m->h, m->_id, m, false);
+
+                /** we will indicate that new members are up() initially so that we don't relinquish our
+                    primary state because we can't (transiently) see a majority. they should be up as we
+                    check that new members are up before getting here on reconfig anyway.
+                */
+                mi->get_hbinfo().health = 0.1;
+
+                _members.push(mi);
+                startHealthTaskFor(mi);
+            }
+
+            // if we aren't creating new members, we may have to update the
+            // groups for the current ones
+            _cfg->updateMembers(_members);
+
+            return true;
+        }
+
+        // start with no members. if this is a reconfig, drop the old ones.
+        _members.orphanAll();
+
+        endOldHealthTasks();
+
+        int oldPrimaryId = -1;
+        {
+            const Member *p = box.getPrimary();
+            if( p )
+                oldPrimaryId = p->id();
+        }
+        forgetPrimary();
+
+        // not setting _self to 0 as other threads use _self w/o locking
+        int me = 0;
+
+        // For logging
+        string members = "";
+
+        for( vector<ReplSetConfig::MemberCfg>::iterator i = _cfg->members.begin(); i != _cfg->members.end(); i++ ) {
+            ReplSetConfig::MemberCfg& m = *i;
+            Member *mi;
+            members += ( members == "" ?
"" : ", " ) + m.h.toString(); + if( m.h.isSelf() ) { + assert( me++ == 0 ); + mi = new Member(m.h, m._id, &m, true); + if (!reconf) { + log() << "replSet I am " << m.h.toString() << rsLog; + } + setSelfTo(mi); + + if( (int)mi->id() == oldPrimaryId ) + box.setSelfPrimary(mi); + } + else { + mi = new Member(m.h, m._id, &m, false); + _members.push(mi); + startHealthTaskFor(mi); + if( (int)mi->id() == oldPrimaryId ) + box.setOtherPrimary(mi); + } + } + + if( me == 0 ){ + log() << "replSet warning did not detect own host in full reconfig, members " << members << " config: " << c << rsLog; + } + + return true; + } + + // Our own config must be the first one. + bool ReplSetImpl::_loadConfigFinish(vector<ReplSetConfig>& cfgs) { + int v = -1; + ReplSetConfig *highest = 0; + int myVersion = -2000; + int n = 0; + for( vector<ReplSetConfig>::iterator i = cfgs.begin(); i != cfgs.end(); i++ ) { + ReplSetConfig& cfg = *i; + if( ++n == 1 ) myVersion = cfg.version; + if( cfg.ok() && cfg.version > v ) { + highest = &cfg; + v = cfg.version; + } + } + assert( highest ); + + if( !initFromConfig(*highest) ) + return false; + + if( highest->version > myVersion && highest->version >= 0 ) { + log() << "replSet got config version " << highest->version << " from a remote, saving locally" << rsLog; + highest->saveConfigLocally(BSONObj()); + } + return true; + } + + void ReplSetImpl::loadConfig() { + while( 1 ) { + startupStatus = LOADINGCONFIG; + startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)"); + LOG(1) << "loadConfig() " << rsConfigNs << endl; + try { + vector<ReplSetConfig> configs; + try { + configs.push_back( ReplSetConfig(HostAndPort::me()) ); + } + catch(DBException& e) { + log() << "replSet exception loading our local replset configuration object : " << e.toString() << rsLog; + } + for( vector<HostAndPort>::const_iterator i = _seeds->begin(); i != _seeds->end(); i++ ) { + try { + configs.push_back( ReplSetConfig(*i) ); + } + catch( DBException& e ) { + log() << "replSet exception trying to load config from " << *i << " : " << e.toString() << rsLog; + } + } + { + scoped_lock lck( replSettings.discoveredSeeds_mx ); + if( replSettings.discoveredSeeds.size() > 0 ) { + for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); + i != replSettings.discoveredSeeds.end(); + i++) { + try { + configs.push_back( ReplSetConfig(HostAndPort(*i)) ); + } + catch( DBException& ) { + log(1) << "replSet exception trying to load config from discovered seed " << *i << rsLog; + replSettings.discoveredSeeds.erase(*i); + } + } + } + } + + if (!replSettings.reconfig.isEmpty()) { + try { + configs.push_back(ReplSetConfig(replSettings.reconfig, true)); + } + catch( DBException& re) { + log() << "replSet couldn't load reconfig: " << re.what() << rsLog; + replSettings.reconfig = BSONObj(); + } + } + + int nok = 0; + int nempty = 0; + for( vector<ReplSetConfig>::iterator i = configs.begin(); i != configs.end(); i++ ) { + if( i->ok() ) + nok++; + if( i->empty() ) + nempty++; + } + if( nok == 0 ) { + + if( nempty == (int) configs.size() ) { + startupStatus = EMPTYCONFIG; + startupStatusMsg.set("can't get " + rsConfigNs + " config from self or any seed (EMPTYCONFIG)"); + log() << "replSet can't get " << rsConfigNs << " config from self or any seed (EMPTYCONFIG)" << rsLog; + static unsigned once; + if( ++once == 1 ) { + log() << "replSet info you may need to run replSetInitiate -- rs.initiate() in the shell -- if that is not already done" << rsLog; + } + if( _seeds->size() == 0 ) { + LOG(1) << "replSet info 
no seed hosts were specified on the --replSet command line" << rsLog; + } + } + else { + startupStatus = EMPTYUNREACHABLE; + startupStatusMsg.set("can't currently get " + rsConfigNs + " config from self or any seed (EMPTYUNREACHABLE)"); + log() << "replSet can't get " << rsConfigNs << " config from self or any seed (yet)" << rsLog; + } + + sleepsecs(10); + continue; + } + + if( !_loadConfigFinish(configs) ) { + log() << "replSet info Couldn't load config yet. Sleeping 20sec and will try again." << rsLog; + sleepsecs(20); + continue; + } + } + catch(DBException& e) { + startupStatus = BADCONFIG; + startupStatusMsg.set("replSet error loading set config (BADCONFIG)"); + log() << "replSet error loading configurations " << e.toString() << rsLog; + log() << "replSet error replication will not start" << rsLog; + sethbmsg("error loading set config"); + _fatal(); + throw; + } + break; + } + startupStatusMsg.set("? started"); + startupStatus = STARTED; + } + + void ReplSetImpl::_fatal() { + box.set(MemberState::RS_FATAL, 0); + log() << "replSet error fatal, stopping replication" << rsLog; + } + + void ReplSet::haveNewConfig(ReplSetConfig& newConfig, bool addComment) { + bo comment; + if( addComment ) + comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version ); + + newConfig.saveConfigLocally(comment); + + try { + if (initFromConfig(newConfig, true)) { + log() << "replSet replSetReconfig new config saved locally" << rsLog; + } + } + catch(DBException& e) { + if( e.getCode() == 13497 /* removed from set */ ) { + cc().shutdown(); + dbexit( EXIT_CLEAN , "removed from replica set" ); // never returns + assert(0); + } + log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog; + _fatal(); + } + catch(...) { + log() << "replSet error unexpected exception in haveNewConfig()" << rsLog; + _fatal(); + } + } + + void Manager::msgReceivedNewConfig(BSONObj o) { + log() << "replset msgReceivedNewConfig version: " << o["version"].toString() << rsLog; + ReplSetConfig c(o); + if( c.version > rs->config().version ) + theReplSet->haveNewConfig(c, false); + else { + log() << "replSet info msgReceivedNewConfig but version isn't higher " << + c.version << ' ' << rs->config().version << rsLog; + } + } + + /* forked as a thread during startup + it can run quite a while looking for config. but once found, + a separate thread takes over as ReplSetImpl::Manager, and this thread + terminates. + */ + void startReplSets(ReplSetCmdline *replSetCmdline) { + Client::initThread("rsStart"); + try { + assert( theReplSet == 0 ); + if( replSetCmdline == 0 ) { + assert(!replSet); + return; + } + replLocalAuth(); + (theReplSet = new ReplSet(*replSetCmdline))->go(); + } + catch(std::exception& e) { + log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog; + if( theReplSet ) + theReplSet->fatal(); + } + cc().shutdown(); + } + + void replLocalAuth() { + if ( noauth ) + return; + cc().getAuthenticationInfo()->authorize("local","_repl"); + } + + +} + +namespace boost { + + void assertion_failed(char const * expr, char const * function, char const * file, long line) { + mongo::log() << "boost assertion failure " << expr << ' ' << function << ' ' << file << ' ' << line << endl; + } + +} diff --git a/src/mongo/db/repl/rs.h b/src/mongo/db/repl/rs.h new file mode 100644 index 00000000000..8e43204be3b --- /dev/null +++ b/src/mongo/db/repl/rs.h @@ -0,0 +1,667 @@ +// /db/repl/rs.h + +/** +* Copyright (C) 2008 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../../util/concurrency/list.h" +#include "../../util/concurrency/value.h" +#include "../../util/concurrency/msg.h" +#include "../../util/net/hostandport.h" +#include "../commands.h" +#include "../oplog.h" +#include "../oplogreader.h" +#include "rs_exception.h" +#include "rs_optime.h" +#include "rs_member.h" +#include "rs_config.h" + +/** + * Order of Events + * + * On startup, if the --replSet option is present, startReplSets is called. + * startReplSets forks off a new thread for replica set activities. It creates + * the global theReplSet variable and calls go() on it. + * + * theReplSet's constructor changes the replica set's state to RS_STARTUP, + * starts the replica set manager, and loads the config (if the replica set + * has been initialized). + */ + +namespace mongo { + + struct HowToFixUp; + struct Target; + class DBClientConnection; + class ReplSetImpl; + class OplogReader; + extern bool replSet; // true if using repl sets + extern class ReplSet *theReplSet; // null until initialized + extern Tee *rsLog; + + /* member of a replica set */ + class Member : public List1<Member>::Base { + private: + ~Member(); // intentionally unimplemented as should never be called -- see List1<>::Base. 
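+        // copying is likewise disallowed: declared but never defined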
+ Member(const Member&); + public: + Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self); + + string fullName() const { return h().toString(); } + const ReplSetConfig::MemberCfg& config() const { return _config; } + ReplSetConfig::MemberCfg& configw() { return _config; } + const HeartbeatInfo& hbinfo() const { return _hbinfo; } + HeartbeatInfo& get_hbinfo() { return _hbinfo; } + string lhb() const { return _hbinfo.lastHeartbeatMsg; } + MemberState state() const { return _hbinfo.hbstate; } + const HostAndPort& h() const { return _h; } + unsigned id() const { return _hbinfo.id(); } + + bool potentiallyHot() const { return _config.potentiallyHot(); } // not arbiter, not priority 0 + void summarizeMember(stringstream& s) const; + + private: + friend class ReplSetImpl; + ReplSetConfig::MemberCfg _config; + const HostAndPort _h; + HeartbeatInfo _hbinfo; + }; + + namespace replset { + /** + * "Normal" replica set syncing + */ + class SyncTail : public Sync { + public: + virtual ~SyncTail() {} + SyncTail(const string& host) : Sync(host) {} + virtual bool syncApply(const BSONObj &o); + }; + + /** + * Initial clone and sync + */ + class InitialSync : public SyncTail { + public: + InitialSync(const string& host) : SyncTail(host) {} + virtual ~InitialSync() {} + bool oplogApplication(OplogReader& r, const Member* source, const OpTime& applyGTE, const OpTime& minValid); + virtual void applyOp(const BSONObj& o, const OpTime& minvalid); + }; + + // TODO: move hbmsg into an error-keeping class (SERVER-4444) + void sethbmsg(const string& s, const int logLevel=0); + + } // namespace replset + + class Manager : public task::Server { + ReplSetImpl *rs; + bool busyWithElectSelf; + int _primary; + + /** @param two - if true two primaries were seen. this can happen transiently, in addition to our + polling being only occasional. in this case null is returned, but the caller should + not assume primary itself in that situation. + */ + const Member* findOtherPrimary(bool& two); + + void noteARemoteIsPrimary(const Member *); + void checkElectableSet(); + void checkAuth(); + virtual void starting(); + public: + Manager(ReplSetImpl *rs); + virtual ~Manager(); + void msgReceivedNewConfig(BSONObj); + void msgCheckNewState(); + }; + + class GhostSync : public task::Server { + struct GhostSlave : boost::noncopyable { + GhostSlave() : last(0), slave(0), init(false) { } + OplogReader reader; + OpTime last; + Member* slave; + bool init; + }; + /** + * This is a cache of ghost slaves + */ + typedef map< mongo::OID,shared_ptr<GhostSlave> > MAP; + MAP _ghostCache; + RWLock _lock; // protects _ghostCache + ReplSetImpl *rs; + virtual void starting(); + public: + GhostSync(ReplSetImpl *_rs) : task::Server("rsGhostSync"), _lock("GhostSync"), rs(_rs) {} + ~GhostSync() { + log() << "~GhostSync() called" << rsLog; + } + + /** + * Replica sets can sync in a hierarchical fashion, which throws off w + * calculation on the master. percolate() faux-syncs from an upstream + * node so that the primary will know what the slaves are up to. + * + * We can't just directly sync to the primary because it could be + * unreachable, e.g., S1--->S2--->S3--->P. S2 should ghost sync from S3 + * and S3 can ghost sync from the primary. + * + * Say we have an S1--->S2--->P situation and this node is S2. rid + * would refer to S1. S2 would create a ghost slave of S1 and connect + * it to P (_currentSyncTarget). Then it would use this connection to + * pretend to be S1, replicating off of P. 
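+     *
+     * Call-sequence sketch (illustrative, not part of the original patch): when a
+     * downstream node introduces itself, associateSlave(rid, memberId) is called
+     * once; thereafter percolate(rid, last) / updateSlave(oid, last) advance the
+     * ghost's known position so the primary's w accounting stays correct across
+     * the hierarchical chain.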
+     */
+        void percolate(const BSONObj& rid, const OpTime& last);
+        void associateSlave(const BSONObj& rid, const int memberId);
+        void updateSlave(const mongo::OID& id, const OpTime& last);
+    };
+
+    struct Target;
+
+    class Consensus {
+        ReplSetImpl &rs;
+        struct LastYea {
+            LastYea() : when(0), who(0xffffffff) { }
+            time_t when;
+            unsigned who;
+        };
+        static SimpleMutex lyMutex;
+        Guarded<LastYea,lyMutex> ly;
+        unsigned yea(unsigned memberId); // throws VoteException
+        void electionFailed(unsigned meid);
+        void _electSelf();
+        bool weAreFreshest(bool& allUp, int& nTies);
+        bool sleptLast; // slept last elect() pass
+    public:
+        Consensus(ReplSetImpl *t) : rs(*t) {
+            sleptLast = false;
+            steppedDown = 0;
+        }
+
+        /* if we've stepped down, this is when we are allowed to try to elect ourself again.
+           todo: handle possible weirdnesses at clock skews etc.
+        */
+        time_t steppedDown;
+
+        int totalVotes() const;
+        bool aMajoritySeemsToBeUp() const;
+        bool shouldRelinquish() const;
+        void electSelf();
+        void electCmdReceived(BSONObj, BSONObjBuilder*);
+        void multiCommand(BSONObj cmd, list<Target>& L);
+    };
+
+    /**
+     * most operations on a ReplSet object should be done while locked. that
+     * logic is implemented here.
+     *
+     * Order of locking: lock the replica set, then take a rwlock.
+     */
+    class RSBase : boost::noncopyable {
+    public:
+        const unsigned magic;
+        void assertValid() { assert( magic == 0x12345677 ); }
+    private:
+        mongo::mutex m;
+        int _locked;
+        ThreadLocalValue<bool> _lockedByMe;
+    protected:
+        RSBase() : magic(0x12345677), m("RSBase"), _locked(0) { }
+        ~RSBase() {
+            /* this can happen if we throw in the constructor; otherwise never happens. thus we log it as it is quite unusual. */
+            log() << "replSet ~RSBase called" << rsLog;
+        }
+
+    public:
+        class lock {
+            RSBase& rsbase;
+            auto_ptr<scoped_lock> sl;
+        public:
+            lock(RSBase* b) : rsbase(*b) {
+                if( rsbase._lockedByMe.get() )
+                    return; // recursive is ok...
+
+                sl.reset( new scoped_lock(rsbase.m) );
+                DEV assert(rsbase._locked == 0);
+                rsbase._locked++;
+                rsbase._lockedByMe.set(true);
+            }
+            ~lock() {
+                if( sl.get() ) {
+                    assert( rsbase._lockedByMe.get() );
+                    DEV assert(rsbase._locked == 1);
+                    rsbase._lockedByMe.set(false);
+                    rsbase._locked--;
+                }
+            }
+        };
+
+        /* for asserts */
+        bool locked() const { return _locked != 0; }
+
+        /* if true, is locked, and was locked by this thread. note if false, it could be in the lock or not for another thread.
+           just for asserts & such so we can make the contracts clear on who locks what when.
+           we don't use these locks that frequently, so the little bit of overhead is fine.
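+
+           a minimal usage sketch (the same pattern as the electable-set helpers below):
+             lock lk(this);            // reentrant within a thread via _lockedByMe
+             _electableSet.insert(m);  // ...mutate state guarded by the lock...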
+ */ + bool lockedByMe() { return _lockedByMe.get(); } + }; + + class ReplSetHealthPollTask; + + /* safe container for our state that keeps member pointer and state variables always aligned */ + class StateBox : boost::noncopyable { + public: + struct SP { // SP is like pair<MemberState,const Member *> but nicer + SP() : state(MemberState::RS_STARTUP), primary(0) { } + MemberState state; + const Member *primary; + }; + const SP get() { + rwlock lk(m, false); + return sp; + } + MemberState getState() const { + rwlock lk(m, false); + return sp.state; + } + const Member* getPrimary() const { + rwlock lk(m, false); + return sp.primary; + } + void change(MemberState s, const Member *self) { + rwlock lk(m, true); + if( sp.state != s ) { + log() << "replSet " << s.toString() << rsLog; + } + sp.state = s; + if( s.primary() ) { + sp.primary = self; + } + else { + if( self == sp.primary ) + sp.primary = 0; + } + } + void set(MemberState s, const Member *p) { + rwlock lk(m, true); + sp.state = s; + sp.primary = p; + } + void setSelfPrimary(const Member *self) { change(MemberState::RS_PRIMARY, self); } + void setOtherPrimary(const Member *mem) { + rwlock lk(m, true); + assert( !sp.state.primary() ); + sp.primary = mem; + } + void noteRemoteIsPrimary(const Member *remote) { + rwlock lk(m, true); + if( !sp.state.secondary() && !sp.state.fatal() ) + sp.state = MemberState::RS_RECOVERING; + sp.primary = remote; + } + StateBox() : m("StateBox") { } + private: + RWLock m; + SP sp; + }; + + void parseReplsetCmdLine(string cfgString, string& setname, vector<HostAndPort>& seeds, set<HostAndPort>& seedSet ); + + /** Parameter given to the --replSet command line option (parsed). + Syntax is "<setname>/<seedhost1>,<seedhost2>" + where setname is a name and seedhost is "<host>[:<port>]" */ + class ReplSetCmdline { + public: + ReplSetCmdline(string cfgString) { parseReplsetCmdLine(cfgString, setname, seeds, seedSet); } + string setname; + vector<HostAndPort> seeds; + set<HostAndPort> seedSet; + }; + + /* information about the entire repl set, such as the various servers in the set, and their state */ + /* note: We currently do not free mem when the set goes away - it is assumed the replset is a + singleton and long lived. + */ + class ReplSetImpl : protected RSBase { + public: + /** info on our state if the replset isn't yet "up". for example, if we are pre-initiation. */ + enum StartupStatus { + PRESTART=0, LOADINGCONFIG=1, BADCONFIG=2, EMPTYCONFIG=3, + EMPTYUNREACHABLE=4, STARTED=5, SOON=6 + }; + static StartupStatus startupStatus; + static DiagStr startupStatusMsg; + static string stateAsHtml(MemberState state); + + /* todo thread */ + void msgUpdateHBInfo(HeartbeatInfo); + + StateBox box; + + OpTime lastOpTimeWritten; + long long lastH; // hash we use to make sure we are reading the right flow of ops and aren't on an out-of-date "fork" + private: + set<ReplSetHealthPollTask*> healthTasks; + void endOldHealthTasks(); + void startHealthTaskFor(Member *m); + + Consensus elect; + void relinquish(); + void forgetPrimary(); + protected: + bool _stepDown(int secs); + bool _freeze(int secs); + private: + void assumePrimary(); + void loadLastOpTimeWritten(bool quiet=false); + void changeState(MemberState s); + + /** + * Find the closest member (using ping time) with a higher latest optime. 
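+         * (Concretely, per the implementation in rs_initialsync.cpp below: among
+         * up members whose hbinfo().opTime is ahead of our lastOpTimeWritten,
+         * prefer the one with the smallest hbinfo().ping, skipping recently
+         * vetoed hosts.)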
+         */
+        Member* getMemberToSyncTo();
+        void veto(const string& host, unsigned secs=10);
+        Member* _currentSyncTarget;
+
+        bool _blockSync;
+        void blockSync(bool block);
+
+        // set of electable members' _ids
+        set<unsigned> _electableSet;
+    protected:
+        // "heartbeat message"
+        // sent in requestHeartbeat responses in field "hbm"
+        char _hbmsg[256]; // we change this unlocked, thus not an stl::string
+        time_t _hbmsgTime; // when it was logged
+    public:
+        void sethbmsg(string s, int logLevel = 0);
+
+        /**
+         * Election with Priorities
+         *
+         * Each node (n) keeps a set of nodes that could be elected primary.
+         * Each node in this set:
+         *
+         * 1. can connect to a majority of the set
+         * 2. has a priority greater than 0
+         * 3. has an optime within 10 seconds of the most up-to-date node
+         *    that n can reach
+         *
+         * If a node fails to meet one or more of these criteria, it is removed
+         * from the list. This list is updated whenever the node receives a
+         * heartbeat.
+         *
+         * When a node sends an "am I freshest?" query, the node receiving the
+         * query checks its electable list to make sure that no one else is
+         * electable AND higher priority. If this check passes, the node will
+         * return an "ok" response, if not, it will veto.
+         *
+         * If a node is primary and there is another node with higher priority
+         * on the electable list (i.e., it must be synced to within 10 seconds
+         * of the current primary), the node (or nodes) with connections to both
+         * the primary and the secondary with higher priority will issue
+         * replSetStepDown requests to the primary to allow the higher-priority
+         * node to take over.
+         */
+        void addToElectable(const unsigned m) { lock lk(this); _electableSet.insert(m); }
+        void rmFromElectable(const unsigned m) { lock lk(this); _electableSet.erase(m); }
+        bool iAmElectable() { lock lk(this); return _electableSet.find(_self->id()) != _electableSet.end(); }
+        bool isElectable(const unsigned id) { lock lk(this); return _electableSet.find(id) != _electableSet.end(); }
+        Member* getMostElectable();
+    protected:
+        /**
+         * Load a new config as the replica set's main config.
+         *
+         * If there is a "simple" change (just adding a node), this shortcuts
+         * the config. Returns true if the config was changed. Returns false
+         * if the config doesn't include this node. Throws an exception if
+         * something goes very wrong.
+         *
+         * Behavior to note:
+         * - locks this
+         * - intentionally leaks the old _cfg and any old _members (if the
+         *   change isn't strictly additive)
+         */
+        bool initFromConfig(ReplSetConfig& c, bool reconf=false);
+        void _fillIsMaster(BSONObjBuilder&);
+        void _fillIsMasterHost(const Member*, vector<string>&, vector<string>&, vector<string>&);
+        const ReplSetConfig& config() { return *_cfg; }
+        string name() const { return _name; } /* @return replica set's logical name */
+        MemberState state() const { return box.getState(); }
+        void _fatal();
+        void _getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const;
+        void _summarizeAsHtml(stringstream&) const;
+        void _summarizeStatus(BSONObjBuilder&) const; // for replSetGetStatus command
+
+        /* throws exception if a problem initializing. */
+        ReplSetImpl(ReplSetCmdline&);
+
+        /* call after constructing to start - returns fairly quickly after launching its threads */
+        void _go();
+
+    private:
+        string _name;
+        const vector<HostAndPort> *_seeds;
+        ReplSetConfig *_cfg;
+
+        /**
+         * Finds the configuration with the highest version number and attempts to
+         * load it.
+ */ + bool _loadConfigFinish(vector<ReplSetConfig>& v); + /** + * Gather all possible configs (from command line seeds, our own config + * doc, and any hosts listed therein) and try to initiate from the most + * recent config we find. + */ + void loadConfig(); + + list<HostAndPort> memberHostnames() const; + const ReplSetConfig::MemberCfg& myConfig() const { return _config; } + bool iAmArbiterOnly() const { return myConfig().arbiterOnly; } + bool iAmPotentiallyHot() const { + return myConfig().potentiallyHot() && // not an arbiter + elect.steppedDown <= time(0) && // not stepped down/frozen + state() == MemberState::RS_SECONDARY; // not stale + } + protected: + Member *_self; + bool _buildIndexes; // = _self->config().buildIndexes + void setSelfTo(Member *); // use this as it sets buildIndexes var + private: + List1<Member> _members; // all members of the set EXCEPT _self. + ReplSetConfig::MemberCfg _config; // config of _self + unsigned _id; // _id of _self + + int _maintenanceMode; // if we should stay in recovering state + public: + // this is called from within a writelock in logOpRS + unsigned selfId() const { return _id; } + Manager *mgr; + GhostSync *ghost; + /** + * This forces a secondary to go into recovering state and stay there + * until this is called again, passing in "false". Multiple threads can + * call this and it will leave maintenance mode once all of the callers + * have called it again, passing in false. + */ + void setMaintenanceMode(const bool inc); + private: + Member* head() const { return _members.head(); } + public: + const Member* findById(unsigned id) const; + private: + void _getTargets(list<Target>&, int &configVersion); + void getTargets(list<Target>&, int &configVersion); + void startThreads(); + friend class FeedbackThread; + friend class CmdReplSetElect; + friend class Member; + friend class Manager; + friend class GhostSync; + friend class Consensus; + + private: + bool initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid); + void _syncDoInitialSync(); + void syncDoInitialSync(); + void _syncThread(); + bool tryToGoLiveAsASecondary(OpTime&); // readlocks + void syncTail(); + unsigned _syncRollback(OplogReader& r); + void syncRollback(OplogReader& r); + void syncFixUp(HowToFixUp& h, OplogReader& r); + + // get an oplog reader for a server with an oplog entry timestamp greater + // than or equal to minTS, if set. + Member* _getOplogReader(OplogReader& r, const OpTime& minTS); + + // check lastOpTimeWritten against the remote's earliest op, filling in + // remoteOldestOp. 
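+        // (presumably: we are stale if remoteOldestOp's "ts" is newer than our
+        // lastOpTimeWritten, i.e. the remote's oplog no longer overlaps ours)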
+        bool _isStale(OplogReader& r, const OpTime& minTS, BSONObj& remoteOldestOp);
+
+        // keep a list of hosts that we've tried recently that didn't work
+        map<string,time_t> _veto;
+    public:
+        void syncThread();
+        const OpTime lastOtherOpTime() const;
+    };
+
+    class ReplSet : public ReplSetImpl {
+    public:
+        ReplSet(ReplSetCmdline& replSetCmdline) : ReplSetImpl(replSetCmdline) { }
+
+        // for the replSetStepDown command
+        bool stepDown(int secs) { return _stepDown(secs); }
+
+        // for the replSetFreeze command
+        bool freeze(int secs) { return _freeze(secs); }
+
+        string selfFullName() {
+            assert( _self );
+            return _self->fullName();
+        }
+
+        bool buildIndexes() const { return _buildIndexes; }
+
+        /* call after constructing to start - returns fairly quickly after launching its threads */
+        void go() { _go(); }
+
+        void fatal() { _fatal(); }
+        bool isPrimary() { return box.getState().primary(); }
+        bool isSecondary() { return box.getState().secondary(); }
+        MemberState state() const { return ReplSetImpl::state(); }
+        string name() const { return ReplSetImpl::name(); }
+        const ReplSetConfig& config() { return ReplSetImpl::config(); }
+        void getOplogDiagsAsHtml(unsigned server_id, stringstream& ss) const { _getOplogDiagsAsHtml(server_id,ss); }
+        void summarizeAsHtml(stringstream& ss) const { _summarizeAsHtml(ss); }
+        void summarizeStatus(BSONObjBuilder& b) const { _summarizeStatus(b); }
+        void fillIsMaster(BSONObjBuilder& b) { _fillIsMaster(b); }
+
+        /**
+         * We have a new config (reconfig) - apply it.
+         * @param comment write a no-op comment to the oplog about it. only
+         * makes sense if one is primary and initiating the reconf.
+         *
+         * The slaves are updated when they get a heartbeat indicating the new
+         * config. The comment is a no-op.
+         */
+        void haveNewConfig(ReplSetConfig& c, bool comment);
+
+        /**
+         * Pointer assignment isn't necessarily atomic, so this needs to assure
+         * locking, even though we don't delete old configs.
+         */
+        const ReplSetConfig& getConfig() { return config(); }
+
+        bool lockedByMe() { return RSBase::lockedByMe(); }
+
+        // heartbeat msg to send to others; descriptive diagnostic info
+        string hbmsg() const {
+            if( time(0)-_hbmsgTime > 120 ) return "";
+            return _hbmsg;
+        }
+    };
+
+    /**
+     * Base class for repl set commands. Checks basic things, such as whether
+     * we're in rs mode, before the command does its real work.
+     */
+    class ReplSetCommand : public Command {
+    protected:
+        ReplSetCommand(const char * s, bool show=false) : Command(s, show) { }
+        virtual bool slaveOk() const { return true; }
+        virtual bool adminOnly() const { return true; }
+        virtual bool logTheOp() { return false; }
+        virtual LockType locktype() const { return NONE; }
+        virtual void help( stringstream &help ) const { help << "internal"; }
+
+        /**
+         * Some replica set commands call this and then call check(). This is
+         * intentional, as they might do things before theReplSet is initialized
+         * that still need to be checked for auth.
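+         *
+         * A typical call pattern in such a command would be (sketch):
+         *
+         *   if( !checkAuth(errmsg, result) ) return false;
+         *   // ... work that may precede theReplSet initialization ...
+         *   if( !check(errmsg, result) ) return false; // check() re-runs checkAuth()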
+ */ + bool checkAuth(string& errmsg, BSONObjBuilder& result) { + if( !noauth ) { + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + if (!ai->isAuthorizedForLock("admin", locktype())) { + errmsg = "replSet command unauthorized"; + return false; + } + } + return true; + } + + bool check(string& errmsg, BSONObjBuilder& result) { + if( !replSet ) { + errmsg = "not running with --replSet"; + if( cmdLine.configsvr ) { + result.append("info", "configsvr"); // for shell prompt + } + return false; + } + + if( theReplSet == 0 ) { + result.append("startupStatus", ReplSet::startupStatus); + string s; + errmsg = ReplSet::startupStatusMsg.empty() ? "replset unknown error 2" : ReplSet::startupStatusMsg.get(); + if( ReplSet::startupStatus == 3 ) + result.append("info", "run rs.initiate(...) if not yet done for the set"); + return false; + } + + return checkAuth(errmsg, result); + } + }; + + /** + * does local authentication + * directly authorizes against AuthenticationInfo + */ + void replLocalAuth(); + + /** inlines ----------------- */ + + inline Member::Member(HostAndPort h, unsigned ord, ReplSetConfig::MemberCfg *c, bool self) : + _config(*c), _h(h), _hbinfo(ord) { + assert(c); + if( self ) + _hbinfo.health = 1.0; + } + +} diff --git a/src/mongo/db/repl/rs_config.cpp b/src/mongo/db/repl/rs_config.cpp new file mode 100644 index 00000000000..22137773aec --- /dev/null +++ b/src/mongo/db/repl/rs_config.cpp @@ -0,0 +1,662 @@ +// rs_config.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "rs.h" +#include "../../client/dbclient.h" +#include "../../client/syncclusterconnection.h" +#include "../../util/net/hostandport.h" +#include "../dbhelpers.h" +#include "connections.h" +#include "../oplog.h" +#include "../instance.h" +#include "../../util/text.h" +#include <boost/algorithm/string.hpp> + +using namespace bson; + +namespace mongo { + + void logOpInitiate(const bo&); + + void assertOnlyHas(BSONObj o, const set<string>& fields) { + BSONObj::iterator i(o); + while( i.more() ) { + BSONElement e = i.next(); + if( !fields.count( e.fieldName() ) ) { + uasserted(13434, str::stream() << "unexpected field '" << e.fieldName() << "' in object"); + } + } + } + + list<HostAndPort> ReplSetConfig::otherMemberHostnames() const { + list<HostAndPort> L; + for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); i++ ) { + if( !i->h.isSelf() ) + L.push_back(i->h); + } + return L; + } + + /* comment MUST only be set when initiating the set by the initiator */ + void ReplSetConfig::saveConfigLocally(bo comment) { + checkRsConfig(); + log() << "replSet info saving a newer config version to local.system.replset" << rsLog; + { + writelock lk(""); + Client::Context cx( rsConfigNs ); + cx.db()->flushFiles(true); + + //theReplSet->lastOpTimeWritten = ??; + //rather than above, do a logOp()? 
probably + BSONObj o = asBson(); + Helpers::putSingletonGod(rsConfigNs.c_str(), o, false/*logOp=false; local db so would work regardless...*/); + if( !comment.isEmpty() && (!theReplSet || theReplSet->isPrimary()) ) + logOpInitiate(comment); + + cx.db()->flushFiles(true); + } + log() << "replSet saveConfigLocally done" << rsLog; + } + + bo ReplSetConfig::MemberCfg::asBson() const { + bob b; + b << "_id" << _id; + b.append("host", h.dynString()); + if( votes != 1 ) b << "votes" << votes; + if( priority != 1.0 ) b << "priority" << priority; + if( arbiterOnly ) b << "arbiterOnly" << true; + if( slaveDelay ) b << "slaveDelay" << slaveDelay; + if( hidden ) b << "hidden" << hidden; + if( !buildIndexes ) b << "buildIndexes" << buildIndexes; + if( !tags.empty() ) { + BSONObjBuilder a; + for( map<string,string>::const_iterator i = tags.begin(); i != tags.end(); i++ ) + a.append((*i).first, (*i).second); + b.append("tags", a.done()); + } + return b.obj(); + } + + void ReplSetConfig::updateMembers(List1<Member> &dest) { + for (vector<MemberCfg>::iterator source = members.begin(); source < members.end(); source++) { + for( Member *d = dest.head(); d; d = d->next() ) { + if (d->fullName() == (*source).h.toString()) { + d->configw().groupsw() = (*source).groups(); + } + } + } + } + + bo ReplSetConfig::asBson() const { + bob b; + b.append("_id", _id).append("version", version); + + BSONArrayBuilder a; + for( unsigned i = 0; i < members.size(); i++ ) + a.append( members[i].asBson() ); + b.append("members", a.arr()); + + if( !ho.isDefault() || !getLastErrorDefaults.isEmpty() || !rules.empty()) { + bob settings; + if( !rules.empty() ) { + bob modes; + for (map<string,TagRule*>::const_iterator it = rules.begin(); it != rules.end(); it++) { + bob clauses; + vector<TagClause*> r = (*it).second->clauses; + for (vector<TagClause*>::iterator it2 = r.begin(); it2 < r.end(); it2++) { + clauses << (*it2)->name << (*it2)->target; + } + modes << (*it).first << clauses.obj(); + } + settings << "getLastErrorModes" << modes.obj(); + } + if( !getLastErrorDefaults.isEmpty() ) + settings << "getLastErrorDefaults" << getLastErrorDefaults; + b << "settings" << settings.obj(); + } + + return b.obj(); + } + + static inline void mchk(bool expr) { + uassert(13126, "bad Member config", expr); + } + + void ReplSetConfig::MemberCfg::check() const { + mchk(_id >= 0 && _id <= 255); + mchk(priority >= 0 && priority <= 1000); + mchk(votes <= 100); // votes >= 0 because it is unsigned + uassert(13419, "priorities must be between 0.0 and 100.0", priority >= 0.0 && priority <= 100.0); + uassert(13437, "slaveDelay requires priority be zero", slaveDelay == 0 || priority == 0); + uassert(13438, "bad slaveDelay value", slaveDelay >= 0 && slaveDelay <= 3600 * 24 * 366); + uassert(13439, "priority must be 0 when hidden=true", priority == 0 || !hidden); + uassert(13477, "priority must be 0 when buildIndexes=false", buildIndexes || priority == 0); + } +/* + string ReplSetConfig::TagSubgroup::toString() const { + bool first = true; + string result = "\""+name+"\": ["; + for (set<const MemberCfg*>::const_iterator i = m.begin(); i != m.end(); i++) { + if (!first) { + result += ", "; + } + first = false; + result += (*i)->h.toString(); + } + return result+"]"; + } + */ + string ReplSetConfig::TagClause::toString() const { + string result = name+": {"; + for (map<string,TagSubgroup*>::const_iterator i = subgroups.begin(); i != subgroups.end(); i++) { +//TEMP? 
result += (*i).second->toString()+", "; + } + result += "TagClause toString TEMPORARILY DISABLED"; + return result + "}"; + } + + string ReplSetConfig::TagRule::toString() const { + string result = "{"; + for (vector<TagClause*>::const_iterator it = clauses.begin(); it < clauses.end(); it++) { + result += ((TagClause*)(*it))->toString()+","; + } + return result+"}"; + } + + void ReplSetConfig::TagSubgroup::updateLast(const OpTime& op) { + RACECHECK + if (last < op) { + last = op; + + for (vector<TagClause*>::iterator it = clauses.begin(); it < clauses.end(); it++) { + (*it)->updateLast(op); + } + } + } + + void ReplSetConfig::TagClause::updateLast(const OpTime& op) { + RACECHECK + if (last >= op) { + return; + } + + // check at least n subgroups greater than clause.last + int count = 0; + map<string,TagSubgroup*>::iterator it; + for (it = subgroups.begin(); it != subgroups.end(); it++) { + if ((*it).second->last >= op) { + count++; + } + } + + if (count >= actualTarget) { + last = op; + rule->updateLast(op); + } + } + + void ReplSetConfig::TagRule::updateLast(const OpTime& op) { + OpTime *earliest = (OpTime*)&op; + vector<TagClause*>::iterator it; + + for (it = clauses.begin(); it < clauses.end(); it++) { + if ((*it)->last < *earliest) { + earliest = &(*it)->last; + } + } + + // rules are simply and-ed clauses, so whatever the most-behind + // clause is at is what the rule is at + last = *earliest; + } + + /** @param o old config + @param n new config + */ + /*static*/ + bool ReplSetConfig::legalChange(const ReplSetConfig& o, const ReplSetConfig& n, string& errmsg) { + assert( theReplSet ); + + if( o._id != n._id ) { + errmsg = "set name may not change"; + return false; + } + /* TODO : wonder if we need to allow o.version < n.version only, which is more lenient. + if someone had some intermediate config this node doesnt have, that could be + necessary. but then how did we become primary? so perhaps we are fine as-is. + */ + if( o.version >= n.version ) { + errmsg = str::stream() << "version number must increase, old: " + << o.version << " new: " << n.version; + return false; + } + + map<HostAndPort,const ReplSetConfig::MemberCfg*> old; + bool isLocalHost = false; + for( vector<ReplSetConfig::MemberCfg>::const_iterator i = o.members.begin(); i != o.members.end(); i++ ) { + if (i->h.isLocalHost()) { + isLocalHost = true; + } + old[i->h] = &(*i); + } + int me = 0; + for( vector<ReplSetConfig::MemberCfg>::const_iterator i = n.members.begin(); i != n.members.end(); i++ ) { + const ReplSetConfig::MemberCfg& m = *i; + if ( (isLocalHost && !m.h.isLocalHost()) || (!isLocalHost && m.h.isLocalHost())) { + log() << "reconfig error, cannot switch between localhost and hostnames: " + << m.h.toString() << rsLog; + uasserted(13645, "hosts cannot switch between localhost and hostname"); + } + if( old.count(m.h) ) { + const ReplSetConfig::MemberCfg& oldCfg = *old[m.h]; + if( oldCfg._id != m._id ) { + log() << "replSet reconfig error with member: " << m.h.toString() << rsLog; + uasserted(13432, "_id may not change for members"); + } + if( oldCfg.buildIndexes != m.buildIndexes ) { + log() << "replSet reconfig error with member: " << m.h.toString() << rsLog; + uasserted(13476, "buildIndexes may not change for members"); + } + /* are transitions to and from arbiterOnly guaranteed safe? if not, we should disallow here. + there is a test at replsets/replsetarb3.js */ + if( oldCfg.arbiterOnly != m.arbiterOnly ) { + log() << "replSet reconfig error with member: " << m.h.toString() << " arbiterOnly cannot change. 
remove and readd the member instead " << rsLog; + uasserted(13510, "arbiterOnly may not change for members"); + } + } + if( m.h.isSelf() ) + me++; + } + + uassert(13433, "can't find self in new replset config", me == 1); + + return true; + } + + void ReplSetConfig::clear() { + version = -5; + _ok = false; + } + + void ReplSetConfig::setMajority() { + int total = members.size(); + int nonArbiters = total; + int strictMajority = total/2+1; + + for (vector<MemberCfg>::iterator it = members.begin(); it < members.end(); it++) { + if ((*it).arbiterOnly) { + nonArbiters--; + } + } + + // majority should be all "normal" members if we have something like 4 + // arbiters & 3 normal members + _majority = (strictMajority > nonArbiters) ? nonArbiters : strictMajority; + } + + int ReplSetConfig::getMajority() const { + return _majority; + } + + void ReplSetConfig::checkRsConfig() const { + uassert(13132, + str::stream() << "nonmatching repl set name in _id field: " << _id << " vs. " << cmdLine.ourSetName(), + _id == cmdLine.ourSetName()); + uassert(13308, "replSet bad config version #", version > 0); + uassert(13133, "replSet bad config no members", members.size() >= 1); + uassert(13309, "replSet bad config maximum number of members is 12", members.size() <= 12); + { + unsigned voters = 0; + for( vector<MemberCfg>::const_iterator i = members.begin(); i != members.end(); ++i ) { + if( i->votes ) + voters++; + } + uassert(13612, "replSet bad config maximum number of voting members is 7", voters <= 7); + uassert(13613, "replSet bad config no voting members", voters > 0); + } + } + + void ReplSetConfig::_populateTagMap(map<string,TagClause> &tagMap) { + // create subgroups for each server corresponding to each of + // its tags. E.g.: + // + // A is tagged with {"server" : "A", "dc" : "ny"} + // B is tagged with {"server" : "B", "dc" : "ny"} + // + // At the end of this step, tagMap will contain: + // + // "server" => {"A" : [A], "B" : [B]} + // "dc" => {"ny" : [A,B]} + + for (unsigned i=0; i<members.size(); i++) { + MemberCfg member = members[i]; + + for (map<string,string>::iterator tag = member.tags.begin(); tag != member.tags.end(); tag++) { + string label = (*tag).first; + string value = (*tag).second; + + TagClause& clause = tagMap[label]; + clause.name = label; + + TagSubgroup* subgroup; + // search for "ny" in "dc"'s clause + if (clause.subgroups.find(value) == clause.subgroups.end()) { + clause.subgroups[value] = subgroup = new TagSubgroup(value); + } + else { + subgroup = clause.subgroups[value]; + } + + subgroup->m.insert(&members[i]); + } + } + } + + void ReplSetConfig::parseRules(const BSONObj& modes) { + map<string,TagClause> tagMap; + _populateTagMap(tagMap); + + for (BSONObj::iterator i = modes.begin(); i.more(); ) { + unsigned int primaryOnly = 0; + + // ruleName : {dc : 2, m : 3} + BSONElement rule = i.next(); + uassert(14046, "getLastErrorMode rules must be objects", rule.type() == mongo::Object); + + TagRule* r = new TagRule(); + + BSONObj clauseObj = rule.Obj(); + for (BSONObj::iterator c = clauseObj.begin(); c.more(); ) { + BSONElement clauseElem = c.next(); + uassert(14829, "getLastErrorMode criteria must be numeric", clauseElem.isNumber()); + + // get the clause, e.g., "x.y" : 3 + const char *criteria = clauseElem.fieldName(); + int value = clauseElem.numberInt(); + uassert(14828, str::stream() << "getLastErrorMode criteria must be greater than 0: " << clauseElem, value > 0); + + TagClause* node = new TagClause(tagMap[criteria]); + + int numGroups = node->subgroups.size(); + 
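+                // sanity check against the tag map: a mode entry like {"dc" : 2}
+                // needs at least 2 distinct values (subgroups) of the "dc" tag
+                // among the members, which is what the uassert below enforces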
uassert(14831, str::stream() << "mode " << clauseObj << " requires " + << value << " tagged with " << criteria << ", but only " + << numGroups << " with this tag were found", numGroups >= value); + + node->name = criteria; + node->target = value; + // if any subgroups contain "me", we can decrease the target + node->actualTarget = node->target; + + // then we want to add pointers between clause & subgroup + for (map<string,TagSubgroup*>::iterator sgs = node->subgroups.begin(); + sgs != node->subgroups.end(); sgs++) { + bool foundMe = false; + (*sgs).second->clauses.push_back(node); + + // if this subgroup contains the primary, it's automatically always up-to-date + for( set<MemberCfg*>::const_iterator cfg = (*sgs).second->m.begin(); + cfg != (*sgs).second->m.end(); + cfg++) + { + if ((*cfg)->h.isSelf()) { + node->actualTarget--; + foundMe = true; + } + } + + for (set<MemberCfg *>::iterator cfg = (*sgs).second->m.begin(); + !foundMe && cfg != (*sgs).second->m.end(); cfg++) { + (*cfg)->groupsw().insert((*sgs).second); + } + } + + // if all of the members of this clause involve the primary, it's always up-to-date + if (node->actualTarget == 0) { + node->last = OpTime(INT_MAX, INT_MAX); + primaryOnly++; + } + + // this is a valid clause, so we want to add it to its rule + node->rule = r; + r->clauses.push_back(node); + } + + // if all of the clauses are satisfied by the primary, this rule is trivially true + if (primaryOnly == r->clauses.size()) { + r->last = OpTime(INT_MAX, INT_MAX); + } + + // if we got here, this is a valid rule + LOG(1) << "replSet new rule " << rule.fieldName() << ": " << r->toString() << rsLog; + rules[rule.fieldName()] = r; + } + } + + void ReplSetConfig::from(BSONObj o) { + static const string legal[] = {"_id","version", "members","settings"}; + static const set<string> legals(legal, legal + 4); + assertOnlyHas(o, legals); + + md5 = o.md5(); + _id = o["_id"].String(); + if( o["version"].ok() ) { + version = o["version"].numberInt(); + uassert(13115, "bad " + rsConfigNs + " config: version", version > 0); + } + + set<string> hosts; + set<int> ords; + vector<BSONElement> members; + try { + members = o["members"].Array(); + } + catch(...) { + uasserted(13131, "replSet error parsing (or missing) 'members' field in config object"); + } + + unsigned localhosts = 0; + for( unsigned i = 0; i < members.size(); i++ ) { + BSONObj mobj = members[i].Obj(); + MemberCfg m; + try { + static const string legal[] = { + "_id","votes","priority","host", "hidden","slaveDelay", + "arbiterOnly","buildIndexes","tags","initialSync" // deprecated + }; + static const set<string> legals(legal, legal + 10); + assertOnlyHas(mobj, legals); + + try { + m._id = (int) mobj["_id"].Number(); + } + catch(...) { + /* TODO: use of string exceptions may be problematic for reconfig case! */ + throw "_id must be numeric"; + } + try { + string s = mobj["host"].String(); + boost::trim(s); + m.h = HostAndPort(s); + if ( !m.h.hasPort() ) { + // make port explicit even if default + m.h.setPort(m.h.port()); + } + } + catch(...) { + throw string("bad or missing host field? 
") + mobj.toString(); + } + if( m.h.isLocalHost() ) + localhosts++; + m.arbiterOnly = mobj["arbiterOnly"].trueValue(); + m.slaveDelay = mobj["slaveDelay"].numberInt(); + if( mobj.hasElement("hidden") ) + m.hidden = mobj["hidden"].trueValue(); + if( mobj.hasElement("buildIndexes") ) + m.buildIndexes = mobj["buildIndexes"].trueValue(); + if( mobj.hasElement("priority") ) + m.priority = mobj["priority"].Number(); + if( mobj.hasElement("votes") ) + m.votes = (unsigned) mobj["votes"].Number(); + if( mobj.hasElement("tags") ) { + const BSONObj &t = mobj["tags"].Obj(); + for (BSONObj::iterator c = t.begin(); c.more(); c.next()) { + m.tags[(*c).fieldName()] = (*c).String(); + } + uassert(14827, "arbiters cannot have tags", !m.arbiterOnly || m.tags.empty() ); + } + m.check(); + } + catch( const char * p ) { + log() << "replSet cfg parsing exception for members[" << i << "] " << p << rsLog; + stringstream ss; + ss << "replSet members[" << i << "] " << p; + uassert(13107, ss.str(), false); + } + catch(DBException& e) { + log() << "replSet cfg parsing exception for members[" << i << "] " << e.what() << rsLog; + stringstream ss; + ss << "bad config for member[" << i << "] " << e.what(); + uassert(13135, ss.str(), false); + } + if( !(ords.count(m._id) == 0 && hosts.count(m.h.toString()) == 0) ) { + log() << "replSet " << o.toString() << rsLog; + uassert(13108, "bad replset config -- duplicate hosts in the config object?", false); + } + hosts.insert(m.h.dynString()); + ords.insert(m._id); + this->members.push_back(m); + } + uassert(13393, "can't use localhost in repl set member names except when using it for all members", localhosts == 0 || localhosts == members.size()); + uassert(13117, "bad " + rsConfigNs + " config", !_id.empty()); + + if( o["settings"].ok() ) { + BSONObj settings = o["settings"].Obj(); + if( settings["getLastErrorModes"].ok() ) { + parseRules(settings["getLastErrorModes"].Obj()); + } + ho.check(); + try { getLastErrorDefaults = settings["getLastErrorDefaults"].Obj().copy(); } + catch(...) { } + } + + // figure out the majority for this config + setMajority(); + } + + static inline void configAssert(bool expr) { + uassert(13122, "bad repl set config?", expr); + } + + ReplSetConfig::ReplSetConfig(BSONObj cfg, bool force) { + _constructed = false; + clear(); + from(cfg); + if( force ) { + version += rand() % 100000 + 10000; + } + configAssert( version < 0 /*unspecified*/ || (version >= 1) ); + if( version < 1 ) + version = 1; + _ok = true; + _constructed = true; + } + + ReplSetConfig::ReplSetConfig(const HostAndPort& h) { + LOG(2) << "ReplSetConfig load " << h.toStringLong() << rsLog; + + _constructed = false; + clear(); + int level = 2; + DEV level = 0; + + BSONObj cfg; + int v = -5; + try { + if( h.isSelf() ) { + ; + } + else { + /* first, make sure other node is configured to be a replset. just to be safe. 
*/
+                string setname = cmdLine.ourSetName();
+                BSONObj cmd = BSON( "replSetHeartbeat" << setname );
+                int theirVersion;
+                BSONObj info;
+                log() << "trying to contact " << h.toString() << rsLog;
+                bool ok = requestHeartbeat(setname, "", h.toString(), info, -2, theirVersion);
+                if( info["rs"].trueValue() ) {
+                    // yes, it is a replica set, although perhaps not yet initialized
+                }
+                else {
+                    if( !ok ) {
+                        log() << "replSet TEMP !ok heartbeating " << h.toString() << " on cfg load" << rsLog;
+                        if( !info.isEmpty() )
+                            log() << "replSet info " << h.toString() << " : " << info.toString() << rsLog;
+                        return;
+                    }
+                    {
+                        stringstream ss;
+                        ss << "replSet error: member " << h.toString() << " is not in --replSet mode";
+                        msgassertedNoTrace(13260, ss.str().c_str()); // not caught as not a user exception - we want it not caught
+                        //for python err# checker: uassert(13260, "", false);
+                    }
+                }
+            }
+
+            v = -4;
+            unsigned long long count = 0;
+            try {
+                ScopedConn conn(h.toString());
+                v = -3;
+                cfg = conn.findOne(rsConfigNs, Query()).getOwned();
+                count = conn.count(rsConfigNs);
+            }
+            catch ( DBException& ) {
+                if ( !h.isSelf() ) {
+                    throw;
+                }
+
+                // on startup, socket is not listening yet
+                DBDirectClient cli;
+                cfg = cli.findOne( rsConfigNs, Query() ).getOwned();
+                count = cli.count(rsConfigNs);
+            }
+
+            if( count > 1 )
+                uasserted(13109, str::stream() << "multiple rows in " << rsConfigNs << " not supported host: " << h.toString());
+
+            if( cfg.isEmpty() ) {
+                version = EMPTYCONFIG;
+                return;
+            }
+            version = -1;
+        }
+        catch( DBException& e) {
+            version = v;
+            log(level) << "replSet load config couldn't get from " << h.toString() << ' ' << e.what() << rsLog;
+            return;
+        }
+
+        from(cfg);
+        checkRsConfig();
+        _ok = true;
+        log(level) << "replSet load config ok from " << (h.isSelf() ? "self" : h.toString()) << rsLog;
+        _constructed = true;
+    }
+
+}
diff --git a/src/mongo/db/repl/rs_config.h b/src/mongo/db/repl/rs_config.h
new file mode 100644
index 00000000000..cfe2e86a568
--- /dev/null
+++ b/src/mongo/db/repl/rs_config.h
@@ -0,0 +1,251 @@
+// rs_config.h
+// repl set configuration
+//
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "../../util/net/hostandport.h"
+#include "../../util/concurrency/race.h"
+#include "health.h"
+
+namespace mongo {
+    class Member;
+    const string rsConfigNs = "local.system.replset";
+
+    class ReplSetConfig {
+        enum { EMPTYCONFIG = -2 };
+        struct TagSubgroup;
+    public:
+        /**
+         * This contacts the given host and tries to get a config from them.
+         *
+         * This sends a test heartbeat to the host and, if all goes well and the
+         * host has a more recent config, fetches the config and loads it (see
+         * from()).
+         *
+         * If it's contacting itself, it skips the heartbeat (for obvious
+         * reasons.) If something is misconfigured, throws an exception. If the
+         * host couldn't be queried or is just blank, ok() will be false.
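+         *
+         * A usage sketch (hypothetical host):
+         *   ReplSetConfig cfg(HostAndPort("h1:27017"));
+         *   // cfg.ok():    a usable config was fetched and parsed
+         *   // cfg.empty(): the host was reachable but had no config document yet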
+         */
+        ReplSetConfig(const HostAndPort& h);
+
+        ReplSetConfig(BSONObj cfg, bool force=false);
+
+        bool ok() const { return _ok; }
+
+        struct TagRule;
+
+        struct MemberCfg {
+            MemberCfg() : _id(-1), votes(1), priority(1.0), arbiterOnly(false), slaveDelay(0), hidden(false), buildIndexes(true) { }
+            int _id; /* ordinal */
+            unsigned votes; /* how many votes this node gets. default 1. */
+            HostAndPort h;
+            double priority; /* 0 means can never be primary */
+            bool arbiterOnly;
+            int slaveDelay; /* seconds. int rather than unsigned for convenient to/from bson conversion. */
+            bool hidden; /* if set, don't advertise to drivers in isMaster. for non-primaries (priority 0) */
+            bool buildIndexes; /* if false, do not create any non-_id indexes */
+            map<string,string> tags; /* tagging for data center, rack, etc. */
+        private:
+            set<TagSubgroup*> _groups; // the subgroups this member belongs to
+        public:
+            const set<TagSubgroup*>& groups() const {
+                return _groups;
+            }
+            set<TagSubgroup*>& groupsw() {
+                return _groups;
+            }
+            void check() const; /* check validity, assert if not. */
+            BSONObj asBson() const;
+            bool potentiallyHot() const { return !arbiterOnly && priority > 0; }
+            void updateGroups(const OpTime& last) {
+                RACECHECK
+                for (set<TagSubgroup*>::const_iterator it = groups().begin(); it != groups().end(); it++) {
+                    ((TagSubgroup*)(*it))->updateLast(last);
+                }
+            }
+            bool operator==(const MemberCfg& r) const {
+                if (!tags.empty() || !r.tags.empty()) {
+                    if (tags.size() != r.tags.size()) {
+                        return false;
+                    }
+
+                    // if they are the same size and not equal, at least one
+                    // element in A must be different in B
+                    for (map<string,string>::const_iterator lit = tags.begin(); lit != tags.end(); lit++) {
+                        map<string,string>::const_iterator rit = r.tags.find((*lit).first);
+
+                        if (rit == r.tags.end() || (*lit).second != (*rit).second) {
+                            return false;
+                        }
+                    }
+                }
+
+                return _id==r._id && votes == r.votes && h == r.h && priority == r.priority &&
+                    arbiterOnly == r.arbiterOnly && slaveDelay == r.slaveDelay && hidden == r.hidden &&
+                    buildIndexes == r.buildIndexes;
+            }
+            bool operator!=(const MemberCfg& r) const { return !(*this == r); }
+        };
+
+        vector<MemberCfg> members;
+        string _id;
+        int version;
+        HealthOptions ho;
+        string md5;
+        BSONObj getLastErrorDefaults;
+        map<string,TagRule*> rules;
+
+        list<HostAndPort> otherMemberHostnames() const; // except self
+
+        /** @return true if could connect, and there is no cfg object there at all */
+        bool empty() const { return version == EMPTYCONFIG; }
+
+        string toString() const { return asBson().toString(); }
+
+        /** validate the settings. does not call check() on each member, you have to do that separately. */
+        void checkRsConfig() const;
+
+        /** check if modification makes sense */
+        static bool legalChange(const ReplSetConfig& old, const ReplSetConfig& n, string& errmsg);
+
+        //static void receivedNewConfig(BSONObj);
+        void saveConfigLocally(BSONObj comment); // to local db
+        string saveConfigEverywhere(); // returns textual info on what happened
+
+        /**
+         * Update members' groups when the config changes but members stay the same.
+         */
+        void updateMembers(List1<Member> &dest);
+
+        BSONObj asBson() const;
+
+        /**
+         * Getter and setter for _majority. This is almost always
+         * members.size()/2+1, but can be the number of non-arbiter members if
+         * there are more arbiters than non-arbiters (writing to 3 out of 7
+         * servers is safe if 4 of the servers are arbiters).
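+         *
+         * Worked example: 7 members, 4 of them arbiters. The strict majority is
+         * 7/2+1 = 4, but only 3 members are non-arbiters, so _majority = min(4, 3) = 3.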
+         */
+        void setMajority();
+        int getMajority() const;
+
+        bool _constructed;
+    private:
+        bool _ok;
+        int _majority;
+
+        void from(BSONObj);
+        void clear();
+
+        struct TagClause;
+
+        /**
+         * This is a logical grouping of servers. It is pointed to by a set of
+         * servers with a certain tag.
+         *
+         * For example, suppose servers A, B, and C have the tag "dc" : "nyc". If we
+         * have a rule {"dc" : 2}, then we want A _or_ B _or_ C to have the
+         * write for one of the "dc" criteria to be fulfilled, so all three will
+         * point to this subgroup. When one of their oplog-tailing cursors is
+         * updated, this subgroup is updated.
+         */
+        struct TagSubgroup : boost::noncopyable {
+            ~TagSubgroup(); // never called; not defined
+            TagSubgroup(string nm) : name(nm) { }
+            const string name;
+            OpTime last;
+            vector<TagClause*> clauses;
+
+            // this probably won't actually point to valid members after the
+            // subgroup is created, as initFromConfig() makes a copy of the
+            // config
+            set<MemberCfg*> m;
+
+            void updateLast(const OpTime& op);
+
+            //string toString() const;
+
+            /**
+             * If two tags have the same name, they should compare as equal so
+             * that members don't have to update two identical groups on writes.
+             */
+            bool operator() (TagSubgroup& lhs, TagSubgroup& rhs) const {
+                return lhs.name < rhs.name;
+            }
+        };
+
+        /**
+         * An argument in a rule. For example, if we had the rule {dc : 2,
+         * machines : 3}, "dc" : 2 and "machines" : 3 would be two TagClauses.
+         *
+         * Each tag clause has a set of associated subgroups. For example, if
+         * we had "dc" : 2, our subgroups might be "nyc", "sf", and "hk".
+         */
+        struct TagClause {
+            OpTime last;
+            map<string,TagSubgroup*> subgroups;
+            TagRule *rule;
+            string name;
+            /**
+             * If we get a clause like {machines : 3} and this server is
+             * tagged with "machines", then it's really {machines : 2}, as we
+             * will always be up-to-date. So, target would be 3 and
+             * actualTarget would be 2, in that example.
+             */
+            int target;
+            int actualTarget;
+
+            void updateLast(const OpTime& op);
+            string toString() const;
+        };
+
+        /**
+         * Parses getLastErrorModes.
+         */
+        void parseRules(const BSONObj& modes);
+
+        /**
+         * Create a hash containing every possible clause that could be used in a
+         * rule and the servers related to that clause.
+ * + * For example, suppose we have the following servers: + * A {"dc" : "ny", "ny" : "rk1"} + * B {"dc" : "ny", "ny" : "rk1"} + * C {"dc" : "ny", "ny" : "rk2"} + * D {"dc" : "sf", "sf" : "rk1"} + * E {"dc" : "sf", "sf" : "rk2"} + * + * This would give us the possible criteria: + * "dc" -> {A, B, C},{D, E} + * "ny" -> {A, B},{C} + * "sf" -> {D},{E} + */ + void _populateTagMap(map<string,TagClause> &tagMap); + + public: + struct TagRule { + vector<TagClause*> clauses; + OpTime last; + + void updateLast(const OpTime& op); + string toString() const; + }; + }; + +} diff --git a/src/mongo/db/repl/rs_exception.h b/src/mongo/db/repl/rs_exception.h new file mode 100644 index 00000000000..fc372fc241c --- /dev/null +++ b/src/mongo/db/repl/rs_exception.h @@ -0,0 +1,17 @@ +// @file rs_exception.h + +#pragma once + +namespace mongo { + + class VoteException : public std::exception { + public: + const char * what() const throw () { return "VoteException"; } + }; + + class RetryAfterSleepException : public std::exception { + public: + const char * what() const throw () { return "RetryAfterSleepException"; } + }; + +} diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp new file mode 100644 index 00000000000..b67c0d71b83 --- /dev/null +++ b/src/mongo/db/repl/rs_initialsync.cpp @@ -0,0 +1,271 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../repl.h" +#include "../client.h" +#include "../../client/dbclient.h" +#include "rs.h" +#include "../oplogreader.h" +#include "../../util/mongoutils/str.h" +#include "../dbhelpers.h" +#include "rs_optime.h" +#include "../oplog.h" + +namespace mongo { + + using namespace mongoutils; + using namespace bson; + + void dropAllDatabasesExceptLocal(); + + // add try/catch with sleep + + void isyncassert(const string& msg, bool expr) { + if( !expr ) { + string m = str::stream() << "initial sync " << msg; + theReplSet->sethbmsg(m, 0); + uasserted(13404, m); + } + } + + void ReplSetImpl::syncDoInitialSync() { + createOplog(); + + while( 1 ) { + try { + _syncDoInitialSync(); + break; + } + catch(DBException& e) { + sethbmsg("initial sync exception " + e.toString(), 0); + sleepsecs(30); + } + } + } + + /* todo : progress metering to sethbmsg. */ + static bool clone(const char *master, string db) { + string err; + return cloneFrom(master, err, db, false, + /* slave_ok */ true, true, false, /*mayYield*/true, /*mayBeInterrupted*/false); + } + + void _logOpObjRS(const BSONObj& op); + + static void emptyOplog() { + writelock lk(rsoplog); + Client::Context ctx(rsoplog); + NamespaceDetails *d = nsdetails(rsoplog); + + // temp + if( d && d->stats.nrecords == 0 ) + return; // already empty, ok. 
+ + LOG(1) << "replSet empty oplog" << rsLog; + d->emptyCappedCollection(rsoplog); + } + + Member* ReplSetImpl::getMemberToSyncTo() { + Member *closest = 0; + time_t now = 0; + bool buildIndexes = true; + + // wait for 2N pings before choosing a sync target + if (_cfg) { + int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings; + + if (needMorePings > 0) { + OCCASIONALLY log() << "waiting for " << needMorePings << " pings from other members before syncing" << endl; + return NULL; + } + + buildIndexes = myConfig().buildIndexes; + } + + // find the member with the lowest ping time that has more data than me + for (Member *m = _members.head(); m; m = m->next()) { + if (m->hbinfo().up() && + // make sure members with buildIndexes sync from other members w/indexes + (!buildIndexes || (buildIndexes && m->config().buildIndexes)) && + (m->state() == MemberState::RS_PRIMARY || + (m->state() == MemberState::RS_SECONDARY && m->hbinfo().opTime > lastOpTimeWritten)) && + (!closest || m->hbinfo().ping < closest->hbinfo().ping)) { + + map<string,time_t>::iterator vetoed = _veto.find(m->fullName()); + if (vetoed == _veto.end()) { + closest = m; + break; + } + + if (now == 0) { + now = time(0); + } + + // if this was on the veto list, check if it was vetoed in the last "while" + if ((*vetoed).second < now) { + _veto.erase(vetoed); + closest = m; + break; + } + + // if it was recently vetoed, skip + log() << "replSet not trying to sync from " << (*vetoed).first + << ", it is vetoed for " << ((*vetoed).second - now) << " more seconds" << rsLog; + } + } + + { + lock lk(this); + + if (!closest) { + _currentSyncTarget = NULL; + return NULL; + } + + _currentSyncTarget = closest; + } + + sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0); + + return closest; + } + + void ReplSetImpl::veto(const string& host, const unsigned secs) { + _veto[host] = time(0)+secs; + } + + /** + * Do the initial sync for this member. 
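+     *
+     * In outline (matching the implementation below): choose a sync source,
+     * drop and re-clone all non-local databases (unless fastsync), note the
+     * source's latest op as minValid, apply the source's oplog from our
+     * starting timestamp through minValid, then persist minValid to
+     * local.replset.minvalid.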
+ */ + void ReplSetImpl::_syncDoInitialSync() { + sethbmsg("initial sync pending",0); + + // if this is the first node, it may have already become primary + if ( box.getState().primary() ) { + sethbmsg("I'm already primary, no need for initial sync",0); + return; + } + + const Member *source = getMemberToSyncTo(); + if (!source) { + sethbmsg("initial sync need a member to be primary or secondary to do our initial sync", 0); + sleepsecs(15); + return; + } + + string sourceHostname = source->h().toString(); + OplogReader r; + if( !r.connect(sourceHostname) ) { + sethbmsg( str::stream() << "initial sync couldn't connect to " << source->h().toString() , 0); + sleepsecs(15); + return; + } + + BSONObj lastOp = r.getLastOp(rsoplog); + if( lastOp.isEmpty() ) { + sethbmsg("initial sync couldn't read remote oplog", 0); + sleepsecs(15); + return; + } + OpTime startingTS = lastOp["ts"]._opTime(); + + if (replSettings.fastsync) { + log() << "fastsync: skipping database clone" << rsLog; + } + else { + sethbmsg("initial sync drop all databases", 0); + dropAllDatabasesExceptLocal(); + + sethbmsg("initial sync clone all databases", 0); + + list<string> dbs = r.conn()->getDatabaseNames(); + for( list<string>::iterator i = dbs.begin(); i != dbs.end(); i++ ) { + string db = *i; + if( db != "local" ) { + sethbmsg( str::stream() << "initial sync cloning db: " << db , 0); + bool ok; + { + writelock lk(db); + Client::Context ctx(db); + ok = clone(sourceHostname.c_str(), db); + } + if( !ok ) { + sethbmsg( str::stream() << "initial sync error clone of " << db << " failed sleeping 5 minutes" ,0); + veto(source->fullName(), 600); + sleepsecs(300); + return; + } + } + } + } + + sethbmsg("initial sync query minValid",0); + + /* our cloned copy will be strange until we apply oplog events that occurred + through the process. we note that time point here. */ + BSONObj minValid = r.getLastOp(rsoplog); + isyncassert( "getLastOp is empty ", !minValid.isEmpty() ); + OpTime mvoptime = minValid["ts"]._opTime(); + assert( !mvoptime.isNull() ); + assert( mvoptime >= startingTS ); + + // apply startingTS..mvoptime portion of the oplog + { + // note we assume here that this call does not throw + if( ! initialSyncOplogApplication(startingTS, mvoptime) ) { + log() << "replSet initial sync failed during oplog application phase" << rsLog; + + emptyOplog(); // otherwise we'll be up! + + lastOpTimeWritten = OpTime(); + lastH = 0; + + log() << "replSet cleaning up [1]" << rsLog; + { + writelock lk("local."); + Client::Context cx( "local." ); + cx.db()->flushFiles(true); + } + log() << "replSet cleaning up [2]" << rsLog; + + log() << "replSet initial sync failed will try again" << endl; + + sleepsecs(5); + return; + } + } + + sethbmsg("initial sync finishing up",0); + + assert( !box.getState().primary() ); // wouldn't make sense if we were. + + { + writelock lk("local."); + Client::Context cx( "local." ); + cx.db()->flushFiles(true); + try { + log() << "replSet set minValid=" << minValid["ts"]._opTime().toString() << rsLog; + } + catch(...) { } + Helpers::putSingleton("local.replset.minvalid", minValid); + cx.db()->flushFiles(true); + } + + sethbmsg("initial sync done",0); + } + +} diff --git a/src/mongo/db/repl/rs_initiate.cpp b/src/mongo/db/repl/rs_initiate.cpp new file mode 100644 index 00000000000..77bc6c03938 --- /dev/null +++ b/src/mongo/db/repl/rs_initiate.cpp @@ -0,0 +1,269 @@ +/* @file rs_initiate.cpp + */ + +/** +* Copyright (C) 2008 10gen Inc. 
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "pch.h"
+#include "../cmdline.h"
+#include "../commands.h"
+#include "../../util/mmap.h"
+#include "../../util/mongoutils/str.h"
+#include "health.h"
+#include "rs.h"
+#include "rs_config.h"
+#include "../dbhelpers.h"
+#include "../oplog.h"
+
+using namespace bson;
+using namespace mongoutils;
+
+namespace mongo {
+
+    /* called on a reconfig AND on initiate
+       throws
+       @param initial true when initiating
+    */
+    void checkMembersUpForConfigChange(const ReplSetConfig& cfg, BSONObjBuilder& result, bool initial) {
+        int failures = 0, allVotes = 0, allowableFailures = 0;
+        int me = 0;
+        stringstream selfs;
+        for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+            if( i->h.isSelf() ) {
+                me++;
+                if( me > 1 )
+                    selfs << ',';
+                selfs << i->h.toString();
+                if( !i->potentiallyHot() ) {
+                    uasserted(13420, "initiation and reconfiguration of a replica set must be sent to a node that can become primary");
+                }
+            }
+            allVotes += i->votes;
+        }
+        allowableFailures = allVotes - (allVotes/2 + 1);
+
+        uassert(13278, "bad config: isSelf is true for multiple hosts: " + selfs.str(), me <= 1); // dups?
+        if( me != 1 ) {
+            stringstream ss;
+            ss << "can't find self in the replset config";
+            if( !cmdLine.isDefaultPort() ) ss << " my port: " << cmdLine.port;
+            if( me != 0 ) ss << " found: " << me;
+            uasserted(13279, ss.str());
+        }
+
+        vector<string> down;
+        for( vector<ReplSetConfig::MemberCfg>::const_iterator i = cfg.members.begin(); i != cfg.members.end(); i++ ) {
+            // we know we're up
+            if (i->h.isSelf()) {
+                continue;
+            }
+
+            BSONObj res;
+            {
+                bool ok = false;
+                try {
+                    int theirVersion = -1000;
+                    ok = requestHeartbeat(cfg._id, "", i->h.toString(), res, -1, theirVersion, initial/*check if empty*/);
+                    if( theirVersion >= cfg.version ) {
+                        stringstream ss;
+                        ss << "replSet member " << i->h.toString() << " has too new a config version (" << theirVersion << ") to reconfigure";
+                        uasserted(13259, ss.str());
+                    }
+                }
+                catch(DBException& e) {
+                    log() << "replSet cmufcc requestHeartbeat " << i->h.toString() << " : " << e.toString() << rsLog;
+                }
+                catch(...) {
+                    log() << "replSet cmufcc error exception in requestHeartbeat?" << rsLog;
+                }
+                if( res.getBoolField("mismatch") )
+                    uasserted(13145, "set name does not match the set name host " + i->h.toString() + " expects");
+                if( *res.getStringField("set") ) {
+                    if( cfg.version <= 1 ) {
+                        // this was to be initiation, no one should be initiated already.
+                        uasserted(13256, "member " + i->h.toString() + " is already initiated");
+                    }
+                    else {
+                        // Assure no one has a newer config.
+                        if( res["v"].Int() >= cfg.version ) {
+                            uasserted(13341, "member " + i->h.toString() + " has a config version >= to the new cfg version; cannot change config");
+                        }
+                    }
+                }
+                if( !ok && !res["rs"].trueValue() ) {
+                    down.push_back(i->h.toString());
+
+                    if( !res.isEmpty() ) {
+                        /* strange. got a response, but not "ok". log it.
*/ + log() << "replSet warning " << i->h.toString() << " replied: " << res.toString() << rsLog; + } + + bool allowFailure = false; + failures += i->votes; + if( !initial && failures <= allowableFailures ) { + const Member* m = theReplSet->findById( i->_id ); + if( m ) { + assert( m->h().toString() == i->h.toString() ); + } + // it's okay if the down member isn't part of the config, + // we might be adding a new member that isn't up yet + allowFailure = true; + } + + if( !allowFailure ) { + string msg = string("need all members up to initiate, not ok : ") + i->h.toStringLong(); + if( !initial ) + msg = string("need most members up to reconfigure, not ok : ") + i->h.toString(); + uasserted(13144, msg); + } + } + } + if( initial ) { + bool hasData = res["hasData"].Bool(); + uassert(13311, "member " + i->h.toString() + " has data already, cannot initiate set. All members except initiator must be empty.", + !hasData || i->h.isSelf()); + } + } + if (down.size() > 0) { + result.append("down", down); + } + } + + class CmdReplSetInitiate : public ReplSetCommand { + public: + virtual LockType locktype() const { return NONE; } + CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { } + virtual void help(stringstream& h) const { + h << "Initiate/christen a replica set."; + h << "\nhttp://www.mongodb.org/display/DOCS/Replica+Set+Commands"; + } + virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + log() << "replSet replSetInitiate admin command received from client" << rsLog; + + if( !replSet ) { + errmsg = "server is not running with --replSet"; + return false; + } + if( theReplSet ) { + errmsg = "already initialized"; + result.append("info", "try querying " + rsConfigNs + " to see current configuration"); + return false; + } + + { + // just make sure we can get a write lock before doing anything else. we'll reacquire one + // later. of course it could be stuck then, but this check lowers the risk if weird things + // are up. + time_t t = time(0); + writelock lk(""); + if( time(0)-t > 10 ) { + errmsg = "took a long time to get write lock, so not initiating. Initiate when server less busy?"; + return false; + } + + /* check that we don't already have an oplog. that could cause issues. + it is ok if the initiating member has *other* data than that. + */ + BSONObj o; + if( Helpers::getFirst(rsoplog, o) ) { + errmsg = rsoplog + string(" is not empty on the initiating member. cannot initiate."); + return false; + } + } + + if( ReplSet::startupStatus == ReplSet::BADCONFIG ) { + errmsg = "server already in BADCONFIG state (check logs); not initiating"; + result.append("info", ReplSet::startupStatusMsg.get()); + return false; + } + if( ReplSet::startupStatus != ReplSet::EMPTYCONFIG ) { + result.append("startupStatus", ReplSet::startupStatus); + errmsg = "all members and seeds must be reachable to initiate set"; + result.append("info", cmdLine._replSet); + return false; + } + + BSONObj configObj; + + if( cmdObj["replSetInitiate"].type() != Object ) { + result.append("info2", "no configuration explicitly specified -- making one"); + log() << "replSet info initiate : no configuration specified. Using a default configuration for the set" << rsLog; + + string name; + vector<HostAndPort> seeds; + set<HostAndPort> seedSet; + parseReplsetCmdLine(cmdLine._replSet, name, seeds, seedSet); // may throw... 
+ + bob b; + b.append("_id", name); + bob members; + members.append("0", BSON( "_id" << 0 << "host" << HostAndPort::Me().dynString() )); + result.append("me", HostAndPort::Me().toString()); + for( unsigned i = 0; i < seeds.size(); i++ ) + members.append(bob::numStr(i+1), BSON( "_id" << i+1 << "host" << seeds[i].toString())); + b.appendArray("members", members.obj()); + configObj = b.obj(); + log() << "replSet created this configuration for initiation : " << configObj.toString() << rsLog; + } + else { + configObj = cmdObj["replSetInitiate"].Obj(); + } + + bool parsed = false; + try { + ReplSetConfig newConfig(configObj); + parsed = true; + + if( newConfig.version > 1 ) { + errmsg = "can't initiate with a version number greater than 1"; + return false; + } + + log() << "replSet replSetInitiate config object parses ok, " << newConfig.members.size() << " members specified" << rsLog; + + checkMembersUpForConfigChange(newConfig, result, true); + + log() << "replSet replSetInitiate all members seem up" << rsLog; + + createOplog(); + + writelock lk(""); + bo comment = BSON( "msg" << "initiating set"); + newConfig.saveConfigLocally(comment); + log() << "replSet replSetInitiate config now saved locally. Should come online in about a minute." << rsLog; + result.append("info", "Config now saved locally. Should come online in about a minute."); + ReplSet::startupStatus = ReplSet::SOON; + ReplSet::startupStatusMsg.set("Received replSetInitiate - should come online shortly."); + } + catch( DBException& e ) { + log() << "replSet replSetInitiate exception: " << e.what() << rsLog; + if( !parsed ) + errmsg = string("couldn't parse cfg object ") + e.what(); + else + errmsg = string("couldn't initiate : ") + e.what(); + return false; + } + catch( string& e2 ) { + log() << e2 << rsLog; + errmsg = e2; + return false; + } + + return true; + } + } cmdReplSetInitiate; + +} diff --git a/src/mongo/db/repl/rs_member.h b/src/mongo/db/repl/rs_member.h new file mode 100644 index 00000000000..24e593392b6 --- /dev/null +++ b/src/mongo/db/repl/rs_member.h @@ -0,0 +1,131 @@ +// @file rsmember.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** replica set member */ + +#pragma once + +#include "../../util/concurrency/value.h" + +namespace mongo { + + + /* + RS_STARTUP serving still starting up, or still trying to initiate the set + RS_PRIMARY this server thinks it is primary + RS_SECONDARY this server thinks it is a secondary (slave mode) + RS_RECOVERING recovering/resyncing; after recovery usually auto-transitions to secondary + RS_FATAL something bad has occurred and server is not completely offline with regard to the replica set. fatal error. 
+ RS_STARTUP2 loaded config, still determining who is primary + */ + struct MemberState { + enum MS { + RS_STARTUP = 0, + RS_PRIMARY = 1, + RS_SECONDARY = 2, + RS_RECOVERING = 3, + RS_FATAL = 4, + RS_STARTUP2 = 5, + RS_UNKNOWN = 6, /* remote node not yet reached */ + RS_ARBITER = 7, + RS_DOWN = 8, /* node not reachable for a report */ + RS_ROLLBACK = 9 + } s; + + MemberState(MS ms = RS_UNKNOWN) : s(ms) { } + explicit MemberState(int ms) : s((MS) ms) { } + + bool startup() const { return s == RS_STARTUP; } + bool primary() const { return s == RS_PRIMARY; } + bool secondary() const { return s == RS_SECONDARY; } + bool recovering() const { return s == RS_RECOVERING; } + bool startup2() const { return s == RS_STARTUP2; } + bool fatal() const { return s == RS_FATAL; } + bool rollback() const { return s == RS_ROLLBACK; } + bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; } + + string toString() const; + + bool operator==(const MemberState& r) const { return s == r.s; } + bool operator!=(const MemberState& r) const { return s != r.s; } + }; + + /* this is supposed to be just basic information on a member, + and copy constructable. */ + class HeartbeatInfo { + unsigned _id; + public: + HeartbeatInfo() : _id(0xffffffff), hbstate(MemberState::RS_UNKNOWN), health(-1.0), + downSince(0), skew(INT_MIN), authIssue(false), ping(0) { } + HeartbeatInfo(unsigned id); + unsigned id() const { return _id; } + MemberState hbstate; + double health; + time_t upSince; + long long downSince; + time_t lastHeartbeat; + DiagStr lastHeartbeatMsg; + OpTime opTime; + int skew; + bool authIssue; + unsigned int ping; // milliseconds + static unsigned int numPings; + + bool up() const { return health > 0; } + + /** health is set to -1 on startup. that means we haven't even checked yet. 0 means we checked and it failed. */ + bool maybeUp() const { return health != 0; } + + long long timeDown() const; // ms + + /* true if changed in a way of interest to the repl set manager. */ + bool changed(const HeartbeatInfo& old) const; + }; + + inline HeartbeatInfo::HeartbeatInfo(unsigned id) : + _id(id), + authIssue(false), + ping(0) { + hbstate = MemberState::RS_UNKNOWN; + health = -1.0; + downSince = 0; + lastHeartbeat = upSince = 0; + skew = INT_MIN; + } + + inline bool HeartbeatInfo::changed(const HeartbeatInfo& old) const { + return health != old.health || + hbstate != old.hbstate; + } + + inline string MemberState::toString() const { + switch ( s ) { + case RS_STARTUP: return "STARTUP"; + case RS_PRIMARY: return "PRIMARY"; + case RS_SECONDARY: return "SECONDARY"; + case RS_RECOVERING: return "RECOVERING"; + case RS_FATAL: return "FATAL"; + case RS_STARTUP2: return "STARTUP2"; + case RS_ARBITER: return "ARBITER"; + case RS_DOWN: return "DOWN"; + case RS_ROLLBACK: return "ROLLBACK"; + case RS_UNKNOWN: return "UNKNOWN"; + } + return ""; + } + +} diff --git a/src/mongo/db/repl/rs_optime.h b/src/mongo/db/repl/rs_optime.h new file mode 100644 index 00000000000..f0ca56927ad --- /dev/null +++ b/src/mongo/db/repl/rs_optime.h @@ -0,0 +1,58 @@ +// @file rs_optime.h + +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "../../util/optime.h" + +namespace mongo { + + const char rsoplog[] = "local.oplog.rs"; + + /* + class RSOpTime : public OpTime { + public: + bool initiated() const { return getSecs() != 0; } + };*/ + + /*struct RSOpTime { + unsigned long long ord; + + RSOpTime() : ord(0) { } + + bool initiated() const { return ord > 0; } + + void initiate() { + assert( !initiated() ); + ord = 1000000; + } + + ReplTime inc() { + DEV assertInWriteLock(); + return ++ord; + } + + string toString() const { return str::stream() << ord; } + + // query the oplog and set the highest value herein. acquires a db read lock. throws. + void load(); + }; + + extern RSOpTime rsOpTime;*/ + +} diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp new file mode 100644 index 00000000000..10727c59669 --- /dev/null +++ b/src/mongo/db/repl/rs_rollback.cpp @@ -0,0 +1,667 @@ +/* @file rs_rollback.cpp +* +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../client.h" +#include "../../client/dbclient.h" +#include "rs.h" +#include "../repl.h" +#include "../ops/query.h" +#include "../cloner.h" +#include "../ops/update.h" +#include "../ops/delete.h" + +/* Scenarios + + We went offline with ops not replicated out. + + F = node that failed and coming back. + P = node that took over, new primary + + #1: + F : a b c d e f g + P : a b c d q + + The design is "keep P". One could argue here that "keep F" has some merits, however, in most cases P + will have significantly more data. Also note that P may have a proper subset of F's stream if there were + no subsequent writes. + + For now the model is simply : get F back in sync with P. If P was really behind or something, we should have + just chosen not to fail over anyway. + + #2: + F : a b c d e f g -> a b c d + P : a b c d + + #3: + F : a b c d e f g -> a b c d q r s t u v w x z + P : a b c d.q r s t u v w x z + + Steps + find an event in common. 'd'. + undo our events beyond that by: + (1) taking copy from other server of those objects + (2) do not consider copy valid until we pass reach an optime after when we fetched the new version of object + -- i.e., reset minvalid. + (3) we could skip operations on objects that are previous in time to our capture of the object as an optimization. 
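+
+   Worked example (assumed history, mirroring #1 above):
+     F : a b c d e f g
+     P : a b c d q
+   Scanning both oplogs newest-to-oldest, the first entry whose (ts, h) pair
+   matches on both sides is 'd', the common point. e, f and g are undone by
+   refetching the affected documents from P; F's oplog is capped-truncated
+   after 'd', and minvalid is raised so F is not readable until it has applied
+   past the point where the refetched copies were taken.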
+ +*/ + +namespace mongo { + + using namespace bson; + + void incRBID(); + + class rsfatal : public std::exception { + public: + virtual const char* what() const throw() { return "replica set fatal exception"; } + }; + + struct DocID { + const char *ns; + be _id; + bool operator<(const DocID& d) const { + int c = strcmp(ns, d.ns); + if( c < 0 ) return true; + if( c > 0 ) return false; + return _id < d._id; + } + }; + + struct HowToFixUp { + /* note this is a set -- if there are many $inc's on a single document we need to rollback, we only + need to refetch it once. */ + set<DocID> toRefetch; + + /* collections to drop */ + set<string> toDrop; + + set<string> collectionsToResync; + + OpTime commonPoint; + DiskLoc commonPointOurDiskloc; + + int rbid; // remote server's current rollback sequence # + }; + + static void refetch(HowToFixUp& h, const BSONObj& ourObj) { + const char *op = ourObj.getStringField("op"); + if( *op == 'n' ) + return; + + unsigned long long totSize = 0; + totSize += ourObj.objsize(); + if( totSize > 512 * 1024 * 1024 ) + throw "rollback too large"; + + DocID d; + // NOTE The assigned ns value may become invalid if we yield. + d.ns = ourObj.getStringField("ns"); + if( *d.ns == 0 ) { + log() << "replSet WARNING ignoring op on rollback no ns TODO : " << ourObj.toString() << rsLog; + return; + } + + bo o = ourObj.getObjectField(*op=='u' ? "o2" : "o"); + if( o.isEmpty() ) { + log() << "replSet warning ignoring op on rollback : " << ourObj.toString() << rsLog; + return; + } + + if( *op == 'c' ) { + be first = o.firstElement(); + NamespaceString s(d.ns); // foo.$cmd + string cmdname = first.fieldName(); + Command *cmd = Command::findCommand(cmdname.c_str()); + if( cmd == 0 ) { + log() << "replSet warning rollback no suchcommand " << first.fieldName() << " - different mongod versions perhaps?" << rsLog; + return; + } + else { + /* findandmodify - tranlated? + godinsert?, + renamecollection a->b. just resync a & b + */ + if( cmdname == "create" ) { + /* Create collection operation + { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } } + */ + string ns = s.db + '.' + o["create"].String(); // -> foo.abc + h.toDrop.insert(ns); + return; + } + else if( cmdname == "drop" ) { + string ns = s.db + '.' + first.valuestr(); + h.collectionsToResync.insert(ns); + return; + } + else if( cmdname == "dropIndexes" || cmdname == "deleteIndexes" ) { + /* TODO: this is bad. we simply full resync the collection here, which could be very slow. */ + log() << "replSet info rollback of dropIndexes is slow in this version of mongod" << rsLog; + string ns = s.db + '.' + first.valuestr(); + h.collectionsToResync.insert(ns); + return; + } + else if( cmdname == "renameCollection" ) { + /* TODO: slow. 
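+                       (both the source and the target of the rename are fully
+                       resynced below; a targeted undo of the rename would be
+                       cheaper but needs more bookkeeping)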
*/ + log() << "replSet info rollback of renameCollection is slow in this version of mongod" << rsLog; + string from = first.valuestr(); + string to = o["to"].String(); + h.collectionsToResync.insert(from); + h.collectionsToResync.insert(to); + return; + } + else if( cmdname == "reIndex" ) { + return; + } + else if( cmdname == "dropDatabase" ) { + log() << "replSet error rollback : can't rollback drop database full resync will be required" << rsLog; + log() << "replSet " << o.toString() << rsLog; + throw rsfatal(); + } + else { + log() << "replSet error can't rollback this command yet: " << o.toString() << rsLog; + log() << "replSet cmdname=" << cmdname << rsLog; + throw rsfatal(); + } + } + } + + d._id = o["_id"]; + if( d._id.eoo() ) { + log() << "replSet WARNING ignoring op on rollback no _id TODO : " << d.ns << ' '<< ourObj.toString() << rsLog; + return; + } + + h.toRefetch.insert(d); + } + + int getRBID(DBClientConnection*); + + static void syncRollbackFindCommonPoint(DBClientConnection *them, HowToFixUp& h) { + static time_t last; + if( time(0)-last < 60 ) { + throw "findcommonpoint waiting a while before trying again"; + } + last = time(0); + + assert( d.dbMutex.atLeastReadLocked() ); + Client::Context c(rsoplog); + NamespaceDetails *nsd = nsdetails(rsoplog); + assert(nsd); + ReverseCappedCursor u(nsd); + if( !u.ok() ) + throw "our oplog empty or unreadable"; + + const Query q = Query().sort(reverseNaturalObj); + const bo fields = BSON( "ts" << 1 << "h" << 1 ); + + //auto_ptr<DBClientCursor> u = us->query(rsoplog, q, 0, 0, &fields, 0, 0); + + h.rbid = getRBID(them); + auto_ptr<DBClientCursor> t = them->query(rsoplog, q, 0, 0, &fields, 0, 0); + + if( t.get() == 0 || !t->more() ) throw "remote oplog empty or unreadable"; + + BSONObj ourObj = u.current(); + OpTime ourTime = ourObj["ts"]._opTime(); + BSONObj theirObj = t->nextSafe(); + OpTime theirTime = theirObj["ts"]._opTime(); + + { + long long diff = (long long) ourTime.getSecs() - ((long long) theirTime.getSecs()); + /* diff could be positive, negative, or zero */ + log() << "replSet info rollback our last optime: " << ourTime.toStringPretty() << rsLog; + log() << "replSet info rollback their last optime: " << theirTime.toStringPretty() << rsLog; + log() << "replSet info rollback diff in end of log times: " << diff << " seconds" << rsLog; + if( diff > 1800 ) { + log() << "replSet rollback too long a time period for a rollback." << rsLog; + throw "error not willing to roll back more than 30 minutes of data"; + } + } + + unsigned long long scanned = 0; + while( 1 ) { + scanned++; + /* todo add code to assure no excessive scanning for too long */ + if( ourTime == theirTime ) { + if( ourObj["h"].Long() == theirObj["h"].Long() ) { + // found the point back in time where we match. + // todo : check a few more just to be careful about hash collisions. 
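+                    // (an OpTime is only seconds plus an increment, so two divergent
+                    //  histories can carry ops with the same ts; the 64-bit h value
+                    //  disambiguates, and a single (ts, h) match is treated as
+                    //  sufficient here, hence the todo above)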
+ log() << "replSet rollback found matching events at " << ourTime.toStringPretty() << rsLog; + log() << "replSet rollback findcommonpoint scanned : " << scanned << rsLog; + h.commonPoint = ourTime; + h.commonPointOurDiskloc = u.currLoc(); + return; + } + + refetch(h, ourObj); + + if( !t->more() ) { + log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; + log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; + log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; + log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; + throw "RS100 reached beginning of remote oplog [2]"; + } + theirObj = t->nextSafe(); + theirTime = theirObj["ts"]._opTime(); + + u.advance(); + if( !u.ok() ) { + log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; + log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; + log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; + log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; + throw "RS101 reached beginning of local oplog [1]"; + } + ourObj = u.current(); + ourTime = ourObj["ts"]._opTime(); + } + else if( theirTime > ourTime ) { + if( !t->more() ) { + log() << "replSet rollback error RS100 reached beginning of remote oplog" << rsLog; + log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; + log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; + log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; + throw "RS100 reached beginning of remote oplog [1]"; + } + theirObj = t->nextSafe(); + theirTime = theirObj["ts"]._opTime(); + } + else { + // theirTime < ourTime + refetch(h, ourObj); + u.advance(); + if( !u.ok() ) { + log() << "replSet rollback error RS101 reached beginning of local oplog" << rsLog; + log() << "replSet them: " << them->toString() << " scanned: " << scanned << rsLog; + log() << "replSet theirTime: " << theirTime.toStringLong() << rsLog; + log() << "replSet ourTime: " << ourTime.toStringLong() << rsLog; + throw "RS101 reached beginning of local oplog [2]"; + } + ourObj = u.current(); + ourTime = ourObj["ts"]._opTime(); + } + } + } + + struct X { + const bson::bo *op; + bson::bo goodVersionOfObject; + }; + + static void setMinValid(bo newMinValid) { + try { + log() << "replSet minvalid=" << newMinValid["ts"]._opTime().toStringLong() << rsLog; + } + catch(...) { } + { + Helpers::putSingleton("local.replset.minvalid", newMinValid); + Client::Context cx( "local." ); + cx.db()->flushFiles(true); + } + } + + void ReplSetImpl::syncFixUp(HowToFixUp& h, OplogReader& r) { + DBClientConnection *them = r.conn(); + + // fetch all first so we needn't handle interruption in a fancy way + + unsigned long long totSize = 0; + + list< pair<DocID,bo> > goodVersions; + + bo newMinValid; + + /* fetch all the goodVersions of each document from current primary */ + DocID d; + unsigned long long n = 0; + try { + for( set<DocID>::iterator i = h.toRefetch.begin(); i != h.toRefetch.end(); i++ ) { + d = *i; + + assert( !d._id.eoo() ); + + { + /* TODO : slow. lots of round trips. 
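+                       (one findOne round trip per document to refetch; batching,
+                       e.g. one $in query per collection, would cut the trips)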
*/ + n++; + bo good= them->findOne(d.ns, d._id.wrap(), NULL, QueryOption_SlaveOk).getOwned(); + totSize += good.objsize(); + uassert( 13410, "replSet too much data to roll back", totSize < 300 * 1024 * 1024 ); + + // note good might be eoo, indicating we should delete it + goodVersions.push_back(pair<DocID,bo>(d,good)); + } + } + newMinValid = r.getLastOp(rsoplog); + if( newMinValid.isEmpty() ) { + sethbmsg("rollback error newMinValid empty?"); + return; + } + } + catch(DBException& e) { + sethbmsg(str::stream() << "rollback re-get objects: " << e.toString(),0); + log() << "rollback couldn't re-get ns:" << d.ns << " _id:" << d._id << ' ' << n << '/' << h.toRefetch.size() << rsLog; + throw e; + } + + MemoryMappedFile::flushAll(true); + + sethbmsg("rollback 3.5"); + if( h.rbid != getRBID(r.conn()) ) { + // our source rolled back itself. so the data we received isn't necessarily consistent. + sethbmsg("rollback rbid on source changed during rollback, cancelling this attempt"); + return; + } + + // update them + sethbmsg(str::stream() << "rollback 4 n:" << goodVersions.size()); + + bool warn = false; + + assert( !h.commonPointOurDiskloc.isNull() ); + + mongo::d.dbMutex.assertWriteLocked(); + + /* we have items we are writing that aren't from a point-in-time. thus best not to come online + until we get to that point in freshness. */ + setMinValid(newMinValid); + + /** any full collection resyncs required? */ + if( !h.collectionsToResync.empty() ) { + for( set<string>::iterator i = h.collectionsToResync.begin(); i != h.collectionsToResync.end(); i++ ) { + string ns = *i; + sethbmsg(str::stream() << "rollback 4.1 coll resync " << ns); + + Client::Context c(ns); + { + bob res; + string errmsg; + dropCollection(ns, errmsg, res); + { + dbtemprelease r; + bool ok = copyCollectionFromRemote(them->getServerAddress(), ns, errmsg); + uassert(15909, str::stream() << "replSet rollback error resyncing collection " << ns << ' ' << errmsg, ok); + } + } + } + + /* we did more reading from primary, so check it again for a rollback (which would mess us up), and + make minValid newer. + */ + sethbmsg("rollback 4.2"); + { + string err; + try { + newMinValid = r.getLastOp(rsoplog); + if( newMinValid.isEmpty() ) { + err = "can't get minvalid from primary"; + } + else { + setMinValid(newMinValid); + } + } + catch (DBException&) { + err = "can't get/set minvalid"; + } + if( h.rbid != getRBID(r.conn()) ) { + // our source rolled back itself. so the data we received isn't necessarily consistent. + // however, we've now done writes. thus we have a problem. + err += "rbid at primary changed during resync/rollback"; + } + if( !err.empty() ) { + log() << "replSet error rolling back : " << err << ". A full resync will be necessary." << rsLog; + /* todo: reset minvalid so that we are permanently in fatal state */ + /* todo: don't be fatal, but rather, get all the data first. */ + sethbmsg("rollback error"); + throw rsfatal(); + } + } + sethbmsg("rollback 4.3"); + } + + sethbmsg("rollback 4.6"); + /** drop collections to drop before doing individual fixups - that might make things faster below actually if there were subsequent inserts to rollback */ + for( set<string>::iterator i = h.toDrop.begin(); i != h.toDrop.end(); i++ ) { + Client::Context c(*i); + try { + bob res; + string errmsg; + log(1) << "replSet rollback drop: " << *i << rsLog; + dropCollection(*i, errmsg, res); + } + catch(...) 
{ + log() << "replset rollback error dropping collection " << *i << rsLog; + } + } + + sethbmsg("rollback 4.7"); + Client::Context c(rsoplog); + NamespaceDetails *oplogDetails = nsdetails(rsoplog); + uassert(13423, str::stream() << "replSet error in rollback can't find " << rsoplog, oplogDetails); + + map<string,shared_ptr<RemoveSaver> > removeSavers; + + unsigned deletes = 0, updates = 0; + for( list<pair<DocID,bo> >::iterator i = goodVersions.begin(); i != goodVersions.end(); i++ ) { + const DocID& d = i->first; + bo pattern = d._id.wrap(); // { _id : ... } + try { + assert( d.ns && *d.ns ); + if( h.collectionsToResync.count(d.ns) ) { + /* we just synced this entire collection */ + continue; + } + + getDur().commitIfNeeded(); + + /* keep an archive of items rolled back */ + shared_ptr<RemoveSaver>& rs = removeSavers[d.ns]; + if ( ! rs ) + rs.reset( new RemoveSaver( "rollback" , "" , d.ns ) ); + + // todo: lots of overhead in context, this can be faster + Client::Context c(d.ns); + if( i->second.isEmpty() ) { + // wasn't on the primary; delete. + /* TODO1.6 : can't delete from a capped collection. need to handle that here. */ + deletes++; + + NamespaceDetails *nsd = nsdetails(d.ns); + if( nsd ) { + if( nsd->capped ) { + /* can't delete from a capped collection - so we truncate instead. if this item must go, + so must all successors!!! */ + try { + /** todo: IIRC cappedTrunateAfter does not handle completely empty. todo. */ + // this will crazy slow if no _id index. + long long start = Listener::getElapsedTimeMillis(); + DiskLoc loc = Helpers::findOne(d.ns, pattern, false); + if( Listener::getElapsedTimeMillis() - start > 200 ) + log() << "replSet warning roll back slow no _id index for " << d.ns << " perhaps?" << rsLog; + //would be faster but requires index: DiskLoc loc = Helpers::findById(nsd, pattern); + if( !loc.isNull() ) { + try { + nsd->cappedTruncateAfter(d.ns, loc, true); + } + catch(DBException& e) { + if( e.getCode() == 13415 ) { + // hack: need to just make cappedTruncate do this... + nsd->emptyCappedCollection(d.ns); + } + else { + throw; + } + } + } + } + catch(DBException& e) { + log() << "replSet error rolling back capped collection rec " << d.ns << ' ' << e.toString() << rsLog; + } + } + else { + try { + deletes++; + deleteObjects(d.ns, pattern, /*justone*/true, /*logop*/false, /*god*/true, rs.get() ); + } + catch(...) { + log() << "replSet error rollback delete failed ns:" << d.ns << rsLog; + } + } + // did we just empty the collection? if so let's check if it even exists on the source. + if( nsd->stats.nrecords == 0 ) { + try { + string sys = cc().database()->name + ".system.namespaces"; + bo o = them->findOne(sys, QUERY("name"<<d.ns)); + if( o.isEmpty() ) { + // we should drop + try { + bob res; + string errmsg; + dropCollection(d.ns, errmsg, res); + } + catch(...) { + log() << "replset error rolling back collection " << d.ns << rsLog; + } + } + } + catch(DBException& ) { + /* this isn't *that* big a deal, but is bad. */ + log() << "replSet warning rollback error querying for existence of " << d.ns << " at the primary, ignoring" << rsLog; + } + } + } + } + else { + // todo faster... 
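+                    // (the refetched "good" version from the sync source is upserted
+                    //  over the local copy; upsert also handles the case where the
+                    //  document no longer exists locally)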
+ OpDebug debug; + updates++; + _updateObjects(/*god*/true, d.ns, i->second, pattern, /*upsert=*/true, /*multi=*/false , /*logtheop=*/false , debug, rs.get() ); + } + } + catch(DBException& e) { + log() << "replSet exception in rollback ns:" << d.ns << ' ' << pattern.toString() << ' ' << e.toString() << " ndeletes:" << deletes << rsLog; + warn = true; + } + } + + removeSavers.clear(); // this effectively closes all of them + + sethbmsg(str::stream() << "rollback 5 d:" << deletes << " u:" << updates); + MemoryMappedFile::flushAll(true); + sethbmsg("rollback 6"); + + // clean up oplog + LOG(2) << "replSet rollback truncate oplog after " << h.commonPoint.toStringPretty() << rsLog; + // todo: fatal error if this throws? + oplogDetails->cappedTruncateAfter(rsoplog, h.commonPointOurDiskloc, false); + + /* reset cached lastoptimewritten and h value */ + loadLastOpTimeWritten(); + + sethbmsg("rollback 7"); + MemoryMappedFile::flushAll(true); + + // done + if( warn ) + sethbmsg("issues during syncRollback, see log"); + else + sethbmsg("rollback done"); + } + + void ReplSetImpl::syncRollback(OplogReader&r) { + unsigned s = _syncRollback(r); + if( s ) + sleepsecs(s); + } + + unsigned ReplSetImpl::_syncRollback(OplogReader&r) { + assert( !lockedByMe() ); + assert( !d.dbMutex.atLeastReadLocked() ); + + sethbmsg("rollback 0"); + + writelocktry lk(rsoplog, 20000); + if( !lk.got() ) { + sethbmsg("rollback couldn't get write lock in a reasonable time"); + return 2; + } + + if( state().secondary() ) { + /* by doing this, we will not service reads (return an error as we aren't in secondary staate. + that perhaps is moot becasue of the write lock above, but that write lock probably gets deferred + or removed or yielded later anyway. + + also, this is better for status reporting - we know what is happening. + */ + changeState(MemberState::RS_ROLLBACK); + } + + HowToFixUp how; + sethbmsg("rollback 1"); + { + r.resetCursor(); + + sethbmsg("rollback 2 FindCommonPoint"); + try { + syncRollbackFindCommonPoint(r.conn(), how); + } + catch( const char *p ) { + sethbmsg(string("rollback 2 error ") + p); + return 10; + } + catch( rsfatal& ) { + _fatal(); + return 2; + } + catch( DBException& e ) { + sethbmsg(string("rollback 2 exception ") + e.toString() + "; sleeping 1 min"); + dbtemprelease r; + sleepsecs(60); + throw; + } + } + + sethbmsg("replSet rollback 3 fixup"); + + { + incRBID(); + try { + syncFixUp(how, r); + } + catch( rsfatal& ) { + sethbmsg("rollback fixup error"); + _fatal(); + return 2; + } + catch(...) { + incRBID(); throw; + } + incRBID(); + + /* success - leave "ROLLBACK" state + can go to SECONDARY once minvalid is achieved + */ + changeState(MemberState::RS_RECOVERING); + } + + return 0; + } + +} diff --git a/src/mongo/db/repl/rs_sync.cpp b/src/mongo/db/repl/rs_sync.cpp new file mode 100644 index 00000000000..8bac981d951 --- /dev/null +++ b/src/mongo/db/repl/rs_sync.cpp @@ -0,0 +1,701 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. 
If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../client.h" +#include "../../client/dbclient.h" +#include "rs.h" +#include "../repl.h" +#include "connections.h" + +namespace mongo { + + using namespace bson; + extern unsigned replSetForceInitialSyncFailure; + + void NOINLINE_DECL blank(const BSONObj& o) { + if( *o.getStringField("op") != 'n' ) { + log() << "replSet skipping bad op in oplog: " << o.toString() << rsLog; + } + } + + /* apply the log op that is in param o + @return bool success (true) or failure (false) + */ + bool replset::SyncTail::syncApply(const BSONObj &o) { + const char *ns = o.getStringField("ns"); + if ( *ns == '.' || *ns == 0 ) { + blank(o); + return true; + } + + Client::Context ctx(ns); + ctx.getClient()->curop()->reset(); + return !applyOperation_inlock(o); + } + + /* initial oplog application, during initial sync, after cloning. + @return false on failure. + this method returns an error and doesn't throw exceptions (i think). + */ + bool ReplSetImpl::initialSyncOplogApplication(const OpTime& applyGTE, const OpTime& minValid) { + Member *source = 0; + OplogReader r; + + // keep trying to initial sync from oplog until we run out of targets + while ((source = _getOplogReader(r, applyGTE)) != 0) { + replset::InitialSync init(source->fullName()); + if (init.oplogApplication(r, source, applyGTE, minValid)) { + return true; + } + + r.resetConnection(); + veto(source->fullName(), 60); + log() << "replSet applying oplog from " << source->fullName() << " failed, trying again" << endl; + } + + log() << "replSet initial sync error: couldn't find oplog to sync from" << rsLog; + return false; + } + + bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source, + const OpTime& applyGTE, const OpTime& minValid) { + + const string hn = source->fullName(); + try { + r.tailingQueryGTE( rsoplog, applyGTE ); + if ( !r.haveCursor() ) { + log() << "replSet initial sync oplog query error" << rsLog; + return false; + } + + { + if( !r.more() ) { + sethbmsg("replSet initial sync error reading remote oplog"); + log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog; + return false; + } + bo op = r.next(); + OpTime t = op["ts"]._opTime(); + r.putBack(op); + + if( op.firstElementFieldName() == string("$err") ) { + log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog; + return false; + } + + uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() ); + if( t > applyGTE ) { + sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync"); + log() << "replSet initial sync expected first optime of " << applyGTE << rsLog; + log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog; + return false; + } + + sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at " + << t.toStringPretty() << " to " << minValid.toStringPretty()); + } + } + catch(DBException& e) { + log() << "replSet initial sync failing: " << e.toString() << rsLog; + return false; + } + + /* we lock outside the loop to avoid the overhead of locking on every operation. 
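+           note this is the global write lock, held across the whole initial-sync
+           apply pass; the node is not readable during initial sync anyway, so
+           blocking readers here is acceptable.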
*/ + writelock lk(""); + + // todo : use exhaust + OpTime ts; + time_t start = time(0); + unsigned long long n = 0; + int fails = 0; + while( ts < minValid ) { + try { + // There are some special cases with initial sync (see the catch block), so we + // don't want to break out of this while until we've reached minvalid. Thus, we'll + // keep trying to requery. + if( !r.more() ) { + OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl; + sleepsecs(1); + + r.resetCursor(); + r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten); + if ( !r.haveCursor() ) { + if (fails++ > 30) { + log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl; + return false; + } + } + + continue; + } + + BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */ + ts = o["ts"]._opTime(); + + { + if( (source->state() != MemberState::RS_PRIMARY && + source->state() != MemberState::RS_SECONDARY) || + replSetForceInitialSyncFailure ) { + + int f = replSetForceInitialSyncFailure; + if( f > 0 ) { + replSetForceInitialSyncFailure = f-1; + log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog; + throw DBException("forced error",0); + } + log() << "replSet we are now primary" << rsLog; + throw DBException("primary changed",0); + } + + applyOp(o, applyGTE); + } + + if ( ++n % 1000 == 0 ) { + time_t now = time(0); + if (now - start > 10) { + // simple progress metering + log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to " + << ts.toStringPretty() << rsLog; + start = now; + } + } + + getDur().commitIfNeeded(); + } + catch (DBException& e) { + // Skip duplicate key exceptions. + // These are relatively common on initial sync: if a document is inserted + // early in the clone step, the insert will be replayed but the document + // will probably already have been cloned over. + if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) { + continue; + } + + // handle cursor not found (just requery) + if( e.getCode() == 13127 ) { + log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl; + r.resetCursor(); + r.tailingQueryGTE(rsoplog, ts); + if( r.haveCursor() ) { + continue; + } + } + + // TODO: handle server restart + + if( ts <= minValid ) { + // didn't make it far enough + log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog; + return false; + } + + // otherwise, whatever, we'll break out of the loop and catch + // anything that's really wrong in syncTail + } + } + return true; + } + + void replset::InitialSync::applyOp(const BSONObj& o, const OpTime& applyGTE) { + OpTime ts = o["ts"]._opTime(); + + // optimes before we started copying need not be applied. + if( ts >= applyGTE ) { + if (!syncApply(o)) { + if (shouldRetry(o)) { + uassert(15915, "replSet update still fails after adding missing object", syncApply(o)); + } + } + } + + // with repl sets we write the ops to our oplog, too + _logOpObjRS(o); + } + + /* should be in RECOVERING state on arrival here. 
+ readlocks + @return true if transitioned to SECONDARY + */ + bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) { + bool golive = false; + + { + lock lk( this ); + + if (_maintenanceMode > 0) { + // we're not actually going live + return true; + } + } + + { + readlock lk("local.replset.minvalid"); + BSONObj mv; + if( Helpers::getSingleton("local.replset.minvalid", mv) ) { + minvalid = mv["ts"]._opTime(); + if( minvalid <= lastOpTimeWritten ) { + golive=true; + } + } + else + golive = true; /* must have been the original member */ + } + if( golive ) { + sethbmsg(""); + changeState(MemberState::RS_SECONDARY); + } + return golive; + } + + bool ReplSetImpl::_isStale(OplogReader& r, const OpTime& startTs, BSONObj& remoteOldestOp) { + remoteOldestOp = r.findOne(rsoplog, Query()); + OpTime remoteTs = remoteOldestOp["ts"]._opTime(); + DEV log() << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog; + else LOG(3) << "replSet remoteOldestOp: " << remoteTs.toStringLong() << rsLog; + DEV { + log() << "replSet lastOpTimeWritten: " << lastOpTimeWritten.toStringLong() << rsLog; + log() << "replSet our state: " << state().toString() << rsLog; + } + if( startTs >= remoteTs ) { + return false; + } + + return true; + } + + Member* ReplSetImpl::_getOplogReader(OplogReader& r, const OpTime& minTS) { + Member *target = 0, *stale = 0; + BSONObj oldest; + + assert(r.conn() == 0); + + while ((target = getMemberToSyncTo()) != 0) { + string current = target->fullName(); + + if( !r.connect(current) ) { + log(2) << "replSet can't connect to " << current << " to read operations" << rsLog; + r.resetConnection(); + veto(current); + continue; + } + + if( !minTS.isNull() && _isStale(r, minTS, oldest) ) { + r.resetConnection(); + veto(current, 600); + stale = target; + continue; + } + + // if we made it here, the target is up and not stale + return target; + } + + // the only viable sync target was stale + if (stale) { + log() << "replSet error RS102 too stale to catch up, at least from " << stale->fullName() << rsLog; + log() << "replSet our last optime : " << lastOpTimeWritten.toStringLong() << rsLog; + log() << "replSet oldest at " << stale->fullName() << " : " << oldest["ts"]._opTime().toStringLong() << rsLog; + log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog; + + // reset minvalid so that we can't become primary prematurely + { + writelock lk("local.replset.minvalid"); + Helpers::putSingleton("local.replset.minvalid", oldest); + } + + sethbmsg("error RS102 too stale to catch up"); + changeState(MemberState::RS_RECOVERING); + sleepsecs(120); + } + + return 0; + } + + /* tail an oplog. ok to return, will be re-called. */ + void ReplSetImpl::syncTail() { + // todo : locking vis a vis the mgr... + OplogReader r; + string hn; + + // find a target to sync from the last op time written + Member* target = _getOplogReader(r, lastOpTimeWritten); + + // no server found + if (target == 0) { + // if there is no one to sync from + OpTime minvalid; + tryToGoLiveAsASecondary(minvalid); + return; + } + + r.tailingQueryGTE(rsoplog, lastOpTimeWritten); + // if target cut connections between connecting and querying (for + // example, because it stepped down) we might not have a cursor + if ( !r.haveCursor() ) { + return; + } + + uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() ); + + { + if( !r.more() ) { + /* maybe we are ahead and need to roll back? 
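+                   ("ahead" here means our last applied optime is newer than the
+                   newest entry in the sync target's oplog, e.g. we were primary
+                   and accepted writes that never replicated before stepping down)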
*/ + try { + bo theirLastOp = r.getLastOp(rsoplog); + if( theirLastOp.isEmpty() ) { + log() << "replSet error empty query result from " << hn << " oplog" << rsLog; + sleepsecs(2); + return; + } + OpTime theirTS = theirLastOp["ts"]._opTime(); + if( theirTS < lastOpTimeWritten ) { + log() << "replSet we are ahead of the primary, will try to roll back" << rsLog; + syncRollback(r); + return; + } + /* we're not ahead? maybe our new query got fresher data. best to come back and try again */ + log() << "replSet syncTail condition 1" << rsLog; + sleepsecs(1); + } + catch(DBException& e) { + log() << "replSet error querying " << hn << ' ' << e.toString() << rsLog; + veto(target->fullName()); + sleepsecs(2); + } + return; + } + + BSONObj o = r.nextSafe(); + OpTime ts = o["ts"]._opTime(); + long long h = o["h"].numberLong(); + if( ts != lastOpTimeWritten || h != lastH ) { + log() << "replSet our last op time written: " << lastOpTimeWritten.toStringPretty() << rsLog; + log() << "replset source's GTE: " << ts.toStringPretty() << rsLog; + syncRollback(r); + return; + } + } + + /* we have now checked if we need to rollback and we either don't have to or did it. */ + { + OpTime minvalid; + tryToGoLiveAsASecondary(minvalid); + } + + while( 1 ) { + { + Timer timeInWriteLock; + writelock lk(""); + while( 1 ) { + if( !r.moreInCurrentBatch() ) { + dbtemprelease tempRelease; + { + // we need to occasionally check some things. between + // batches is probably a good time. + if( state().recovering() ) { // perhaps we should check this earlier? but not before the rollback checks. + /* can we go to RS_SECONDARY state? we can if not too old and if minvalid achieved */ + OpTime minvalid; + bool golive = ReplSetImpl::tryToGoLiveAsASecondary(minvalid); + if( golive ) { + ; + } + else { + sethbmsg(str::stream() << "still syncing, not yet to minValid optime" << minvalid.toString()); + } + // todo: too stale capability + } + if( !target->hbinfo().hbstate.readable() ) { + return; + } + } + r.more(); // to make the requestmore outside the db lock, which obviously is quite important + } + if( timeInWriteLock.micros() > 1000 ) { + dbtemprelease tempRelease; + timeInWriteLock.reset(); + } + if( !r.more() ) + break; + { + BSONObj o = r.nextSafe(); // note we might get "not master" at some point + + int sd = myConfig().slaveDelay; + // ignore slaveDelay if the box is still initializing. once + // it becomes secondary we can worry about it. + if( sd && box.getState().secondary() ) { + const OpTime ts = o["ts"]._opTime(); + long long a = ts.getSecs(); + long long b = time(0); + long long lag = b - a; + long long sleeptime = sd - lag; + if( sleeptime > 0 ) { + dbtemprelease tempRelease; + uassert(12000, "rs slaveDelay differential too big check clocks and systems", sleeptime < 0x40000000); + if( sleeptime < 60 ) { + sleepsecs((int) sleeptime); + } + else { + log() << "replSet slavedelay sleep long time: " << sleeptime << rsLog; + // sleep(hours) would prevent reconfigs from taking effect & such! + long long waitUntil = b + sleeptime; + while( 1 ) { + sleepsecs(6); + if( time(0) >= waitUntil ) + break; + + if( !target->hbinfo().hbstate.readable() ) { + break; + } + + if( myConfig().slaveDelay != sd ) // reconf + break; + } + } + } + } // endif slaveDelay + + d.dbMutex.assertWriteLocked(); + try { + /* if we have become primary, we dont' want to apply things from elsewhere + anymore. assumePrimary is in the db lock so we are safe as long as + we check after we locked above. 
*/ + if( box.getState().primary() ) { + log(0) << "replSet stopping syncTail we are now primary" << rsLog; + return; + } + + // TODO: make this whole method a member of SyncTail (SERVER-4444) + replset::SyncTail tail(""); + tail.syncApply(o); + _logOpObjRS(o); // with repl sets we write the ops to our oplog too + } + catch (DBException& e) { + sethbmsg(str::stream() << "syncTail: " << e.toString() << ", syncing: " << o); + veto(target->fullName(), 300); + sleepsecs(30); + return; + } + } + } // end while + } // end writelock scope + + r.tailCheck(); + if( !r.haveCursor() ) { + LOG(1) << "replSet end syncTail pass with " << hn << rsLog; + // TODO : reuse our connection to the primary. + return; + } + + if( !target->hbinfo().hbstate.readable() ) { + return; + } + // looping back is ok because this is a tailable cursor + } + } + + void ReplSetImpl::_syncThread() { + StateBox::SP sp = box.get(); + if( sp.state.primary() ) { + sleepsecs(1); + return; + } + if( _blockSync || sp.state.fatal() || sp.state.startup() ) { + sleepsecs(5); + return; + } + + /* do we have anything at all? */ + if( lastOpTimeWritten.isNull() ) { + syncDoInitialSync(); + return; // _syncThread will be recalled, starts from top again in case sync failed. + } + + /* we have some data. continue tailing. */ + syncTail(); + } + + void ReplSetImpl::syncThread() { + while( 1 ) { + // After a reconfig, we may not be in the replica set anymore, so + // check that we are in the set (and not an arbiter) before + // trying to sync with other replicas. + if( ! _self ) { + log() << "replSet warning did not detect own host and port, not syncing, config: " << theReplSet->config() << rsLog; + return; + } + if( myConfig().arbiterOnly ) { + return; + } + + try { + _syncThread(); + } + catch(DBException& e) { + sethbmsg(str::stream() << "syncThread: " << e.toString()); + sleepsecs(10); + } + catch(...) { + sethbmsg("unexpected exception in syncThread()"); + // TODO : SET NOT SECONDARY here? + sleepsecs(60); + } + sleepsecs(1); + + /* normally msgCheckNewState gets called periodically, but in a single node repl set there + are no heartbeat threads, so we do it here to be sure. this is relevant if the singleton + member has done a stepDown() and needs to come back up. + */ + OCCASIONALLY { + mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) ); + } + } + } + + void startSyncThread() { + static int n; + if( n != 0 ) { + log() << "replSet ERROR : more than one sync thread?" 
<< rsLog; + assert( n == 0 ); + } + n++; + + Client::initThread("rsSync"); + cc().iAmSyncThread(); // for isSyncThread() (which is used not used much, is used in secondary create index code + replLocalAuth(); + theReplSet->syncThread(); + cc().shutdown(); + } + + void GhostSync::starting() { + Client::initThread("rsGhostSync"); + replLocalAuth(); + } + + void ReplSetImpl::blockSync(bool block) { + _blockSync = block; + if (_blockSync) { + // syncing is how we get into SECONDARY state, so we'll be stuck in + // RECOVERING until we unblock + changeState(MemberState::RS_RECOVERING); + } + } + + void GhostSync::associateSlave(const BSONObj& id, const int memberId) { + const OID rid = id["_id"].OID(); + rwlock lk( _lock , true ); + shared_ptr<GhostSlave> &g = _ghostCache[rid]; + if( g.get() == 0 ) { + g.reset( new GhostSlave() ); + wassert( _ghostCache.size() < 10000 ); + } + GhostSlave &slave = *g; + if (slave.init) { + LOG(1) << "tracking " << slave.slave->h().toString() << " as " << rid << rsLog; + return; + } + + slave.slave = (Member*)rs->findById(memberId); + if (slave.slave != 0) { + slave.init = true; + } + else { + log() << "replset couldn't find a slave with id " << memberId + << ", not tracking " << rid << rsLog; + } + } + + void GhostSync::updateSlave(const mongo::OID& rid, const OpTime& last) { + rwlock lk( _lock , false ); + MAP::iterator i = _ghostCache.find( rid ); + if ( i == _ghostCache.end() ) { + OCCASIONALLY warning() << "couldn't update slave " << rid << " no entry" << rsLog; + return; + } + + GhostSlave& slave = *(i->second); + if (!slave.init) { + OCCASIONALLY log() << "couldn't update slave " << rid << " not init" << rsLog; + return; + } + + ((ReplSetConfig::MemberCfg)slave.slave->config()).updateGroups(last); + } + + void GhostSync::percolate(const BSONObj& id, const OpTime& last) { + const OID rid = id["_id"].OID(); + GhostSlave* slave; + { + rwlock lk( _lock , false ); + + MAP::iterator i = _ghostCache.find( rid ); + if ( i == _ghostCache.end() ) { + OCCASIONALLY log() << "couldn't percolate slave " << rid << " no entry" << rsLog; + return; + } + + slave = i->second.get(); + if (!slave->init) { + OCCASIONALLY log() << "couldn't percolate slave " << rid << " not init" << rsLog; + return; + } + } + + assert(slave->slave); + + const Member *target = rs->_currentSyncTarget; + if (!target || rs->box.getState().primary() + // we are currently syncing from someone who's syncing from us + // the target might end up with a new Member, but s.slave never + // changes so we'll compare the names + || target == slave->slave || target->fullName() == slave->slave->fullName()) { + LOG(1) << "replica set ghost target no good" << endl; + return; + } + + try { + if (!slave->reader.haveCursor()) { + if (!slave->reader.connect(id, slave->slave->id(), target->fullName())) { + // error message logged in OplogReader::connect + return; + } + slave->reader.ghostQueryGTE(rsoplog, last); + } + + LOG(1) << "replSet last: " << slave->last.toString() << " to " << last.toString() << rsLog; + if (slave->last > last) { + return; + } + + while (slave->last <= last) { + if (!slave->reader.more()) { + // we'll be back + return; + } + + BSONObj o = slave->reader.nextSafe(); + slave->last = o["ts"]._opTime(); + } + LOG(2) << "now last is " << slave->last.toString() << rsLog; + } + catch (DBException& e) { + // we'll be back + LOG(2) << "replSet ghost sync error: " << e.what() << " for " + << slave->slave->fullName() << rsLog; + slave->reader.resetConnection(); + } + } +} diff --git 
a/src/mongo/db/repl/test.html b/src/mongo/db/repl/test.html new file mode 100644 index 00000000000..295ad2ef0e0 --- /dev/null +++ b/src/mongo/db/repl/test.html @@ -0,0 +1,11 @@ +<HTML>
+<BODY>
+<!-- see also jstests/rs/ -->
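+<!-- assumes two mongods running locally whose http status ports are 28000 and
+     28001, i.e. started on ports 27000/27001 (web port = mongod port + 1000) -->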
+<iframe src="http://127.0.0.1:28000/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+<iframe src="http://127.0.0.1:28001/_replSet" width="100%" height="50%" frameborder=1>
+</iframe>
+
+</BODY>
+</HTML>
diff --git a/src/mongo/db/repl/testing.js b/src/mongo/db/repl/testing.js new file mode 100644 index 00000000000..d741cf3a644 --- /dev/null +++ b/src/mongo/db/repl/testing.js @@ -0,0 +1,42 @@ +// helpers for testing repl sets
+// run
+// mongo --shell <host:port> testing.js
+
+cfg = {
+ _id: 'asdf',
+ members: [
+ { _id : 0, host : "dm_hp" },
+ { _id : 2, host : "dm_hp:27002" }
+ ]
+};
+c2 = {
+ _id: 'asdf',
+ members: [
+ { _id: 0, host: "dmthink" },
+ { _id: 2, host: "dmthink:27002" }
+ ]
+};
+
+db = db.getSisterDB("admin");
+local = db.getSisterDB("local");
+
+print("\n\ndb = admin db on localhost:27017");
+print("b = admin on localhost:27002");
+print("rc(x) = db.runCommand(x)");
+print("cfg = samp replset config");
+print("i() = replSetInitiate(cfg)");
+print("ism() = rc('ismaster')");
+print("\n\n");
+
+function rc(c) { return db.runCommand(c); }
+function i() { return rc({ replSetInitiate: cfg }); }
+function ism() { return rc("isMaster"); }
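+// hypothetical extra helper in the same spirit as i()/ism(): bump the config
+// version and reconfigure with cfg (assumes the set was initiated via i())
+function reconf() { cfg.version = (cfg.version || 1) + 1; return rc({ replSetReconfig: cfg }); }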
+
+b = 0;
+try {
+ b = new Mongo("localhost:27002").getDB("admin");
+}
+catch (e) {
+ print("\nCouldn't connect to b mongod instance\n");
+}
+
diff --git a/src/mongo/db/repl_block.cpp b/src/mongo/db/repl_block.cpp new file mode 100644 index 00000000000..1776225505c --- /dev/null +++ b/src/mongo/db/repl_block.cpp @@ -0,0 +1,256 @@ +// repl_block.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "repl.h" +#include "repl_block.h" +#include "instance.h" +#include "dbhelpers.h" +#include "../util/background.h" +#include "../util/mongoutils/str.h" +#include "../client/dbclient.h" +#include "replutil.h" + +//#define REPLDEBUG(x) log() << "replBlock: " << x << endl; +#define REPLDEBUG(x) + +namespace mongo { + + using namespace mongoutils; + + class SlaveTracking : public BackgroundJob { + public: + string name() const { return "SlaveTracking"; } + + static const char * NS; + + struct Ident { + + Ident(const BSONObj& r, const string& h, const string& n) { + BSONObjBuilder b; + b.appendElements( r ); + b.append( "host" , h ); + b.append( "ns" , n ); + obj = b.obj(); + } + + bool operator<( const Ident& other ) const { + return obj["_id"].OID() < other.obj["_id"].OID(); + } + + BSONObj obj; + }; + + struct Info { + Info() : loc(0) {} + ~Info() { + if ( loc && owned ) { + delete loc; + } + } + bool owned; // true if loc is a pointer of our creation (and not a pointer into a MMF) + OpTime * loc; + }; + + SlaveTracking() : _mutex("SlaveTracking") { + _dirty = false; + _started = false; + } + + void run() { + Client::initThread( "slaveTracking" ); + DBDirectClient db; + while ( ! inShutdown() ) { + sleepsecs( 1 ); + + if ( ! 
_dirty ) + continue; + + writelock lk(NS); + + list< pair<BSONObj,BSONObj> > todo; + + { + scoped_lock mylk(_mutex); + + for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++ ) { + BSONObjBuilder temp; + temp.appendTimestamp( "syncedTo" , i->second.loc[0].asDate() ); + todo.push_back( pair<BSONObj,BSONObj>( i->first.obj.getOwned() , + BSON( "$set" << temp.obj() ).getOwned() ) ); + } + } + + for ( list< pair<BSONObj,BSONObj> >::iterator i=todo.begin(); i!=todo.end(); i++ ) { + db.update( NS , i->first , i->second , true ); + } + + _dirty = false; + } + } + + void reset() { + scoped_lock mylk(_mutex); + _slaves.clear(); + } + + void update( const BSONObj& rid , const string& host , const string& ns , OpTime last ) { + REPLDEBUG( host << " " << rid << " " << ns << " " << last ); + + scoped_lock mylk(_mutex); + +#ifdef _DEBUG + MongoFileAllowWrites allowWrites; +#endif + + Ident ident(rid,host,ns); + Info& i = _slaves[ ident ]; + + if (theReplSet && theReplSet->isPrimary()) { + theReplSet->ghost->updateSlave(ident.obj["_id"].OID(), last); + } + + if ( i.loc ) { + if( i.owned ) + i.loc[0] = last; + else + getDur().setNoJournal(i.loc, &last, sizeof(last)); + return; + } + + d.dbMutex.assertAtLeastReadLocked(); + + BSONObj res; + if ( Helpers::findOne( NS , ident.obj , res ) ) { + assert( res["syncedTo"].type() ); + i.owned = false; + i.loc = (OpTime*)res["syncedTo"].value(); + getDur().setNoJournal(i.loc, &last, sizeof(last)); + return; + } + + i.owned = true; + i.loc = new OpTime(last); + _dirty = true; + + if ( ! _started ) { + // start background thread here since we definitely need it + _started = true; + go(); + } + + } + + bool opReplicatedEnough( OpTime op , BSONElement w ) { + RARELY { + REPLDEBUG( "looking for : " << op << " w=" << w ); + } + + if (w.isNumber()) { + return replicatedToNum(op, w.numberInt()); + } + + if (!theReplSet) { + return false; + } + + string wStr = w.String(); + if (wStr == "majority") { + // use the entire set, including arbiters, to prevent writing + // to a majority of the set but not a majority of voters + return replicatedToNum(op, theReplSet->config().getMajority()); + } + + map<string,ReplSetConfig::TagRule*>::const_iterator it = theReplSet->config().rules.find(wStr); + uassert(14830, str::stream() << "unrecognized getLastError mode: " << wStr, + it != theReplSet->config().rules.end()); + + return op <= (*it).second->last; + } + + bool replicatedToNum(OpTime& op, int w) { + if ( w <= 1 || ! 
_isMaster() ) + return true; + + w--; // now this is the # of slaves i need + scoped_lock mylk(_mutex); + for ( map<Ident,Info>::iterator i=_slaves.begin(); i!=_slaves.end(); i++) { + OpTime s = *(i->second.loc); + if ( s < op ) { + continue; + } + if ( --w == 0 ) + return true; + } + return w <= 0; + } + + unsigned getSlaveCount() const { + scoped_lock mylk(_mutex); + + return _slaves.size(); + } + + // need to be careful not to deadlock with this + mutable mongo::mutex _mutex; + map<Ident,Info> _slaves; + bool _dirty; + bool _started; + + } slaveTracking; + + const char * SlaveTracking::NS = "local.slaves"; + + void updateSlaveLocation( CurOp& curop, const char * ns , OpTime lastOp ) { + if ( lastOp.isNull() ) + return; + + assert( str::startsWith(ns, "local.oplog.") ); + + Client * c = curop.getClient(); + assert(c); + BSONObj rid = c->getRemoteID(); + if ( rid.isEmpty() ) + return; + + slaveTracking.update( rid , curop.getRemoteString( false ) , ns , lastOp ); + + if (theReplSet && !theReplSet->isPrimary()) { + // we don't know the slave's port, so we make the replica set keep + // a map of rids to slaves + log(2) << "percolating " << lastOp.toString() << " from " << rid << endl; + theReplSet->ghost->send( boost::bind(&GhostSync::percolate, theReplSet->ghost, rid, lastOp) ); + } + } + + bool opReplicatedEnough( OpTime op , BSONElement w ) { + return slaveTracking.opReplicatedEnough( op , w ); + } + + bool opReplicatedEnough( OpTime op , int w ) { + return slaveTracking.replicatedToNum( op , w ); + } + + void resetSlaveCache() { + slaveTracking.reset(); + } + + unsigned getSlaveCount() { + return slaveTracking.getSlaveCount(); + } +} diff --git a/src/mongo/db/repl_block.h b/src/mongo/db/repl_block.h new file mode 100644 index 00000000000..bb74deea10f --- /dev/null +++ b/src/mongo/db/repl_block.h @@ -0,0 +1,39 @@ +// repl_block.h - blocking on writes for replication + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../pch.h" +#include "client.h" +#include "curop.h" + +/** + local.slaves - current location for all slaves + + */ +namespace mongo { + + void updateSlaveLocation( CurOp& curop, const char * oplog_ns , OpTime lastOp ); + + /** @return true if op has made it to w servers */ + bool opReplicatedEnough( OpTime op , int w ); + bool opReplicatedEnough( OpTime op , BSONElement w ); + + void resetSlaveCache(); + unsigned getSlaveCount(); +} diff --git a/src/mongo/db/replutil.h b/src/mongo/db/replutil.h new file mode 100644 index 00000000000..6f4dbb875d2 --- /dev/null +++ b/src/mongo/db/replutil.h @@ -0,0 +1,102 @@ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "db.h" +#include "dbhelpers.h" +#include "json.h" +#include "../client/dbclient.h" +#include "repl.h" +#include "cmdline.h" +#include "repl/rs.h" +#include "ops/query.h" + +namespace mongo { + + extern const char *replAllDead; + + /* note we always return true for the "local" namespace. + + we should not allow most operations when not the master; + we also report not master if we are "dead". + + See also CmdIsMaster. + + If 'client' is not specified, the current client is used. + */ + inline bool _isMaster() { + if( replSet ) { + if( theReplSet ) + return theReplSet->isPrimary(); + return false; + } + + if( ! replSettings.slave ) + return true; + + if ( replAllDead ) + return false; + + if( replSettings.master ) { + // if running with --master --slave, allow. + return true; + } + + if ( cc().isGod() ) + return true; + + return false; + } + inline bool isMaster(const char * dbname = 0) { + if( _isMaster() ) + return true; + if ( ! dbname ) { + Database *database = cc().database(); + assert( database ); + dbname = database->name.c_str(); + } + return strcmp( dbname , "local" ) == 0; + } + inline bool isMasterNs( const char *ns ) { + if ( _isMaster() ) + return true; + assert( ns ); + if ( ! str::startsWith( ns , "local" ) ) + return false; + return ns[5] == 0 || ns[5] == '.'; + } + + inline void notMasterUnless(bool expr) { + uassert( 10107 , "not master" , expr ); + } + + /** we allow queries to SimpleSlave's */ + inline void replVerifyReadsOk(ParsedQuery& pq) { + if( replSet ) { + /* todo: speed up the secondary case. as written here there are 2 mutex entries, it can be 1. */ + if( isMaster() ) return; + uassert(13435, "not master and slaveOk=false", pq.hasOption(QueryOption_SlaveOk)); + uassert(13436, "not master or secondary; cannot currently read from this replSet member", theReplSet && theReplSet->isSecondary() ); + } + else { + notMasterUnless(isMaster() || pq.hasOption(QueryOption_SlaveOk) || replSettings.slave == SimpleSlave ); + } + } + + + +} // namespace mongo diff --git a/src/mongo/db/resource.h b/src/mongo/db/resource.h new file mode 100644 index 00000000000..9ba1ed26a0c --- /dev/null +++ b/src/mongo/db/resource.h @@ -0,0 +1,16 @@ +//{{NO_DEPENDENCIES}} +// Microsoft Visual C++ generated include file. +// Used by db.rc +// +#define IDI_ICON2 102 + +// Next default values for new objects +// +#ifdef APSTUDIO_INVOKED +#ifndef APSTUDIO_READONLY_SYMBOLS +#define _APS_NEXT_RESOURCE_VALUE 104 +#define _APS_NEXT_COMMAND_VALUE 40001 +#define _APS_NEXT_CONTROL_VALUE 1001 +#define _APS_NEXT_SYMED_VALUE 101 +#endif +#endif diff --git a/src/mongo/db/restapi.cpp b/src/mongo/db/restapi.cpp new file mode 100644 index 00000000000..370051354a2 --- /dev/null +++ b/src/mongo/db/restapi.cpp @@ -0,0 +1,294 @@ +/** @file restapi.cpp + web rest api +*/ +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation.
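
Before restapi.cpp, a worked reading of the replutil.h checks above, since the namespace cases are easy to mix up. On a replica-set secondary (replSet set, theReplSet->isPrimary() false), the functions behave as follows (illustrative values, derived from the code above):

    // _isMaster()                    -> false
    // isMaster("test")               -> false   writes to ordinary dbs rejected
    // isMaster("local")              -> true    "local" is always writable
    // isMasterNs("local.oplog.rs")   -> true    startsWith "local", ns[5] == '.'
    // isMasterNs("localish.coll")    -> false   ns[5] is 'i', not 0 or '.'
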
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "pch.h" +#include "../util/net/miniwebserver.h" +#include "../util/mongoutils/html.h" +#include "../util/md5.hpp" +#include "instance.h" +#include "dbwebserver.h" +#include "dbhelpers.h" +#include "repl.h" +#include "replutil.h" +#include "clientcursor.h" +#include "background.h" + +#include "restapi.h" + +namespace mongo { + + extern const char *replInfo; + bool getInitialSyncCompleted(); + + using namespace bson; + using namespace mongoutils::html; + + class RESTHandler : public DbWebHandler { + public: + RESTHandler() : DbWebHandler( "DUMMY REST" , 1000 , true ) {} + + virtual bool handles( const string& url ) const { + return + url[0] == '/' && + url.find_last_of( '/' ) > 0; + } + + virtual void handle( const char *rq, string url, BSONObj params, + string& responseMsg, int& responseCode, + vector<string>& headers, const SockAddr &from ) { + + string::size_type first = url.find( "/" , 1 ); + if ( first == string::npos ) { + responseCode = 400; + return; + } + + string method = MiniWebServer::parseMethod( rq ); + string dbname = url.substr( 1 , first - 1 ); + string coll = url.substr( first + 1 ); + string action = ""; + + string::size_type last = coll.find_last_of( "/" ); + if ( last == string::npos ) { + action = coll; + coll = "_defaultCollection"; + } + else { + action = coll.substr( last + 1 ); + coll = coll.substr( 0 , last ); + } + + for ( string::size_type i=0; i<coll.size(); i++ ) + if ( coll[i] == '/' ) + coll[i] = '.'; + + string fullns = MiniWebServer::urlDecode(dbname + "." + coll); + + headers.push_back( (string)"x-action: " + action ); + headers.push_back( (string)"x-ns: " + fullns ); + + bool html = false; + + stringstream ss; + + if ( method == "GET" ) { + responseCode = 200; + html = handleRESTQuery( fullns , action , params , responseCode , ss ); + } + else if ( method == "POST" ) { + responseCode = 201; + handlePost( fullns , MiniWebServer::body( rq ) , params , responseCode , ss ); + } + else { + responseCode = 400; + headers.push_back( "X_err: bad request" ); + ss << "don't know how to handle a [" << method << "]"; + out() << "don't know how to handle a [" << method << "]" << endl; + } + + if( html ) + headers.push_back("Content-Type: text/html;charset=utf-8"); + else + headers.push_back("Content-Type: text/plain;charset=utf-8"); + + responseMsg = ss.str(); + } + + bool handleRESTQuery( string ns , string action , BSONObj & params , int & responseCode , stringstream & out ) { + Timer t; + + int html = _getOption( params["html"] , 0 ); + int skip = _getOption( params["skip"] , 0 ); + int num = _getOption( params["limit"] , _getOption( params["count" ] , 1000 ) ); // count is old, limit is new + + int one = 0; + if ( params["one"].type() == String && tolower( params["one"].valuestr()[0] ) == 't' ) { + num = 1; + one = 1; + } + + BSONObjBuilder queryBuilder; + + BSONObjIterator i(params); + while ( i.more() ) { + BSONElement e = i.next(); + string name = e.fieldName(); + if ( ! 
name.find( "filter_" ) == 0 ) + continue; + + string field = name.substr(7); + const char * val = e.valuestr(); + + char * temp; + + // TODO: this is how i guess if something is a number. pretty lame right now + double number = strtod( val , &temp ); + if ( temp != val ) + queryBuilder.append( field , number ); + else + queryBuilder.append( field , val ); + } + + BSONObj query = queryBuilder.obj(); + auto_ptr<DBClientCursor> cursor = db.query( ns.c_str() , query, num , skip ); + uassert( 13085 , "query failed for dbwebserver" , cursor.get() ); + + if ( one ) { + if ( cursor->more() ) { + BSONObj obj = cursor->next(); + out << obj.jsonString(Strict,html?1:0) << '\n'; + } + else { + responseCode = 404; + } + return html != 0; + } + + if( html ) { + string title = string("query ") + ns; + out << start(title) + << p(title) + << "<pre>"; + } + else { + out << "{\n"; + out << " \"offset\" : " << skip << ",\n"; + out << " \"rows\": [\n"; + } + + int howMany = 0; + while ( cursor->more() ) { + if ( howMany++ && html == 0 ) + out << " ,\n"; + BSONObj obj = cursor->next(); + if( html ) { + if( out.tellp() > 4 * 1024 * 1024 ) { + out << "Stopping output: more than 4MB returned and in html mode\n"; + break; + } + out << obj.jsonString(Strict, html?1:0) << "\n\n"; + } + else { + if( out.tellp() > 50 * 1024 * 1024 ) // 50MB limit - we are using ram + break; + out << " " << obj.jsonString(); + } + } + + if( html ) { + out << "</pre>\n"; + if( howMany == 0 ) out << p("Collection is empty"); + out << _end(); + } + else { + out << "\n ],\n\n"; + out << " \"total_rows\" : " << howMany << " ,\n"; + out << " \"query\" : " << query.jsonString() << " ,\n"; + out << " \"millis\" : " << t.millis() << '\n'; + out << "}\n"; + } + + return html != 0; + } + + // TODO Generate id and revision per couch POST spec + void handlePost( string ns, const char *body, BSONObj& params, int & responseCode, stringstream & out ) { + try { + BSONObj obj = fromjson( body ); + db.insert( ns.c_str(), obj ); + } + catch ( ... ) { + responseCode = 400; // Bad Request. Seems reasonable for now. + out << "{ \"ok\" : false }"; + return; + } + + responseCode = 201; + out << "{ \"ok\" : true }"; + } + + int _getOption( BSONElement e , int def ) { + if ( e.isNumber() ) + return e.numberInt(); + if ( e.type() == String ) + return atoi( e.valuestr() ); + return def; + } + + DBDirectClient db; + + } restHandler; + + bool RestAdminAccess::haveAdminUsers() const { + readlocktryassert rl("admin.system.users", 10000); + Client::Context cx( "admin.system.users", dbpath, false ); + return ! 
Helpers::isEmpty("admin.system.users", false); + } + + BSONObj RestAdminAccess::getAdminUser( const string& username ) const { + Client::GodScope gs; + readlocktryassert rl("admin.system.users", 10000); + Client::Context cx( "admin.system.users" ); + BSONObj user; + if ( Helpers::findOne( "admin.system.users" , BSON( "user" << username ) , user ) ) + return user.copy(); + return BSONObj(); + } + + class LowLevelMongodStatus : public WebStatusPlugin { + public: + LowLevelMongodStatus() : WebStatusPlugin( "overview" , 5 , "(only reported if can acquire read lock quickly)" ) {} + + virtual void init() {} + + void _gotLock( int millis , stringstream& ss ) { + ss << "<pre>\n"; + ss << "time to get readlock: " << millis << "ms\n"; + ss << "# databases: " << dbHolder().sizeInfo() << '\n'; + ss << "# Cursors: " << ClientCursor::numCursors() << '\n'; + ss << "replication: "; + if( *replInfo ) + ss << "\nreplInfo: " << replInfo << "\n\n"; + if( replSet ) { + ss << a("", "see replSetGetStatus link top of page") << "--replSet </a>" << cmdLine._replSet; + } + if ( replAllDead ) + ss << "\n<b>replication replAllDead=" << replAllDead << "</b>\n"; + else { + ss << "\nmaster: " << replSettings.master << '\n'; + ss << "slave: " << replSettings.slave << '\n'; + ss << '\n'; + } + + BackgroundOperation::dump(ss); + ss << "</pre>\n"; + } + + virtual void run( stringstream& ss ) { + Timer t; + readlocktry lk( "" , 300 ); + if ( lk.got() ) { + _gotLock( t.millis() , ss ); + } + else { + ss << "\n<b>timed out getting lock</b>\n"; + } + } + } lowLevelMongodStatus; +} diff --git a/src/mongo/db/restapi.h b/src/mongo/db/restapi.h new file mode 100644 index 00000000000..e5ac52083fe --- /dev/null +++ b/src/mongo/db/restapi.h @@ -0,0 +1,34 @@ +/** @file restapi.h + */ + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "../util/admin_access.h" + +namespace mongo { + + class RestAdminAccess : public AdminAccess { + public: + virtual ~RestAdminAccess() { } + + virtual bool haveAdminUsers() const; + virtual BSONObj getAdminUser( const string& username ) const; + }; + +} // namespace mongo diff --git a/src/mongo/db/scanandorder.cpp b/src/mongo/db/scanandorder.cpp new file mode 100644 index 00000000000..b5e282a5866 --- /dev/null +++ b/src/mongo/db/scanandorder.cpp @@ -0,0 +1,105 @@ +/* scanandorder.cpp + Order results (that aren't already indexes and in order.) +*/ + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "pch.h" +#include "scanandorder.h" + +namespace mongo { + + const unsigned ScanAndOrder::MaxScanAndOrderBytes = 32 * 1024 * 1024; + + void ScanAndOrder::_add(BSONObj& k, BSONObj o, DiskLoc* loc) { + if (!loc) { + _best.insert(make_pair(k.getOwned(),o.getOwned())); + } + else { + BSONObjBuilder b; + b.appendElements(o); + b.append("$diskLoc", loc->toBSONObj()); + _best.insert(make_pair(k.getOwned(), b.obj().getOwned())); + } + } + + void ScanAndOrder::_addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc) { + /* todo : we don't correct _approxSize here. */ + const BSONObj& worstBestKey = i->first; + int c = worstBestKey.woCompare(k, _order._spec.keyPattern); + if ( c > 0 ) { + // k is better, 'upgrade' + _best.erase(i); + _add(k, o, loc); + } + } + + + void ScanAndOrder::add(BSONObj o, DiskLoc* loc) { + assert( o.isValid() ); + BSONObj k; + try { + k = _order.getKeyFromObject(o); + } + catch (UserException &e) { + if ( e.getCode() == ParallelArraysCode ) { // cannot get keys for parallel arrays + // fix lasterror text to be more accurate. + uasserted( 15925, "cannot sort with keys that are parallel arrays" ); + } + else + throw; + } + + if ( k.isEmpty() ) { + return; + } + if ( (int) _best.size() < _limit ) { + _approxSize += k.objsize(); + _approxSize += o.objsize(); + + /* note : adjust when bson return limit adjusts. note this limit should be a bit higher. */ + uassert( 10128 , "too much data for sort() with no index. add an index or specify a smaller limit", _approxSize < MaxScanAndOrderBytes ); + + _add(k, o, loc); + return; + } + BestMap::iterator i; + assert( _best.end() != _best.begin() ); + i = _best.end(); + i--; + _addIfBetter(k, o, i, loc); + } + + + void ScanAndOrder::fill(BufBuilder& b, Projection *filter, int& nout ) const { + int n = 0; + int nFilled = 0; + for ( BestMap::const_iterator i = _best.begin(); i != _best.end(); i++ ) { + n++; + if ( n <= _startFrom ) + continue; + const BSONObj& o = i->second; + fillQueryResultFromObj(b, filter, o); + nFilled++; + if ( nFilled >= _limit ) + break; + uassert( 10129 , "too much data for sort() with no index", b.len() < (int)MaxScanAndOrderBytes ); // appserver limit + } + nout = nFilled; + } + +} // namespace mongo diff --git a/src/mongo/db/scanandorder.h b/src/mongo/db/scanandorder.h new file mode 100644 index 00000000000..33e76f61f67 --- /dev/null +++ b/src/mongo/db/scanandorder.h @@ -0,0 +1,111 @@ +/* scanandorder.h + Order results (that aren't already in index order.) +*/ + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +#pragma once + +#include "indexkey.h" +#include "queryutil.h" +#include "projection.h" + +namespace mongo { + + /* todo: + _ limit amount of data + */ + + class KeyType : boost::noncopyable { + public: + IndexSpec _spec; + FieldRangeVector _keyCutter; + public: + KeyType(BSONObj pattern, const FieldRangeSet &frs): + _spec((assert(!pattern.isEmpty()),pattern)), + _keyCutter(frs, _spec, 1) { + } + + /** + * @return first key of the object that would be encountered while + * scanning index with keySpec 'pattern' using constraints 'frs', or + * BSONObj() if no such key. + */ + BSONObj getKeyFromObject(BSONObj o) { + return _keyCutter.firstMatch(o); + } + }; + + /* todo: + _ respect limit + _ check for excess mem usage + _ response size limit from runquery; push it up a bit. + */ + + inline void fillQueryResultFromObj(BufBuilder& bb, Projection *filter, const BSONObj& js, DiskLoc* loc=NULL) { + if ( filter ) { + BSONObjBuilder b( bb ); + filter->transform( js , b ); + if (loc) + b.append("$diskLoc", loc->toBSONObj()); + b.done(); + } + else if (loc) { + BSONObjBuilder b( bb ); + b.appendElements(js); + b.append("$diskLoc", loc->toBSONObj()); + b.done(); + } + else { + bb.appendBuf((void*) js.objdata(), js.objsize()); + } + } + + typedef multimap<BSONObj,BSONObj,BSONObjCmp> BestMap; + class ScanAndOrder { + public: + static const unsigned MaxScanAndOrderBytes; + + ScanAndOrder(int startFrom, int limit, BSONObj order, const FieldRangeSet &frs) : + _best( BSONObjCmp( order ) ), + _startFrom(startFrom), _order(order, frs) { + _limit = limit > 0 ? limit + _startFrom : 0x7fffffff; + _approxSize = 0; + } + + int size() const { return _best.size(); } + + void add(BSONObj o, DiskLoc* loc); + + /* scanning complete. stick the query result in b for n objects. */ + void fill(BufBuilder& b, Projection *filter, int& nout ) const; + + private: + + void _add(BSONObj& k, BSONObj o, DiskLoc* loc); + + void _addIfBetter(BSONObj& k, BSONObj o, BestMap::iterator i, DiskLoc* loc); + + BestMap _best; // key -> full object + int _startFrom; + int _limit; // max to send back. + KeyType _order; + unsigned _approxSize; + + }; + +} // namespace mongo diff --git a/src/mongo/db/security.cpp b/src/mongo/db/security.cpp new file mode 100644 index 00000000000..c9b9bb40326 --- /dev/null +++ b/src/mongo/db/security.cpp @@ -0,0 +1,106 @@ +// security.cpp + +/** + * Copyright (C) 2009 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
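
ScanAndOrder above is a bounded top-k structure: it keeps at most startFrom + limit candidates in the ordered BestMap and, once full, _addIfBetter() swaps a new document in only when its key beats the current worst entry. The core idea, restated with plain STL types so it can be read in isolation (assumes k > 0):

    #include <map>
    #include <string>

    // Keep the k smallest keys seen so far; O(log k) per candidate.
    void addTopK( std::multimap<int, std::string>& best , size_t k ,
                  int key , const std::string& doc ) {
        if ( best.size() < k ) {
            best.insert( std::make_pair( key , doc ) );
            return;
        }
        std::multimap<int, std::string>::iterator worst = --best.end();
        if ( key < worst->first ) {   // beats the current worst: swap it in
            best.erase( worst );
            best.insert( std::make_pair( key , doc ) );
        }
    }
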
+ */ + +#include "pch.h" +#include "security.h" +#include "security_common.h" +#include "instance.h" +#include "client.h" +#include "curop-inl.h" +#include "db.h" +#include "dbhelpers.h" + +// this is the _mongod only_ implementation of security.h + +namespace mongo { + + bool AuthenticationInfo::_warned = false; + /* + void AuthenticationInfo::print() const { + cout << "AuthenticationInfo: " << this << '\n'; + for ( MA::const_iterator i=_dbs.begin(); i!=_dbs.end(); i++ ) { + cout << "\t" << i->first << "\t" << i->second.level << '\n'; + } + cout << "END" << endl; + } + */ + + string AuthenticationInfo::getUser( const string& dbname ) const { + scoped_spinlock lk(_lock); + + MA::const_iterator i = _dbs.find(dbname); + if ( i == _dbs.end() ) + return ""; + + return i->second.user; + } + + + bool AuthenticationInfo::_isAuthorizedSpecialChecks( const string& dbname ) const { + if ( cc().isGod() ) + return true; + + if ( isLocalHost ) { + Client::GodScope gs; + Client::ReadContext ctx("admin.system.users"); + BSONObj result; + if( ! Helpers::getSingleton("admin.system.users", result) ) { + if( ! _warned ) { + // you could get a few of these in a race, but that's ok + _warned = true; + log() << "note: no users configured in admin.system.users, allowing localhost access" << endl; + } + return true; + } + } + + return false; + } + + bool CmdAuthenticate::getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd) { + if (user == internalSecurity.user) { + uassert(15889, "key file must be used to log in with internal user", cmdLine.keyFile); + pwd = internalSecurity.pwd; + } + else { + // static BSONObj userPattern = fromjson("{\"user\":1}"); + string systemUsers = dbname + ".system.users"; + // OCCASIONALLY Helpers::ensureIndex(systemUsers.c_str(), userPattern, false, "user_1"); + { + BSONObjBuilder b; + b << "user" << user; + BSONObj query = b.done(); + if( !Helpers::findOne(systemUsers.c_str(), query, userObj) ) { + log() << "auth: couldn't find user " << user << ", " << systemUsers << endl; + return false; + } + } + + pwd = userObj.getStringField("pwd"); + } + return true; + } + + bool CmdLogout::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + AuthenticationInfo *ai = cc().getAuthenticationInfo(); + ai->logout(dbname); + return true; + } + +} // namespace mongo + diff --git a/src/mongo/db/security.h b/src/mongo/db/security.h new file mode 100755 index 00000000000..f193f305def --- /dev/null +++ b/src/mongo/db/security.h @@ -0,0 +1,113 @@ +// security.h + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
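
For context on getUserObj() in security.cpp above: the pwd field stored in <db>.system.users is a digest, not the cleartext password. The digest produced by DBClientConnection::createPasswordDigest() is md5 over user + ":mongo:" + password, hex-encoded. A sketch using the md5 helpers this tree already uses (passwordDigest is a hypothetical name; real callers go through the client library):

    // pwd = hex( md5( user + ":mongo:" + cleartextPassword ) )
    string passwordDigest( const string& user , const string& password ) {
        string s = user + ":mongo:" + password;
        md5digest d;
        md5_state_t st;
        md5_init( &st );
        md5_append( &st , (const md5_byte_t *) s.c_str() , s.size() );
        md5_finish( &st , d );
        return digestToString( d );
    }
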
+*/ + +#pragma once + +#include "nonce.h" +#include "concurrency.h" +#include "security_common.h" +#include "../util/concurrency/spin_lock.h" + +// this is used by both mongos and mongod + +namespace mongo { + + /* + * for a particular db + * levels + * 0 : none + * 1 : read + * 2 : write + */ + struct Auth { + + enum Level { NONE = 0 , READ = 1 , WRITE = 2 }; + + Auth() { level = NONE; } + Level level; + string user; + }; + + class AuthenticationInfo : boost::noncopyable { + public: + bool isLocalHost; + + AuthenticationInfo(){ isLocalHost = false; } + ~AuthenticationInfo() {} + + // -- modifiers ---- + + void logout(const string& dbname ) { + scoped_spinlock lk(_lock); + _dbs.erase(dbname); + } + void authorize(const string& dbname , const string& user ) { + scoped_spinlock lk(_lock); + _dbs[dbname].level = Auth::WRITE; + _dbs[dbname].user = user; + } + void authorizeReadOnly(const string& dbname , const string& user ) { + scoped_spinlock lk(_lock); + _dbs[dbname].level = Auth::READ; + _dbs[dbname].user = user; + } + + // -- accessors --- + + bool isAuthorized(const string& dbname) const { + return _isAuthorized( dbname, Auth::WRITE ); + } + + bool isAuthorizedReads(const string& dbname) const { + return _isAuthorized( dbname, Auth::READ ); + } + + /** + * @param lockType - this is from dbmutex 1 is write, 0 is read + */ + bool isAuthorizedForLock(const string& dbname, int lockType ) const { + return _isAuthorized( dbname , lockType > 0 ? Auth::WRITE : Auth::READ ); + } + + bool isAuthorizedForLevel( const string& dbname , Auth::Level level ) const { + return _isAuthorized( dbname , level ); + } + + string getUser( const string& dbname ) const; + + void print() const; + + protected: + /** takes a lock */ + bool _isAuthorized(const string& dbname, Auth::Level level) const; + + bool _isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const; + + /** cannot call this locked */ + bool _isAuthorizedSpecialChecks( const string& dbname ) const ; + + private: + mutable SpinLock _lock; + + typedef map<string,Auth> MA; + MA _dbs; // dbname -> auth + + static bool _warned; + }; + +} // namespace mongo diff --git a/src/mongo/db/security_commands.cpp b/src/mongo/db/security_commands.cpp new file mode 100644 index 00000000000..33dbd597c83 --- /dev/null +++ b/src/mongo/db/security_commands.cpp @@ -0,0 +1,150 @@ +// security_commands.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +// security.cpp links with both dbgrid and db. this file db only -- at least for now. + +// security.cpp + +#include "pch.h" +#include "security.h" +#include "../util/md5.hpp" +#include "json.h" +#include "pdfile.h" +#include "db.h" +#include "dbhelpers.h" +#include "commands.h" +#include "jsobj.h" +#include "client.h" + +namespace mongo { + + /* authentication + + system.users contains + { user : <username>, pwd : <pwd_digest>, ... 
} + + getnonce sends nonce to client + + client then sends { authenticate:1, nonce64:<nonce_str>, user:<username>, key:<key> } + + where <key> is md5(<nonce_str><username><pwd_digest_str>) as a string + */ + + boost::thread_specific_ptr<nonce64> lastNonce; + + class CmdGetNonce : public Command { + public: + virtual bool requiresAuth() { return false; } + virtual bool logTheOp() { return false; } + virtual bool slaveOk() const { + return true; + } + void help(stringstream& h) const { h << "internal"; } + virtual LockType locktype() const { return NONE; } + CmdGetNonce() : Command("getnonce") {} + bool run(const string&, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + nonce64 *n = new nonce64(Security::getNonce()); + stringstream ss; + ss << hex << *n; + result.append("nonce", ss.str() ); + lastNonce.reset(n); + return true; + } + } cmdGetNonce; + + CmdLogout cmdLogout; + + bool CmdAuthenticate::run(const string& dbname , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { + log() << " authenticate: " << cmdObj << endl; + + string user = cmdObj.getStringField("user"); + string key = cmdObj.getStringField("key"); + string received_nonce = cmdObj.getStringField("nonce"); + + if( user.empty() || key.empty() || received_nonce.empty() ) { + log() << "field missing/wrong type in received authenticate command " + << dbname + << endl; + errmsg = "auth fails"; + sleepmillis(10); + return false; + } + + stringstream digestBuilder; + + { + bool reject = false; + nonce64 *ln = lastNonce.release(); + if ( ln == 0 ) { + reject = true; + log(1) << "auth: no lastNonce" << endl; + } + else { + digestBuilder << hex << *ln; + reject = digestBuilder.str() != received_nonce; + if ( reject ) log(1) << "auth: different lastNonce" << endl; + } + + if ( reject ) { + log() << "auth: bad nonce received or getnonce not called. could be a driver bug or a security attack. db:" << dbname << endl; + errmsg = "auth fails"; + sleepmillis(30); + return false; + } + } + + BSONObj userObj; + string pwd; + if (!getUserObj(dbname, user, userObj, pwd)) { + errmsg = "auth fails"; + return false; + } + + md5digest d; + { + digestBuilder << user << pwd; + string done = digestBuilder.str(); + + md5_state_t st; + md5_init(&st); + md5_append(&st, (const md5_byte_t *) done.c_str(), done.size()); + md5_finish(&st, d); + } + + string computed = digestToString( d ); + + if ( key != computed ) { + log() << "auth: key mismatch " << user << ", ns:" << dbname << endl; + errmsg = "auth fails"; + return false; + } + + bool readOnly = userObj["readOnly"].trueValue(); + authenticate(dbname, user, readOnly ); + + + result.append( "dbname" , dbname ); + result.append( "user" , user ); + result.appendBool( "readOnly" , readOnly ); + + + return true; + } + + CmdAuthenticate cmdAuthenticate; + +} // namespace mongo diff --git a/src/mongo/db/security_common.cpp b/src/mongo/db/security_common.cpp new file mode 100644 index 00000000000..a480919c27e --- /dev/null +++ b/src/mongo/db/security_common.cpp @@ -0,0 +1,148 @@ +// security_common.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * This file contains inter-mongo instance security helpers. Due to the + * requirement that it be possible to compile this into mongos and mongod, it + * should not depend on much external stuff. + */ + +#include "pch.h" +#include "security.h" +#include "security_common.h" +#include "../client/dbclient.h" +#include "commands.h" +#include "nonce.h" +#include "../util/md5.hpp" +#include "client_common.h" +#include <sys/stat.h> + +namespace mongo { + + bool noauth = true; + AuthInfo internalSecurity; + + bool setUpSecurityKey(const string& filename) { + struct stat stats; + + // check obvious file errors + if (stat(filename.c_str(), &stats) == -1) { + log() << "error getting file " << filename << ": " << strerror(errno) << endl; + return false; + } + +#if !defined(_WIN32) + // check permissions: must be X00, where X is >= 4 + if ((stats.st_mode & (S_IRWXG|S_IRWXO)) != 0) { + log() << "permissions on " << filename << " are too open" << endl; + return false; + } +#endif + + const unsigned long long fileLength = stats.st_size; + if (fileLength < 6 || fileLength > 1024) { + log() << " key file " << filename << " has length " << stats.st_size + << ", must be between 6 and 1024 chars" << endl; + return false; + } + + FILE* file = fopen( filename.c_str(), "rb" ); + if (!file) { + log() << "error opening file: " << filename << ": " << strerror(errno) << endl; + return false; + } + + string str = ""; + + // strip key file + unsigned long long read = 0; + while (read < fileLength) { + char buf; + int readLength = fread(&buf, 1, 1, file); + if (readLength < 1) { + log() << "error reading file " << filename << endl; + return false; + } + read++; + + // check for whitespace + if ((buf >= '\x09' && buf <= '\x0D') || buf == ' ') { + continue; + } + + // check valid base64 + if ((buf < 'A' || buf > 'Z') && (buf < 'a' || buf > 'z') && (buf < '0' || buf > '9') && buf != '+' && buf != '/') { + log() << "invalid char in key file " << filename << ": " << buf << endl; + return false; + } + + str += buf; + } + + if (str.size() < 6) { + log() << "security key must be at least 6 characters" << endl; + return false; + } + + log(1) << "security key: " << str << endl; + + // createPWDigest should really not be a member func + DBClientConnection conn; + internalSecurity.pwd = conn.createPasswordDigest(internalSecurity.user, str); + + return true; + } + + void CmdAuthenticate::authenticate(const string& dbname, const string& user, const bool readOnly) { + ClientBasic* c = ClientBasic::getCurrent(); + assert(c); + AuthenticationInfo *ai = c->getAuthenticationInfo(); + + if ( readOnly ) { + ai->authorizeReadOnly( dbname , user ); + } + else { + ai->authorize( dbname , user ); + } + } + + + bool AuthenticationInfo::_isAuthorized(const string& dbname, Auth::Level level) const { + { + scoped_spinlock lk(_lock); + + if ( _isAuthorizedSingle_inlock( dbname , level ) ) + return true; + + if ( noauth ) + return true; + + if ( _isAuthorizedSingle_inlock( "admin" , level ) ) + return true; + + if ( _isAuthorizedSingle_inlock( "local" , level ) ) + return true; + } + return _isAuthorizedSpecialChecks( dbname ); + } + + bool AuthenticationInfo::_isAuthorizedSingle_inlock(const string& dbname, Auth::Level level) const { + MA::const_iterator i = _dbs.find(dbname); + return i != _dbs.end() && i->second.level 
>= level; + } + +} // namespace mongo diff --git a/src/mongo/db/security_common.h b/src/mongo/db/security_common.h new file mode 100644 index 00000000000..6615c6e573e --- /dev/null +++ b/src/mongo/db/security_common.h @@ -0,0 +1,85 @@ +// security_common.h + +/** +* Copyright (C) 2009 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include "commands.h" +#include "concurrency.h" +#include "../util/concurrency/spin_lock.h" + +namespace mongo { + + /** + * Internal secret key info. + */ + struct AuthInfo { + AuthInfo() { + user = "__system"; + } + string user; + string pwd; + }; + + // --noauth cmd line option + extern bool noauth; + extern AuthInfo internalSecurity; + + /** + * This method checks the validity of filename as a security key, hashes its + * contents, and stores it in the internalSecurity variable. Prints an + * error message to the logs if there's an error. + * @param filename the file containing the key + * @return if the key was successfully stored + */ + bool setUpSecurityKey(const string& filename); + + class CmdAuthenticate : public Command { + public: + virtual bool requiresAuth() { return false; } + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return true; + } + virtual LockType locktype() const { return READ; } + virtual void help(stringstream& ss) const { ss << "internal"; } + CmdAuthenticate() : Command("authenticate") {} + bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl); + void authenticate(const string& dbname, const string& user, const bool readOnly); + private: + bool getUserObj(const string& dbname, const string& user, BSONObj& userObj, string& pwd); + }; + + extern CmdAuthenticate cmdAuthenticate; + + class CmdLogout : public Command { + public: + virtual bool logTheOp() { + return false; + } + virtual bool slaveOk() const { + return true; + } + void help(stringstream& h) const { h << "de-authenticate"; } + virtual LockType locktype() const { return NONE; } + CmdLogout() : Command("logout") {} + bool run(const string& dbname , BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl); + }; + +} // namespace mongo diff --git a/src/mongo/db/stats/counters.cpp b/src/mongo/db/stats/counters.cpp new file mode 100644 index 00000000000..889e8a86c4c --- /dev/null +++ b/src/mongo/db/stats/counters.cpp @@ -0,0 +1,207 @@ +// counters.cpp +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
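
Before the counters code, one sketch tying the authentication pieces together: per the protocol comment in security_commands.cpp, the client never sends the stored digest itself; it hashes the server's nonce into a one-time key, which CmdAuthenticate::run() recomputes and compares. Client-side, that computation looks like this (computeAuthKey is a hypothetical name; pwdDigest is the stored system.users pwd value):

    // key = hex( md5( <nonce-as-hex> + user + pwdDigest ) )
    string computeAuthKey( nonce64 n , const string& user , const string& pwdDigest ) {
        stringstream ss;
        ss << hex << n;                 // same rendering getnonce used
        ss << user << pwdDigest;
        string s = ss.str();
        md5digest d;
        md5_state_t st;
        md5_init( &st );
        md5_append( &st , (const md5_byte_t *) s.c_str() , s.size() );
        md5_finish( &st , d );
        return digestToString( d );
    }
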
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +#include "pch.h" +#include "../jsobj.h" +#include "counters.h" + +namespace mongo { + + OpCounters::OpCounters() { + int zero = 0; + + BSONObjBuilder b; + b.append( "insert" , zero ); + b.append( "query" , zero ); + b.append( "update" , zero ); + b.append( "delete" , zero ); + b.append( "getmore" , zero ); + b.append( "command" , zero ); + _obj = b.obj(); + + _insert = (AtomicUInt*)_obj["insert"].value(); + _query = (AtomicUInt*)_obj["query"].value(); + _update = (AtomicUInt*)_obj["update"].value(); + _delete = (AtomicUInt*)_obj["delete"].value(); + _getmore = (AtomicUInt*)_obj["getmore"].value(); + _command = (AtomicUInt*)_obj["command"].value(); + } + + void OpCounters::gotOp( int op , bool isCommand ) { + switch ( op ) { + case dbInsert: /*gotInsert();*/ break; // need to handle multi-insert + case dbQuery: + if ( isCommand ) + gotCommand(); + else + gotQuery(); + break; + + case dbUpdate: gotUpdate(); break; + case dbDelete: gotDelete(); break; + case dbGetMore: gotGetMore(); break; + case dbKillCursors: + case opReply: + case dbMsg: + break; + default: log() << "OpCounters::gotOp unknown op: " << op << endl; + } + } + + BSONObj& OpCounters::getObj() { + const unsigned MAX = 1 << 30; + RARELY { + bool wrap = + _insert->get() > MAX || + _query->get() > MAX || + _update->get() > MAX || + _delete->get() > MAX || + _getmore->get() > MAX || + _command->get() > MAX; + + if ( wrap ) { + _insert->zero(); + _query->zero(); + _update->zero(); + _delete->zero(); + _getmore->zero(); + _command->zero(); + } + + } + return _obj; + } + + IndexCounters::IndexCounters() { + _memSupported = _pi.blockCheckSupported(); + + _btreeMemHits = 0; + _btreeMemMisses = 0; + _btreeAccesses = 0; + + + _maxAllowed = ( numeric_limits< long long >::max() ) / 2; + _resets = 0; + + _sampling = 0; + _samplingrate = 100; + } + + void IndexCounters::append( BSONObjBuilder& b ) { + if ( ! _memSupported ) { + b.append( "note" , "not supported on this platform" ); + return; + } + + BSONObjBuilder bb( b.subobjStart( "btree" ) ); + bb.appendNumber( "accesses" , _btreeAccesses ); + bb.appendNumber( "hits" , _btreeMemHits ); + bb.appendNumber( "misses" , _btreeMemMisses ); + + bb.append( "resets" , _resets ); + + bb.append( "missRatio" , (_btreeAccesses ? (_btreeMemMisses / (double)_btreeAccesses) : 0) ); + + bb.done(); + + if ( _btreeAccesses > _maxAllowed ) { + _btreeAccesses = 0; + _btreeMemMisses = 0; + _btreeMemHits = 0; + _resets++; + } + } + + FlushCounters::FlushCounters() + : _total_time(0) + , _flushes(0) + , _last() + {} + + void FlushCounters::flushed(int ms) { + _flushes++; + _total_time += ms; + _last_time = ms; + _last = jsTime(); + } + + void FlushCounters::append( BSONObjBuilder& b ) { + b.appendNumber( "flushes" , _flushes ); + b.appendNumber( "total_ms" , _total_time ); + b.appendNumber( "average_ms" , (_flushes ? 
(_total_time / double(_flushes)) : 0.0) ); + b.appendNumber( "last_ms" , _last_time ); + b.append("last_finished", _last); + } + + + void GenericCounter::hit( const string& name , int count ) { + scoped_lock lk( _mutex ); + _counts[name] += count; + } + + BSONObj GenericCounter::getObj() { + BSONObjBuilder b(128); + { + mongo::mutex::scoped_lock lk( _mutex ); + for ( map<string,long long>::iterator i=_counts.begin(); i!=_counts.end(); i++ ) { + b.appendNumber( i->first , i->second ); + } + } + return b.obj(); + } + + + void NetworkCounter::hit( long long bytesIn , long long bytesOut ) { + const long long MAX = 1ULL << 60; + + // don't care about the race as it's just a counter + bool overflow = _bytesIn > MAX || _bytesOut > MAX; + + if ( overflow ) { + _lock.lock(); + _overflows++; + _bytesIn = bytesIn; + _bytesOut = bytesOut; + _requests = 1; + _lock.unlock(); + } + else { + _lock.lock(); + _bytesIn += bytesIn; + _bytesOut += bytesOut; + _requests++; + _lock.unlock(); + } + } + + void NetworkCounter::append( BSONObjBuilder& b ) { + _lock.lock(); + b.appendNumber( "bytesIn" , _bytesIn ); + b.appendNumber( "bytesOut" , _bytesOut ); + b.appendNumber( "numRequests" , _requests ); + _lock.unlock(); + } + + + OpCounters globalOpCounters; + OpCounters replOpCounters; + IndexCounters globalIndexCounters; + FlushCounters globalFlushCounters; + NetworkCounter networkCounter; + +} diff --git a/src/mongo/db/stats/counters.h b/src/mongo/db/stats/counters.h new file mode 100644 index 00000000000..0cb29aa49aa --- /dev/null +++ b/src/mongo/db/stats/counters.h @@ -0,0 +1,159 @@ +// counters.h +/* + * Copyright (C) 2010 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#pragma once + +#include "../../pch.h" +#include "../jsobj.h" +#include "../../util/net/message.h" +#include "../../util/processinfo.h" +#include "../../util/concurrency/spin_lock.h" + +namespace mongo { + + /** + * for storing operation counters + * note: not thread safe. ok with that for speed + */ + class OpCounters { + public: + + OpCounters(); + + AtomicUInt * getInsert() { return _insert; } + AtomicUInt * getQuery() { return _query; } + AtomicUInt * getUpdate() { return _update; } + AtomicUInt * getDelete() { return _delete; } + AtomicUInt * getGetMore() { return _getmore; } + AtomicUInt * getCommand() { return _command; } + + void incInsertInWriteLock(int n) { _insert->x += n; } + void gotInsert() { _insert[0]++; } + void gotQuery() { _query[0]++; } + void gotUpdate() { _update[0]++; } + void gotDelete() { _delete[0]++; } + void gotGetMore() { _getmore[0]++; } + void gotCommand() { _command[0]++; } + + void gotOp( int op , bool isCommand ); + + BSONObj& getObj(); + + private: + BSONObj _obj; + + // todo: there will be a lot of cache line contention on these. need to do something + // else eventually.
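+ // (The six pointers below alias fields inside _obj's packed buffer -- they + // are wired up once in the constructor -- which is how getObj() can hand + // back live counters without copying, and also why they share cache lines.)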
+ AtomicUInt * _insert; + AtomicUInt * _query; + AtomicUInt * _update; + AtomicUInt * _delete; + AtomicUInt * _getmore; + AtomicUInt * _command; + }; + + extern OpCounters globalOpCounters; + extern OpCounters replOpCounters; + + + class IndexCounters { + public: + IndexCounters(); + + // used without a mutex intentionally (can race) + void btree( char * node ) { + if ( ! _memSupported ) + return; + if ( _sampling++ % _samplingrate ) + return; + btree( _pi.blockInMemory( node ) ); + } + + void btree( bool memHit ) { + if ( memHit ) + _btreeMemHits++; + else + _btreeMemMisses++; + _btreeAccesses++; + } + void btreeHit() { _btreeMemHits++; _btreeAccesses++; } + void btreeMiss() { _btreeMemMisses++; _btreeAccesses++; } + + void append( BSONObjBuilder& b ); + + private: + ProcessInfo _pi; + bool _memSupported; + + int _sampling; + int _samplingrate; + + int _resets; + long long _maxAllowed; + + long long _btreeMemMisses; + long long _btreeMemHits; + long long _btreeAccesses; + }; + + extern IndexCounters globalIndexCounters; + + class FlushCounters { + public: + FlushCounters(); + + void flushed(int ms); + + void append( BSONObjBuilder& b ); + + private: + long long _total_time; + long long _flushes; + int _last_time; + Date_t _last; + }; + + extern FlushCounters globalFlushCounters; + + + class GenericCounter { + public: + GenericCounter() : _mutex("GenericCounter") { } + void hit( const string& name , int count=1 ); + BSONObj getObj(); + private: + map<string,long long> _counts; // TODO: replace with thread safe map + mongo::mutex _mutex; + }; + + class NetworkCounter { + public: + NetworkCounter() : _bytesIn(0), _bytesOut(0), _requests(0), _overflows(0) {} + void hit( long long bytesIn , long long bytesOut ); + void append( BSONObjBuilder& b ); + private: + long long _bytesIn; + long long _bytesOut; + long long _requests; + + long long _overflows; + + SpinLock _lock; + }; + + extern NetworkCounter networkCounter; +} diff --git a/src/mongo/db/stats/fine_clock.h b/src/mongo/db/stats/fine_clock.h new file mode 100644 index 00000000000..02600e718c4 --- /dev/null +++ b/src/mongo/db/stats/fine_clock.h @@ -0,0 +1,67 @@ +// fine_clock.h + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef DB_STATS_FINE_CLOCK_HEADER +#define DB_STATS_FINE_CLOCK_HEADER + +#include <time.h> // struct timespec + +namespace mongo { + + /** + * This is a nano-second precision clock. We're skipping the + * hardware TSC in favor of clock_gettime() which on some systems + * does not involve a trip into the OS (VDSO). + * + * We're exporting a type WallTime that is and should remain + * opaque. The business of getting accurate time is still ongoing + * and we may change the internal representation of this class. + * (http://lwn.net/Articles/388188/) + * + * Really, you shouldn't be using this class in hot code paths on + * platforms where you're not sure the overhead is low.
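+ *
+ * Typical use, as an illustrative sketch (not from the original header):
+ *
+ *     FineClock::WallTime t0 = FineClock::now();
+ *     // ... timed work ...
+ *     uint64_t ns = FineClock::diffInNanos( FineClock::now() , t0 );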
+ */ + class FineClock { + public: + + typedef timespec WallTime; + + static WallTime now() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts; + } + + static uint64_t diffInNanos( WallTime end, WallTime start ) { + uint64_t diff; + if ( end.tv_nsec < start.tv_nsec ) { + diff = 1000000000 * ( end.tv_sec - start.tv_sec - 1); + diff += 1000000000 + end.tv_nsec - start.tv_nsec; + } + else { + diff = 1000000000 * ( end.tv_sec - start.tv_sec ); + diff += end.tv_nsec - start.tv_nsec; + } + return diff; + } + + }; +} + +#endif // DB_STATS_FINE_CLOCK_HEADER + diff --git a/src/mongo/db/stats/service_stats.cpp b/src/mongo/db/stats/service_stats.cpp new file mode 100644 index 00000000000..d69147fe969 --- /dev/null +++ b/src/mongo/db/stats/service_stats.cpp @@ -0,0 +1,68 @@ +// service_stats.cpp + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <sstream> + +#include "../../util/histogram.h" +#include "service_stats.h" + +namespace mongo { + + using std::ostringstream; + + ServiceStats::ServiceStats() { + // Time histogram covers up to 128msec in exponential intervals + // starting at 125usec. + Histogram::Options timeOpts; + timeOpts.numBuckets = 12; + timeOpts.bucketSize = 125; + timeOpts.exponential = true; + _timeHistogram = new Histogram( timeOpts ); + + // Space histogram covers up to 1MB in exponential intervals starting + // at 1K. + Histogram::Options spaceOpts; + spaceOpts.numBuckets = 12; + spaceOpts.bucketSize = 1024; + spaceOpts.exponential = true; + _spaceHistogram = new Histogram( spaceOpts ); + } + + ServiceStats::~ServiceStats() { + delete _timeHistogram; + delete _spaceHistogram; + } + + void ServiceStats::logResponse( uint64_t duration, uint64_t bytes ) { + _spinLock.lock(); + _timeHistogram->insert( duration / 1000 /* in usecs */ ); + _spaceHistogram->insert( bytes ); + _spinLock.unlock(); + } + + string ServiceStats::toHTML() const { + ostringstream res ; + res << "Cumulative wire stats\n" + << "Response times\n" << _timeHistogram->toHTML() + << "Response sizes\n" << _spaceHistogram->toHTML() + << '\n'; + + return res.str(); + } + +} // mongo diff --git a/src/mongo/db/stats/service_stats.h b/src/mongo/db/stats/service_stats.h new file mode 100644 index 00000000000..5b0e75fdcb9 --- /dev/null +++ b/src/mongo/db/stats/service_stats.h @@ -0,0 +1,66 @@ +// service_stats.h + +/** +* Copyright (C) 2010 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program.
If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef DB_STATS_SERVICE_STATS_HEADER +#define DB_STATS_SERVICE_STATS_HEADER + +#include <string> + +#include "../../util/concurrency/spin_lock.h" + +namespace mongo { + + using std::string; + + class Histogram; + + /** + * ServiceStats keeps track of the time a request/response message + * took inside a service as well as the size of the response + * generated. + */ + class ServiceStats { + public: + ServiceStats(); + ~ServiceStats(); + + /** + * Record the 'duration' in microseconds a request/response + * message took and the size in bytes of the generated + * response. + */ + void logResponse( uint64_t duration, uint64_t bytes ); + + /** + * Render the histogram as string that can be used inside an + * HTML doc. + */ + string toHTML() const; + + private: + SpinLock _spinLock; // protects state below + Histogram* _timeHistogram; + Histogram* _spaceHistogram; + + ServiceStats( const ServiceStats& ); + ServiceStats operator=( const ServiceStats& ); + }; + +} // namespace mongo + +#endif // DB_STATS_SERVICE_STATS_HEADER diff --git a/src/mongo/db/stats/snapshots.cpp b/src/mongo/db/stats/snapshots.cpp new file mode 100644 index 00000000000..900cc4ff1ad --- /dev/null +++ b/src/mongo/db/stats/snapshots.cpp @@ -0,0 +1,227 @@ +// snapshots.cpp + +/** +* Copyright (C) 2008 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
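
One note on ServiceStats above before the snapshot code: the header comment describes logResponse()'s duration as microseconds, but the implementation divides by 1000 before inserting into the time histogram, whose buckets are microsecond-scaled, so the value passed in is evidently nanoseconds; FineClock::diffInNanos() is the natural source. A usage sketch under that reading (timedService and responseBytes are hypothetical):

    void timedService( ServiceStats& stats ) {
        FineClock::WallTime t0 = FineClock::now();
        // ... handle one request, producing the response ...
        uint64_t responseBytes = 512;    // placeholder
        stats.logResponse( FineClock::diffInNanos( FineClock::now() , t0 ) ,
                           responseBytes );
    }
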
+*/ + +#include "pch.h" +#include "snapshots.h" +#include "../client.h" +#include "../clientcursor.h" +#include "../dbwebserver.h" +#include "../../util/mongoutils/html.h" + +/** + handles snapshotting performance metrics and other such things + */ +namespace mongo { + + void SnapshotData::takeSnapshot() { + _created = curTimeMicros64(); + _globalUsage = Top::global.getGlobalData(); +// _totalWriteLockedTime = d.dbMutex.info().getTimeLocked(); + Top::global.cloneMap(_usage); + } + + SnapshotDelta::SnapshotDelta( const SnapshotData& older , const SnapshotData& newer ) + : _older( older ) , _newer( newer ) { + assert( _newer._created > _older._created ); + _elapsed = _newer._created - _older._created; + } + + Top::CollectionData SnapshotDelta::globalUsageDiff() { + return Top::CollectionData( _older._globalUsage , _newer._globalUsage ); + } + Top::UsageMap SnapshotDelta::collectionUsageDiff() { + assert( _newer._created > _older._created ); + Top::UsageMap u; + + for ( Top::UsageMap::const_iterator i=_newer._usage.begin(); i != _newer._usage.end(); i++ ) { + Top::UsageMap::const_iterator j = _older._usage.find(i->first); + if (j != _older._usage.end()) + u[i->first] = Top::CollectionData( j->second , i->second ); + else + u[i->first] = i->second; + } + return u; + } + + Snapshots::Snapshots(int n) + : _lock("Snapshots"), _n(n) + , _snapshots(new SnapshotData[n]) + , _loc(0) + , _stored(0) + {} + + const SnapshotData* Snapshots::takeSnapshot() { + scoped_lock lk(_lock); + _loc = ( _loc + 1 ) % _n; + _snapshots[_loc].takeSnapshot(); + if ( _stored < _n ) + _stored++; + return &_snapshots[_loc]; + } + + auto_ptr<SnapshotDelta> Snapshots::computeDelta( int numBack ) { + scoped_lock lk(_lock); + auto_ptr<SnapshotDelta> p; + if ( numBack < numDeltas() ) + p.reset( new SnapshotDelta( getPrev(numBack+1) , getPrev(numBack) ) ); + return p; + } + + const SnapshotData& Snapshots::getPrev( int numBack ) { + int x = _loc - numBack; + if ( x < 0 ) + x += _n; + return _snapshots[x]; + } + + void Snapshots::outputLockInfoHTML( stringstream& ss ) { + scoped_lock lk(_lock); + ss << "\n<div>"; + for ( int i=0; i<numDeltas(); i++ ) { + SnapshotDelta d( getPrev(i+1) , getPrev(i) ); + unsigned e = (unsigned) d.elapsed() / 1000; + ss << (unsigned)(100*d.percentWriteLocked()); + if( e < 3900 || e > 4100 ) + ss << '(' << e / 1000.0 << "s)"; + ss << ' '; + } + ss << "</div>\n"; + } + + void SnapshotThread::run() { + Client::initThread("snapshotthread"); + Client& client = cc(); + + long long numLoops = 0; + + const SnapshotData* prev = 0; + + while ( ! 
inShutdown() ) { + try { + const SnapshotData* s = statsSnapshots.takeSnapshot(); + + if ( prev && cmdLine.cpu ) { + unsigned long long elapsed = s->_created - prev->_created; + SnapshotDelta d( *prev , *s ); + log() << "cpu: elapsed:" << (elapsed/1000) <<" writelock: " << (int)(100*d.percentWriteLocked()) << "%" << endl; + } + + prev = s; + } + catch ( std::exception& e ) { + log() << "ERROR in SnapshotThread: " << e.what() << endl; + } + + numLoops++; + sleepsecs(4); + } + + client.shutdown(); + } + + using namespace mongoutils::html; + + class WriteLockStatus : public WebStatusPlugin { + public: + WriteLockStatus() : WebStatusPlugin( "write lock" , 51 , "% time in write lock, by 4 sec periods" ) {} + virtual void init() {} + + virtual void run( stringstream& ss ) { + statsSnapshots.outputLockInfoHTML( ss ); + + ss << "<a " + "href=\"http://www.mongodb.org/pages/viewpage.action?pageId=7209296\" " + "title=\"snapshot: was the db in the write lock when this page was generated?\">"; + ss << "write locked now:</a> " << (d.dbMutex.info().isLocked() ? "true" : "false") << "\n"; + } + + } writeLockStatus; + + class DBTopStatus : public WebStatusPlugin { + public: + DBTopStatus() : WebStatusPlugin( "dbtop" , 50 , "(occurrences|percent of elapsed)" ) {} + + void display( stringstream& ss , double elapsed , const Top::UsageData& usage ) { + ss << "<td>"; + ss << usage.count; + ss << "</td><td>"; + double per = 100 * ((double)usage.time)/elapsed; + if( per == (int) per ) + ss << (int) per; + else + ss << setprecision(1) << fixed << per; + ss << '%'; + ss << "</td>"; + } + + void display( stringstream& ss , double elapsed , const string& ns , const Top::CollectionData& data ) { + if ( ns != "TOTAL" && data.total.count == 0 ) + return; + ss << "<tr><th>" << ns << "</th>"; + + display( ss , elapsed , data.total ); + + display( ss , elapsed , data.readLock ); + display( ss , elapsed , data.writeLock ); + + display( ss , elapsed , data.queries ); + display( ss , elapsed , data.getmore ); + display( ss , elapsed , data.insert ); + display( ss , elapsed , data.update ); + display( ss , elapsed , data.remove ); + + ss << "</tr>\n"; + } + + void run( stringstream& ss ) { + auto_ptr<SnapshotDelta> delta = statsSnapshots.computeDelta(); + if ( ! delta.get() ) + return; + + ss << "<table border=1 cellpadding=2 cellspacing=0>"; + ss << "<tr align='left'><th>"; + ss << a("http://www.mongodb.org/display/DOCS/Developer+FAQ#DeveloperFAQ-What%27sa%22namespace%22%3F", "namespace") << + "NS</a></th>" + "<th colspan=2>total</th>" + "<th colspan=2>Reads</th>" + "<th colspan=2>Writes</th>" + "<th colspan=2>Queries</th>" + "<th colspan=2>GetMores</th>" + "<th colspan=2>Inserts</th>" + "<th colspan=2>Updates</th>" + "<th colspan=2>Removes</th>"; + ss << "</tr>\n"; + + display( ss , (double) delta->elapsed() , "TOTAL" , delta->globalUsageDiff() ); + + Top::UsageMap usage = delta->collectionUsageDiff(); + for ( Top::UsageMap::iterator i=usage.begin(); i != usage.end(); i++ ) { + display( ss , (double) delta->elapsed() , i->first , i->second ); + } + + ss << "</table>"; + + } + + virtual void init() {} + } dbtopStatus; + + Snapshots statsSnapshots; + SnapshotThread snapshotThread; + +} diff --git a/src/mongo/db/stats/snapshots.h b/src/mongo/db/stats/snapshots.h new file mode 100644 index 00000000000..d9b8e5eb901 --- /dev/null +++ b/src/mongo/db/stats/snapshots.h @@ -0,0 +1,114 @@ +// snapshots.h + +/** +* Copyright (C) 2008 10gen Inc. 
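
The write-lock figures rendered by WriteLockStatus above come from pairing adjacent snapshots. A sketch of the arithmetic using the API declared just below in snapshots.h (similar to what SnapshotThread::run() logs when cmdLine.cpu is set; logWriteLockPercent is a hypothetical name):

    // Percent of the most recent ~4s interval spent holding the write lock.
    void logWriteLockPercent() {
        auto_ptr<SnapshotDelta> d = statsSnapshots.computeDelta();
        if ( ! d.get() )
            return;
        unsigned long long micros = d->elapsed();           // interval length
        int pct = (int)( 100 * d->percentWriteLocked() );   // lock time / elapsed
        log() << ( micros / 1000 ) << "ms interval, " << pct << "% write locked" << endl;
    }
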
diff --git a/src/mongo/db/stats/snapshots.h b/src/mongo/db/stats/snapshots.h
new file mode 100644
index 00000000000..d9b8e5eb901
--- /dev/null
+++ b/src/mongo/db/stats/snapshots.h
@@ -0,0 +1,114 @@
+// snapshots.h
+
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+#include "../../pch.h"
+#include "../jsobj.h"
+#include "top.h"
+#include "../../util/background.h"
+
+/**
+   handles snapshotting performance metrics and other such things
+ */
+namespace mongo {
+
+    class SnapshotThread;
+
+    /**
+     * stores a point in time snapshot
+     * i.e. all counters at a given time
+     */
+    class SnapshotData {
+        void takeSnapshot();
+
+        unsigned long long _created;
+        Top::CollectionData _globalUsage;
+        unsigned long long _totalWriteLockedTime; // micros of total time locked
+        Top::UsageMap _usage;
+
+        friend class SnapshotThread;
+        friend class SnapshotDelta;
+        friend class Snapshots;
+    };
+
+    /**
+     * contains performance information for a time period
+     */
+    class SnapshotDelta {
+    public:
+        SnapshotDelta( const SnapshotData& older , const SnapshotData& newer );
+
+        unsigned long long start() const {
+            return _older._created;
+        }
+
+        unsigned long long elapsed() const {
+            return _elapsed;
+        }
+
+        unsigned long long timeInWriteLock() const {
+            return _newer._totalWriteLockedTime - _older._totalWriteLockedTime;
+        }
+        double percentWriteLocked() const {
+            double e = (double) elapsed();
+            double w = (double) timeInWriteLock();
+            return w/e;
+        }
+
+        Top::CollectionData globalUsageDiff();
+        Top::UsageMap collectionUsageDiff();
+
+    private:
+        const SnapshotData& _older;
+        const SnapshotData& _newer;
+
+        unsigned long long _elapsed;
+    };
+
+    class Snapshots {
+    public:
+        Snapshots(int n=100);
+
+        const SnapshotData* takeSnapshot();
+
+        int numDeltas() const { return _stored-1; }
+
+        const SnapshotData& getPrev( int numBack = 0 );
+        auto_ptr<SnapshotDelta> computeDelta( int numBack = 0 );
+
+        void outputLockInfoHTML( stringstream& ss );
+    private:
+        mongo::mutex _lock;
+        int _n;
+        boost::scoped_array<SnapshotData> _snapshots;
+        int _loc;
+        int _stored;
+    };
+
+    class SnapshotThread : public BackgroundJob {
+    public:
+        virtual string name() const { return "snapshot"; }
+        void run();
+    };
+
+    extern Snapshots statsSnapshots;
+    extern SnapshotThread snapshotThread;
+
+}
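Despite the name, percentWriteLocked() returns a fraction, not a percentage; callers such as outputLockInfoHTML multiply by 100. A worked example with made-up numbers (note that in this commit the assignment to _totalWriteLockedTime in SnapshotData::takeSnapshot is commented out, so the field is effectively unpopulated):

#include <iostream>

int main() {
    // Worked example, invented values: two snapshots taken 4 seconds apart.
    // All SnapshotData counters are in microseconds.
    unsigned long long olderCreated = 1000000000ULL;           // t0
    unsigned long long newerCreated = olderCreated + 4000000;  // t0 + 4s
    unsigned long long olderLocked  = 250000000ULL;            // cumulative write-lock micros at t0
    unsigned long long newerLocked  = olderLocked + 1200000;   // 1.2s more spent write locked

    unsigned long long elapsed = newerCreated - olderCreated;  // 4,000,000 micros
    unsigned long long locked  = newerLocked  - olderLocked;   // 1,200,000 micros
    double fraction = (double)locked / (double)elapsed;        // 0.3

    // outputLockInfoHTML prints (unsigned)(100 * fraction), i.e. "30"
    std::cout << (unsigned)(100 * fraction) << std::endl;
    return 0;
}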
diff --git a/src/mongo/db/stats/top.cpp b/src/mongo/db/stats/top.cpp
new file mode 100644
index 00000000000..f5b6ee42f1c
--- /dev/null
+++ b/src/mongo/db/stats/top.cpp
@@ -0,0 +1,183 @@
+// top.cpp
+/*
+ *    Copyright (C) 2010 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "pch.h"
+#include "top.h"
+#include "../../util/net/message.h"
+#include "../commands.h"
+
+namespace mongo {
+
+    Top::UsageData::UsageData( const UsageData& older , const UsageData& newer ) {
+        // this won't be 100% accurate on rollovers and drop(), but at least it won't be negative
+        time = (newer.time >= older.time) ? (newer.time - older.time) : newer.time;
+        count = (newer.count >= older.count) ? (newer.count - older.count) : newer.count;
+    }
+
+    Top::CollectionData::CollectionData( const CollectionData& older , const CollectionData& newer )
+        : total( older.total , newer.total ) ,
+          readLock( older.readLock , newer.readLock ) ,
+          writeLock( older.writeLock , newer.writeLock ) ,
+          queries( older.queries , newer.queries ) ,
+          getmore( older.getmore , newer.getmore ) ,
+          insert( older.insert , newer.insert ) ,
+          update( older.update , newer.update ) ,
+          remove( older.remove , newer.remove ),
+          commands( older.commands , newer.commands ) {
+    }
+
+    void Top::record( const string& ns , int op , int lockType , long long micros , bool command ) {
+        if ( ns[0] == '?' )
+            return;
+
+        //cout << "record: " << ns << "\t" << op << "\t" << command << endl;
+        scoped_lock lk(_lock);
+
+        if ( ( command || op == dbQuery ) && ns == _lastDropped ) {
+            _lastDropped = "";
+            return;
+        }
+
+        CollectionData& coll = _usage[ns];
+        _record( coll , op , lockType , micros , command );
+        _record( _global , op , lockType , micros , command );
+    }
+
+    void Top::_record( CollectionData& c , int op , int lockType , long long micros , bool command ) {
+        c.total.inc( micros );
+
+        if ( lockType > 0 )
+            c.writeLock.inc( micros );
+        else if ( lockType < 0 )
+            c.readLock.inc( micros );
+
+        switch ( op ) {
+        case 0:
+            // use 0 for unknown, non-specific
+            break;
+        case dbUpdate:
+            c.update.inc( micros );
+            break;
+        case dbInsert:
+            c.insert.inc( micros );
+            break;
+        case dbQuery:
+            if ( command )
+                c.commands.inc( micros );
+            else
+                c.queries.inc( micros );
+            break;
+        case dbGetMore:
+            c.getmore.inc( micros );
+            break;
+        case dbDelete:
+            c.remove.inc( micros );
+            break;
+        case dbKillCursors:
+            break;
+        case opReply:
+        case dbMsg:
+            log() << "unexpected op in Top::record: " << op << endl;
+            break;
+        default:
+            log() << "unknown op in Top::record: " << op << endl;
+        }
+    }
+
+    void Top::collectionDropped( const string& ns ) {
+        //cout << "collectionDropped: " << ns << endl;
+        scoped_lock lk(_lock);
+        _usage.erase(ns);
+        _lastDropped = ns;
+    }
+
+    void Top::cloneMap(Top::UsageMap& out) const {
+        scoped_lock lk(_lock);
+        out = _usage;
+    }
+
+    void Top::append( BSONObjBuilder& b ) {
+        scoped_lock lk( _lock );
+        _appendToUsageMap( b , _usage );
+    }
+
+    void Top::_appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const {
+        for ( UsageMap::const_iterator i=map.begin(); i!=map.end(); i++ ) {
+            BSONObjBuilder bb( b.subobjStart( i->first ) );
+
+            const CollectionData& coll = i->second;
+
+            // the per-collection entries must go into the subobject builder (bb),
+            // not the parent builder (b), while the subobject is open
+            _appendStatsEntry( bb , "total" , coll.total );
+
+            _appendStatsEntry( bb , "readLock" , coll.readLock );
+            _appendStatsEntry( bb , "writeLock" , coll.writeLock );
+
+            _appendStatsEntry( bb , "queries" , coll.queries );
+            _appendStatsEntry( bb , "getmore" , coll.getmore );
+            _appendStatsEntry( bb , "insert" , coll.insert );
+            _appendStatsEntry( bb , "update" , coll.update );
+            _appendStatsEntry( bb , "remove" , coll.remove );
+            _appendStatsEntry( bb , "commands" , coll.commands );
+
+            bb.done();
+        }
+    }
+
+    void Top::_appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const {
+        BSONObjBuilder bb( b.subobjStart( statsName ) );
+        bb.appendNumber( "time" , map.time );
+        bb.appendNumber( "count" , map.count );
+        bb.done();
+    }
+
+    class TopCmd : public Command {
+    public:
+        TopCmd() : Command( "top", true ) {}
+
+        virtual bool slaveOk() const { return true; }
+        virtual bool adminOnly() const { return true; }
+        virtual LockType locktype() const { return READ; }
+        virtual void help( stringstream& help ) const { help << "usage by collection, in micros"; }
+
+        virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
+            {
+                BSONObjBuilder b( result.subobjStart( "totals" ) );
+                b.append( "note" , "all times in microseconds" );
+                Top::global.append( b );
+                b.done();
+            }
+            return true;
+        }
+
+    } topCmd;
+
+    Top Top::global;
+
+    TopOld::T TopOld::_snapshotStart = TopOld::currentTime();
+    TopOld::D TopOld::_snapshotDuration;
+    TopOld::UsageMap TopOld::_totalUsage;
+    TopOld::UsageMap TopOld::_snapshotA;
+    TopOld::UsageMap TopOld::_snapshotB;
+    TopOld::UsageMap &TopOld::_snapshot = TopOld::_snapshotA;
+    TopOld::UsageMap &TopOld::_nextSnapshot = TopOld::_snapshotB;
+    mongo::mutex TopOld::topMutex("topMutex");
+
+}
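For orientation, TopCmd nests one subobject per namespace under "totals", each metric carrying a {time, count} pair in microseconds. A hedged sample of invoking the command from the mongo shell — the shape follows the builders above, but every value here is invented:

> use admin
> db.runCommand( { top: 1 } )
{
    "totals" : {
        "note" : "all times in microseconds",
        "test.foo" : {
            "total"     : { "time" : 12345, "count" : 10 },
            "readLock"  : { "time" : 11000, "count" : 9 },
            "writeLock" : { "time" : 1345,  "count" : 1 },
            "queries"   : { "time" : 11000, "count" : 9 },
            "getmore"   : { "time" : 0,     "count" : 0 },
            "insert"    : { "time" : 1345,  "count" : 1 },
            "update"    : { "time" : 0,     "count" : 0 },
            "remove"    : { "time" : 0,     "count" : 0 },
            "commands"  : { "time" : 0,     "count" : 0 }
        }
    },
    "ok" : 1
}

The lockType convention in Top::record is the other easy-to-miss detail: positive charges the time to writeLock, negative to readLock, and zero to neither (the op still counts toward total).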
diff --git a/src/mongo/db/stats/top.h b/src/mongo/db/stats/top.h
new file mode 100644
index 00000000000..9645ed1a3a6
--- /dev/null
+++ b/src/mongo/db/stats/top.h
@@ -0,0 +1,247 @@
+// top.h : DB usage monitor.
+
+/*    Copyright 2009 10gen Inc.
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+#pragma once
+
+#include <boost/date_time/posix_time/posix_time.hpp>
+#undef assert
+#define assert MONGO_assert
+
+namespace mongo {
+
+    /**
+     * tracks usage by collection
+     */
+    class Top {
+
+    public:
+        Top() : _lock("Top") { }
+
+        struct UsageData {
+            UsageData() : time(0) , count(0) {}
+            UsageData( const UsageData& older , const UsageData& newer );
+            long long time;
+            long long count;
+
+            void inc( long long micros ) {
+                count++;
+                time += micros;
+            }
+        };
+
+        struct CollectionData {
+            CollectionData() {}
+            /**
+             * constructs a diff between older and newer
+             */
+            CollectionData( const CollectionData& older , const CollectionData& newer );
+
+            UsageData total;
+
+            UsageData readLock;
+            UsageData writeLock;
+
+            UsageData queries;
+            UsageData getmore;
+            UsageData insert;
+            UsageData update;
+            UsageData remove;
+            UsageData commands;
+        };
+
+        typedef map<string,CollectionData> UsageMap;
+
+    public:
+        void record( const string& ns , int op , int lockType , long long micros , bool command );
+        void append( BSONObjBuilder& b );
+        void cloneMap(UsageMap& out) const;
+        CollectionData getGlobalData() const { return _global; }
+        void collectionDropped( const string& ns );
+
+    public: // static stuff
+        static Top global;
+
+    private:
+        void _appendToUsageMap( BSONObjBuilder& b , const UsageMap& map ) const;
+        void _appendStatsEntry( BSONObjBuilder& b , const char * statsName , const UsageData& map ) const;
+        void _record( CollectionData& c , int op , int lockType , long long micros , bool command );
+
+        mutable mongo::mutex _lock;
+        CollectionData _global;
+        UsageMap _usage;
+        string _lastDropped;
+    };
+
+    /* Records per namespace utilization of the mongod process.
+       No two functions of this class may be called concurrently.
+    */
+    class TopOld {
+        typedef boost::posix_time::ptime T;
+        typedef boost::posix_time::time_duration D;
+        typedef boost::tuple< D, int, int, int > UsageData;
+    public:
+        TopOld() : _read(false), _write(false) { }
+
+        /* these are used to record activity: */
+
+        void clientStart( const char *client ) {
+            clientStop();
+            _currentStart = currentTime();
+            _current = client;
+        }
+
+        /* indicate current request is a read operation. */
+        void setRead() { _read = true; }
+
+        void setWrite() { _write = true; }
+
+        void clientStop() {
+            if ( _currentStart == T() )
+                return;
+            D d = currentTime() - _currentStart;
+
+            {
+                scoped_lock L(topMutex);
+                recordUsage( _current, d );
+            }
+
+            _currentStart = T();
+            _read = false;
+            _write = false;
+        }
+
+        /* these are used to fetch the stats: */
+
+        struct Usage {
+            string ns;
+            D time;
+            double pct;
+            int reads, writes, calls;
+        };
+
+        static void usage( vector< Usage > &res ) {
+            scoped_lock L(topMutex);
+
+            // Populate parent namespaces
+            UsageMap snapshot;
+            UsageMap totalUsage;
+            fillParentNamespaces( snapshot, _snapshot );
+            fillParentNamespaces( totalUsage, _totalUsage );
+
+            multimap< D, string, more > sorted;
+            for( UsageMap::iterator i = snapshot.begin(); i != snapshot.end(); ++i )
+                sorted.insert( make_pair( i->second.get<0>(), i->first ) );
+            for( multimap< D, string, more >::iterator i = sorted.begin(); i != sorted.end(); ++i ) {
+                if ( trivialNs( i->second.c_str() ) )
+                    continue;
+                Usage u;
+                u.ns = i->second;
+                u.time = totalUsage[ u.ns ].get<0>();
+                u.pct = _snapshotDuration != D() ? 100.0 * i->first.ticks() / _snapshotDuration.ticks() : 0;
+                u.reads = snapshot[ u.ns ].get<1>();
+                u.writes = snapshot[ u.ns ].get<2>();
+                u.calls = snapshot[ u.ns ].get<3>();
+                res.push_back( u );
+            }
+            for( UsageMap::iterator i = totalUsage.begin(); i != totalUsage.end(); ++i ) {
+                if ( snapshot.count( i->first ) != 0 || trivialNs( i->first.c_str() ) )
+                    continue;
+                Usage u;
+                u.ns = i->first;
+                u.time = i->second.get<0>();
+                u.pct = 0;
+                u.reads = 0;
+                u.writes = 0;
+                u.calls = 0;
+                res.push_back( u );
+            }
+        }
+
+        static void completeSnapshot() {
+            scoped_lock L(topMutex);
+
+            if ( &_snapshot == &_snapshotA ) {
+                _snapshot = _snapshotB;
+                _nextSnapshot = _snapshotA;
+            }
+            else {
+                _snapshot = _snapshotA;
+                _nextSnapshot = _snapshotB;
+            }
+            _snapshotDuration = currentTime() - _snapshotStart;
+            _snapshotStart = currentTime();
+            _nextSnapshot.clear();
+        }
+
+    private:
+        static mongo::mutex topMutex;
+        static bool trivialNs( const char *ns ) {
+            const char *ret = strrchr( ns, '.' );
+            return ret && ret[ 1 ] == '\0';
+        }
+        typedef map<string,UsageData> UsageMap; // duration, # reads, # writes, # total calls
+        static T currentTime() {
+            return boost::posix_time::microsec_clock::universal_time();
+        }
+        void recordUsage( const string &client, D duration ) {
+            recordUsageForMap( _totalUsage, client, duration );
+            recordUsageForMap( _nextSnapshot, client, duration );
+        }
+        void recordUsageForMap( UsageMap &map, const string &client, D duration ) {
+            UsageData& g = map[client];
+            g.get< 0 >() += duration;
+            if ( _read && !_write )
+                g.get< 1 >()++;
+            else if ( !_read && _write )
+                g.get< 2 >()++;
+            g.get< 3 >()++;
+        }
+        static void fillParentNamespaces( UsageMap &to, const UsageMap &from ) {
+            for( UsageMap::const_iterator i = from.begin(); i != from.end(); ++i ) {
+                string current = i->first;
+                size_t dot = current.rfind( "." );
+                if ( dot == string::npos || dot != current.length() - 1 ) {
+                    inc( to[ current ], i->second );
+                }
+                while( dot != string::npos ) {
+                    current = current.substr( 0, dot );
+                    inc( to[ current ], i->second );
+                    dot = current.rfind( "." );
+                }
+            }
+        }
+        static void inc( UsageData &to, const UsageData &from ) {
+            to.get<0>() += from.get<0>();
+            to.get<1>() += from.get<1>();
+            to.get<2>() += from.get<2>();
+            to.get<3>() += from.get<3>();
+        }
+        struct more { bool operator()( const D &a, const D &b ) { return a > b; } };
+        string _current;
+        T _currentStart;
+        static T _snapshotStart;
+        static D _snapshotDuration;
+        static UsageMap _totalUsage;
+        static UsageMap _snapshotA;
+        static UsageMap _snapshotB;
+        static UsageMap &_snapshot;
+        static UsageMap &_nextSnapshot;
+        bool _read;
+        bool _write;
+    };
+
+} // namespace mongo
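TopOld::fillParentNamespaces charges each namespace's usage to every dot-separated ancestor, so "test.foo.bar" also counts toward "test.foo" and "test"; the entry itself is skipped only when the ns is trivial (ends in a dot). A standalone sketch of the same walk, simplified to a plain counter in place of the (duration, reads, writes, calls) tuple — names here are illustrative, not from the commit:

#include <iostream>
#include <map>
#include <string>

// Standalone sketch of TopOld::fillParentNamespaces' ancestor walk.
static void fillParents(std::map<std::string, int>& to, const std::string& ns, int amount) {
    std::string current = ns;
    size_t dot = current.rfind('.');
    if (dot == std::string::npos || dot != current.length() - 1)
        to[current] += amount;              // charge the namespace itself (unless it ends in '.')
    while (dot != std::string::npos) {      // then charge every ancestor
        current = current.substr(0, dot);
        to[current] += amount;
        dot = current.rfind('.');
    }
}

int main() {
    std::map<std::string, int> usage;
    fillParents(usage, "test.foo.bar", 5);
    // prints: test -> 5, test.foo -> 5, test.foo.bar -> 5
    for (std::map<std::string, int>::iterator i = usage.begin(); i != usage.end(); ++i)
        std::cout << i->first << " -> " << i->second << "\n";
    return 0;
}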
diff --git a/src/mongo/db/taskqueue.h b/src/mongo/db/taskqueue.h
new file mode 100644
index 00000000000..005bd986f11
--- /dev/null
+++ b/src/mongo/db/taskqueue.h
@@ -0,0 +1,106 @@
+// @file taskqueue.h
+
+/**
+ *    Copyright (C) 2008 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongomutex.h"
+
+namespace mongo {
+
+    /** defer work items by queueing them for invocation by another thread. the presumption is that
+        the consumer thread is outside of locks more than the source thread. an additional presumption
+        is that several objects or micro-tasks will be queued and that having a single thread
+        processing them in batch is helpful as they (in the first use case) use a common data
+        structure that can then stay in local cpu caches.
+
+        this class is in db/ as it is dbMutex (mongomutex) specific (so far).
+
+        using a functor instead of go() might be more elegant too, once again, would like to test any
+        performance differential.  also worry that operator() hides things?
+
+        MT - copyable "micro task" object we can queue
+             must have a static method void MT::go(const MT&)
+
+        see DefInvoke in dbtests/ for an example.
+    */
+    template< class MT >
+    class TaskQueue {
+    public:
+        TaskQueue() : _which(0), _invokeMutex("deferredinvoker") { }
+
+        void defer(MT mt) {
+            // only one writer allowed.  however the invoke processing below can occur concurrently with
+            // writes (for the most part)
+            DEV d.dbMutex.assertWriteLocked();
+
+            _queues[_which].push_back(mt);
+        }
+
+        /** call to process deferrals.
+
+            concurrency: handled herein.  multiple threads could call invoke(), but their efforts will be
+            serialized.  the common case is that there is a single processor calling invoke().
+
+            normally, you call this outside of any lock.  but if you want to fully drain the queue,
+            call from within a read lock.  for example:
+            {
+                // drain with minimal time in lock
+                d.invoke();
+                readlock lk;
+                d.invoke();
+                ...
+            }
+            you can also call invoke periodically to do some work and then pick up later on more.
+        */
+        void invoke() {
+            mutex::scoped_lock lk2(_invokeMutex);
+            int toDrain = 0;
+            {
+                // flip queueing to the other queue (we are double buffered)
+                readlocktry lk("", 5);
+                if( !lk.got() )
+                    return;
+                toDrain = _which;
+                _which = _which ^ 1;
+                wassert( _queues[_which].empty() ); // we are in dbMutex, so it should be/stay empty til we exit dbMutex
+            }
+
+            _drain( _queues[toDrain] );
+            assert( _queues[toDrain].empty() );
+        }
+
+    private:
+        int _which; // 0 or 1
+        typedef vector< MT > Queue;
+        Queue _queues[2];
+
+        // lock order when multiple locks: dbMutex, _invokeMutex
+        mongo::mutex _invokeMutex;
+
+        void _drain(Queue& queue) {
+            unsigned oldCap = queue.capacity();
+            for( typename Queue::iterator i = queue.begin(); i != queue.end(); i++ ) {
+                const MT& v = *i;
+                MT::go(v);
+            }
+            queue.clear();
+            DEV assert( queue.capacity() == oldCap ); // just checking that clear() doesn't deallocate, we don't want that
+        }
+    };
+
+}
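Since the DefInvoke example lives over in dbtests/, a self-contained sketch of the MT contract may help: a copyable micro-task type with a static go(const MT&). The type and names below are invented for illustration, and the dbMutex/readlocktry machinery is deliberately left out:

#include <iostream>
#include <vector>

// Invented micro-task type satisfying the MT contract TaskQueue expects:
// copyable, with a static void go(const MT&) that performs the deferred work.
struct UpdateCounter {
    int delta;
    explicit UpdateCounter(int d) : delta(d) {}

    static int total;                        // the shared structure the batch mutates
    static void go(const UpdateCounter& t) { total += t.delta; }
};
int UpdateCounter::total = 0;

int main() {
    // Stand-in for TaskQueue<UpdateCounter>: the real class double-buffers two
    // vectors and serializes drains under _invokeMutex; the drain loop itself
    // looks like this.
    std::vector<UpdateCounter> queue;        // producer side: defer() push_backs
    queue.push_back(UpdateCounter(2));
    queue.push_back(UpdateCounter(3));

    // consumer side: _drain() invokes MT::go on each queued task, then clears
    for (std::vector<UpdateCounter>::iterator i = queue.begin(); i != queue.end(); ++i)
        UpdateCounter::go(*i);
    queue.clear();

    std::cout << UpdateCounter::total << std::endl;   // prints 5
    return 0;
}

The batch-drain design keeps the shared structure hot in one cpu's cache, which is the rationale the class comment gives for not invoking each task at defer() time.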
diff --git a/src/mongo/db/tests.cpp b/src/mongo/db/tests.cpp
new file mode 100644
index 00000000000..00f299e1bb6
--- /dev/null
+++ b/src/mongo/db/tests.cpp
@@ -0,0 +1,68 @@
+/**
+* Copyright (C) 2008 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* tests.cpp
+
+   unit test & such
+*/
+
+#include "pch.h"
+#include "../util/mmap.h"
+
+namespace mongo {
+
+    int test2_old9() {
+        out() << "test2" << endl;
+        printStackTrace();
+        if ( 1 )
+            return 1;
+
+        MemoryMappedFile f;
+
+        unsigned long long len = 64*1024*1024;
+        char *p = (char *) f.map("/tmp/test.dat", len);
+        char *start = p;
+        char *end = p + 64*1024*1024-2;
+        end[1] = 'z';
+        int i = 0; // must be initialized: ++i below read an indeterminate value
+        while ( p < end ) {
+            *p++ = ' ';
+            if ( ++i%64 == 0 ) {
+                *p++ = '\n';
+                *p++ = 'x';
+            }
+        }
+        *p = 'a';
+
+        f.flush(true);
+        out() << "done" << endl;
+
+        char *x = start + 32 * 1024 * 1024;
+        char *y = start + 48 * 1024 * 1024;
+        char *z = start + 62 * 1024 * 1024;
+
+        strcpy(z, "zfoo");
+        out() << "y" << endl;
+        strcpy(y, "yfoo");
+        strcpy(x, "xfoo");
+        strcpy(start, "xfoo");
+
+        dbexit( EXIT_TEST );
+
+        return 1;
+    }
+
+} // namespace mongo